# 数据预处理

### 导入数据

In [1]:
import pandas as pd
import numpy as np

from IPython.display import display

In [2]:
# Column names for the raw data file, in file order:
# one sample-id column, nine feature columns, and the class label last.
id_column = ['Sample code number']
feature_columns = [
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
]
label_column = ['Class']
column_names = id_column + feature_columns + label_column

In [3]:
# Load the raw data file straight from the UCI repository.
# The column labels are supplied via column_names, so no row of the file
# is treated as a header (header=None is what pandas does anyway when
# names is given; it is spelled out here for the reader).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
data = pd.read_csv(url, header=None, names=column_names)

# (rows, columns) of the raw frame
data.shape

(699, 11)

In [4]:
# Drop every sample that has a missing value.
# Missing entries are marked with '?' in the raw file, so they are first
# turned into NaN and then any row containing a NaN is removed.
# Any column that contained '?' was read as text; once those rows are gone,
# every column is converted to a numeric dtype so that later steps receive
# numbers rather than strings and do not depend on implicit coercion.
data = (
    data.replace(to_replace='?', value=np.nan)
        .dropna(how='any')
        .apply(pd.to_numeric)
)

# (rows, columns) after removing incomplete samples
data.shape

(683, 11)

In [5]:
# Show the first three rows of the cleaned data as a quick visual check
display(data.head(3))

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2


### 切分数据

In [6]:
from sklearn.model_selection import train_test_split

# Entries 1-9 of column_names are the feature columns and entry 10 is the
# 'Class' label; the sample-id column (entry 0) is left out of the features.
features = data[column_names[1:10]]
labels = data[column_names[10]]

# Keep 75% of the samples for training and hold out 25% for testing.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=33)

In [7]:
# Check the split: shapes of the four pieces
split_shapes = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(*split_shapes)

# Class distribution in each split, via pandas' value_counts()
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print(train_class_counts, '\n', test_class_counts)

(512, 9) (171, 9) (512,) (171,)
2    344
4    168
Name: Class, dtype: int64 
 2    100
4     71
Name: Class, dtype: int64


#### 训练样本512个，344个为良性肿瘤样本，168个为恶性肿瘤样本
#### 测试样本171个，100个为良性肿瘤样本，71个为恶性肿瘤样本

### 数据标准化

In [8]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training features
# only, and that same fitted scaler is then applied to both splits, so the
# test set does not influence the scaling parameters.
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

## 训练模型

In [9]:
# Instantiate the two classifiers that will be compared:
# logistic regression and a linear classifier trained with stochastic
# gradient descent.
from sklearn.linear_model import LogisticRegression, SGDClassifier
lr = LogisticRegression()
# SGD training is stochastic; pin the seed (same value as the train/test
# split) so the fitted model and the reported metrics are reproducible
# across kernel restarts.
sgdc = SGDClassifier(random_state=33)

In [10]:
# Fit the logistic-regression model on the training split, then predict the
# labels of X_test; the predictions are stored in lr_y_predict.
lr_y_predict = lr.fit(X_train, y_train).predict(X_test)

# Fit the SGD classifier on the same training split, then predict the
# labels of X_test; the predictions are stored in sgdc_y_predict.
sgdc_y_predict = sgdc.fit(X_train, y_train).predict(X_test)



## 性能测评

In [11]:
from sklearn.metrics import classification_report

# One entry per model: the text printed in front of its accuracy, the fitted
# model itself, and the test-set predictions it produced earlier.
evaluations = [
    ('Accuracy of LR Classifier:', lr, lr_y_predict),
    ('Accuracy of SGD Classifier:', sgdc, sgdc_y_predict),
]

for accuracy_label, model, predictions in evaluations:
    # The model's own score() gives its accuracy on the test set
    print(accuracy_label, model.score(X_test, y_test))
    # classification_report adds per-class precision, recall and f1-score
    print(classification_report(y_test, predictions, target_names=['Benign', 'Malignant']))

Accuracy of LR Classifier: 0.9883040935672515
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171

Accuracy of SGD Classifier: 0.9883040935672515
             precision    recall  f1-score   support

     Benign       1.00      0.98      0.99       100
  Malignant       0.97      1.00      0.99        71

avg / total       0.99      0.99      0.99       171



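#### 在上面显示的这次运行结果中，LR与SGDC在测试集上的准确率相同（均约为0.988），并不能得出LR更准确的结论。需要注意的是，逻辑回归的参数没有解析解：sklearn的LogisticRegression使用迭代优化求解器（如lbfgs、liblinear）在全部训练数据上求解参数，而SGDClassifier使用随机梯度下降，每次只用部分样本近似更新参数，因此SGDC的结果带有随机性，不同运行之间可能略有差异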
#### 一般而言，LR计算时间较长，但模型性能通常略高；SGDC计算时间较短，但模型性能通常略低。当训练数据规模在10万条以上时，出于时间开销的考虑，推荐使用后者