In [1]:
import pandas as pd 
import numpy as np

In [18]:
# 列索引
"""
   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)
"""
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion',
              'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv("../data_ml/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data", names=column_name)
data.head(60)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [20]:
data = data.replace(to_replace="?", value=np.nan)
# 检查是否有缺失值
data.isnull().any()

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                     True
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [21]:
# 删除缺失样本
data.dropna(inplace=True)
data.isnull().any()

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

In [22]:
data

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [23]:
# 划分数据集
from sklearn.model_selection import train_test_split

In [25]:
# 刷选特征值和目标值
x = data.iloc[:, 1:-1]
y = data["Class"]

In [26]:
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [27]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [28]:
# 划分数据集
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [32]:
x_train

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
576,5,1,1,1,2,1,2,1,1
270,8,4,7,1,3,10,3,9,2
685,1,1,1,1,2,1,1,1,1
74,10,6,4,1,3,4,3,2,3
6,1,1,1,1,2,10,3,1,1
...,...,...,...,...,...,...,...,...,...
449,5,7,9,8,6,10,8,10,1
303,1,1,1,1,2,1,3,1,1
654,3,1,1,1,2,1,3,1,1
152,10,10,8,6,4,5,8,10,1


In [33]:
# 特征工程-标准化
from sklearn.preprocessing import StandardScaler

In [34]:
transfer = StandardScaler()
# 标准化训练集和测试集
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [36]:
# 标准化后,均值为0,标准差为1
x_test

array([[-1.19900557, -0.36842263, -0.40263272, ..., -1.00278926,
        -0.62878979, -0.35075798],
       [-0.84517799, -0.69590941, -0.73512941, ..., -0.19360392,
        -0.62878979, -0.35075798],
       [ 1.27778749,  0.28655093,  0.26236067, ..., -0.19360392,
         0.01125071, -0.35075798],
       ...,
       [ 0.57013233,  0.61403771,  0.26236067, ...,  1.42476678,
         1.61135197,  0.78948383],
       [-1.19900557, -0.69590941, -0.73512941, ..., -1.00278926,
        -0.62878979, -0.35075798],
       [-1.19900557, -0.36842263, -0.07013602, ..., -1.00278926,
        -0.62878979, -0.35075798]])

In [37]:
# 逻辑回归预估器
from sklearn.linear_model import LogisticRegression

In [38]:
estimator = LogisticRegression()
estimator.fit(x_train, y_train)

In [39]:
# 逻辑回归模型的参数: 回归系数和偏置
estimator.coef_

array([[1.27472009, 0.46415347, 0.61785317, 0.6937415 , 0.10610354,
        1.23845636, 1.01897042, 0.65676761, 0.42500226]])

In [41]:
estimator.intercept_

array([-1.27259653])

In [42]:
# 模型评估
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("通过模型预测的值是否等于测试集的真实值\n", y_test == y_predict)

score = estimator.score(x_test, y_test)
print("模型对测试集的准确率为：\n", score)

y_predict:
 [2 2 4 2 4 2 4 2 4 2 4 2 2 2 2 2 2 2 2 2 2 2 4 4 4 4 2 4 4 2 2 4 2 2 2 2 2
 2 2 2 4 4 2 4 4 2 2 4 2 4 2 2 2 2 4 4 2 4 2 2 2 4 4 4 2 4 2 2 4 2 4 2 2 2
 4 2 2 2 4 2 2 2 2 2 2 4 4 4 4 2 2 4 2 4 2 2 2 4 4 4 2 2 4 2 2 2 2 4 2 2 4
 4 2 4 2 2 4 2 4 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 2 2 2 2 4 4
 2 2 2 2 2 4 4 4 2 4 4 2 2 2 2 4 2 4 2 2 4 2 2]
通过模型预测的值是否等于测试集的真实值
 646    True
33     True
247    True
647    True
36     True
       ... 
161    True
170    True
109    True
585    True
399    True
Name: Class, Length: 171, dtype: bool
模型对测试集的准确率为：
 0.9532163742690059


In [43]:
# 精确率/ 召回率/ F1-score
from sklearn.metrics import classification_report
report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"])

In [44]:
print(report)

              precision    recall  f1-score   support

          良性       0.95      0.98      0.96       109
          恶性       0.97      0.90      0.93        62

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171



In [46]:
y_test

646    2
33     2
247    4
647    2
36     4
      ..
161    2
170    2
109    4
585    2
399    2
Name: Class, Length: 171, dtype: int64

In [47]:
# 处理样本不均衡问题 ROC AUC
# 将y_test转化为0（反例）,1（正例）表示
y_true = np.where(y_test > 3, 1, 0)

In [48]:
y_true

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0])

In [49]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_true, y_predict)

0.9424385912992009