# 案例：癌症分类预测
- 良性/恶性肿瘤二分类（数据来源：UCI 机器学习库的 Breast Cancer Wisconsin 数据集）

**流程：**
1. 获取数据
    - 加上names
2. 数据处理
    - 缺失值处理
3. 数据集划分
4. 特征工程
    - 无量纲化：标准化
5. 逻辑回归
6. 模型评估

In [22]:
import pandas as pd
import numpy as np

# 一、获取数据

In [23]:
# Location of the UCI Breast Cancer Wisconsin data file.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels; they can be looked up in the dataset's accompanying .names file.
column_name = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Pass the labels explicitly via `names` so every row of the file is read as data.
data = pd.read_csv(path, names=column_name)
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [24]:
# Show (rows, columns) of the freshly loaded frame.
data.shape

(699, 11)

# 二、数据处理
- 缺失值处理

In [25]:
# Count, per column, the cells equal to the string '?' — this reveals that
# the dataset marks missing values with '?'.
data.isin(['?']).sum()

Sample code number              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [26]:
# First turn the '?' placeholders into NaN so pandas treats them as missing.
data.replace('?', np.nan, inplace=True)
# Re-count the '?' cells per column to confirm none are left.
data.isin(['?']).sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [27]:
# Count NaN cells per column after the '?' -> NaN replacement.
data.isnull().sum()

Sample code number              0
Clump Thickness                 0
Uniformity of Cell Size         0
Uniformity of Cell Shape        0
Marginal Adhesion               0
Single Epithelial Cell Size     0
Bare Nuclei                    16
Bland Chromatin                 0
Normal Nucleoli                 0
Mitoses                         0
Class                           0
dtype: int64

In [28]:
# Handle missing values by simply discarding every sample (row) that
# contains at least one NaN.
data.dropna(axis=0, how='any', inplace=True)
# Verify that no NaN cells remain in any column.
data.isna().sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [29]:
# Show (rows, columns) again to see how many samples survived dropna.
data.shape

(683, 11)

# 三、划分数据集

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Features: every column except the first (sample ID) and the last (Class).
x = data.iloc[:, 1:-1]
# Target: the Class column.
y = data["Class"]
# Split into train/test sets. The seed is fixed so that the split — and
# therefore every coefficient and metric computed from it — is reproducible
# across runs instead of changing on each execution.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
x_train.shape

(512, 9)

# 四、特征工程
- 标准化

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Standardize the features. The scaler is fitted on the training split only;
# the test split is then transformed with those already-fitted statistics,
# so nothing from the test data influences the preprocessing.
trans = StandardScaler()
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)

In [34]:
# Display the standardized test features.
x_test

array([[-0.86687274, -0.69313243, -0.73682624, ..., -0.16048515,
        -0.5957248 , -0.33398182],
       [ 0.55001104,  2.27247039,  0.604525  , ...,  1.05983757,
         2.41425314, -0.33398182],
       [ 1.96689481,  0.95442469,  0.26918719, ...,  2.28016029,
         2.41425314, -0.33398182],
       ...,
       [-0.15843085,  0.95442469,  0.9398628 , ...,  1.46661181,
         1.41092716,  0.86181322],
       [-1.22109368, -0.69313243, -0.73682624, ..., -0.97403363,
        -0.5957248 , -0.33398182],
       [ 0.55001104, -0.69313243, -0.06615062, ..., -0.16048515,
        -0.5957248 , -0.33398182]], shape=(171, 9))

# 五、逻辑回归

In [35]:
from sklearn.linear_model import LogisticRegression

In [36]:
# Create a logistic-regression classifier with default settings and train it
# on the standardized training data (fit returns the estimator itself).
esti = LogisticRegression().fit(x_train, y_train)
# Inspect the learned weights: 9 features give 9 coefficients.
esti.coef_

array([[1.22598986, 0.83665382, 0.75995688, 0.67795114, 0.04624282,
        1.04115141, 1.03462492, 0.55708103, 0.5840498 ]])

In [37]:
# Inspect the learned bias (intercept) term.
esti.intercept_

array([-1.1327145])

# 六、模型评估

In [38]:
# 方法1：
# Method 1: compute accuracy by hand from the predicted labels.
y_pred = esti.predict(x_test)
print("y_predict =", y_pred)
# The mean of the element-wise match mask is the fraction of correct
# predictions; this replaces dividing by a Python-level sum over an array
# of ones just to obtain the sample count.
accuracy_1 = np.mean(y_pred == y_test)
print("accuracy_1 =", accuracy_1)

y_predict = [2 4 4 4 2 2 4 2 2 4 2 2 2 2 2 2 2 2 4 2 4 4 4 2 4 2 2 2 2 2 4 4 2 2 4 2 4
 2 2 2 2 4 2 2 4 4 2 4 4 4 4 4 2 2 2 2 4 4 2 2 2 2 2 2 2 4 2 2 4 2 2 4 2 2
 2 4 2 2 4 2 2 4 4 4 2 4 2 2 2 2 2 2 4 4 2 2 2 4 4 2 2 2 4 2 2 2 4 2 2 2 4
 2 2 2 2 4 2 4 2 2 4 2 2 4 4 2 2 2 2 4 4 2 2 2 2 2 4 2 2 4 4 2 2 2 2 2 4 4
 2 2 2 4 2 4 4 2 2 2 2 2 4 2 2 4 4 4 2 4 4 2 2]
accuracy_1 = 0.9649122807017544


In [39]:
# 方法2：
# Method 2: let the estimator compute the accuracy on the test split directly.
accuracy_2 = esti.score(x_test, y_test)
print("accuracy_2 =", accuracy_2)

accuracy_2 = 0.9649122807017544


- 评估指标1：精确率、召回率、F1-score等

In [40]:
from sklearn.metrics import classification_report

In [41]:
# Precision / recall / F1 per class; labels 2 and 4 are mapped to the
# display names given in target_names.
# The result is bound to `report` only: the previous chained assignment also
# rebound `esti`, replacing the fitted model with this report string and
# breaking any later esti.predict / esti.score call.
report = classification_report(y_test, y_pred, labels=[2, 4], target_names=['良性肿瘤', '恶性肿瘤'])
print("预测检验报告：\n", report)

预测检验报告：
               precision    recall  f1-score   support

        良性肿瘤       0.96      0.98      0.97       108
        恶性肿瘤       0.97      0.94      0.95        63

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



- 评估指标2：ROC、AUC指标

In [43]:
# Peek at the first few true labels of the test split.
y_test.head()

364    2
582    4
221    4
205    4
154    2
Name: Class, dtype: int64

In [44]:
# 将y_test转换成0、1表示
# Re-encode y_test as 0/1: entries greater than 3 become 1, all others 0.
y_true = np.where(y_test > 3, 1, 0)
y_true

array([0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0])

In [45]:
from sklearn.metrics import roc_auc_score

In [46]:
# Compute the AUC metric of the predictions against the 0/1 ground truth.
# NOTE(review): y_pred holds hard class labels from predict(), so this AUC
# reflects a single operating point only; passing continuous scores (e.g.
# decision_function or predict_proba output) would evaluate the full ROC
# curve — confirm which is intended.
roc_auc_score(y_true, y_pred)

np.float64(0.958994708994709)