In [17]:
import numpy as np
import pandas as pd

# 数据划分
from sklearn.model_selection import train_test_split

# 数据标准化
from sklearn.preprocessing import StandardScaler

# 逻辑回归
from sklearn.linear_model import LogisticRegression

# 分类评估报告
from sklearn.metrics import classification_report

# 获取数据

In [2]:
names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# 下载地址 https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# names = name 添加列名 或者后期添加都可以
# data = pd.read_csv('../data/breast-cancer-wisconsin.data', names=names)
data = pd.read_csv("../data/breast-cancer-wisconsin.data")
data.columns = names

In [3]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


# 基本数据处理

## 缺失值处理

In [4]:
np.any(data.isna())

False

In [5]:
data.replace(to_replace="?", value=np.NAN, inplace=True)

In [6]:
np.any(data.isna())

True

In [7]:
data.dropna(inplace=True)
np.any(data.isna())

False

## 确定特征值

In [8]:
x = data.iloc[:, 0:-1]
x.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,1002945,5,4,4,5,7,10,3,2,1
1,1015425,3,1,1,1,2,2,3,1,1
2,1016277,6,8,8,1,3,4,3,7,1
3,1017023,4,1,1,3,2,1,3,1,1
4,1017122,8,10,10,8,7,10,9,7,1


In [9]:
y = data["Class"]
y.head()

0    2
1    2
2    2
3    2
4    4
Name: Class, dtype: int64

## 分割数据

In [10]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

In [11]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((545, 10), (137, 10), (545,), (137,))

## 特征工程(标准化)

In [12]:
scaler = StandardScaler()

In [13]:
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# 逻辑回归

In [14]:
estimator = LogisticRegression()

In [15]:
estimator.fit(x_train_scaled, y_train)

# 模型评估

In [16]:
estimator.score(x_val_scaled, y_val)

0.9635036496350365

# 分类评估报告 classification_report(y_true, y_pred, labels=[], target_names=None)
- y_true：真实目标值
- y_pred：估计器预测目标值
- labels:指定类别对应的数字
- target_names：目标类别名称
- return：每个类别精确率与召回率


精确率: 查的准不准 全部查出的人多少个是正确的 <br>
召回率: 查的全不全 全部的癌症的人中查出来了多少个


In [18]:
y_predict = estimator.predict(x_val_scaled)
y_predict

array([2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 4, 4, 2, 4, 4, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4,
       2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2,
       4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2,
       2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 4, 2, 2,
       2, 2, 4, 2, 2], dtype=int64)

In [22]:
# labels=(2,4) 数据中2代表良性,4代表恶性2代表良性,4代表恶性
res = classification_report(
    y_val, y_predict, labels=(2, 4), target_names=("良性", "恶性")
)
print(res)

              precision    recall  f1-score   support

          良性       0.98      0.97      0.97        92
          恶性       0.93      0.96      0.95        45

    accuracy                           0.96       137
   macro avg       0.96      0.96      0.96       137
weighted avg       0.96      0.96      0.96       137

