In [1]:
import numpy as np
import pandas as pd

# 数据划分
from sklearn.model_selection import train_test_split

# 数据标准化
from sklearn.preprocessing import StandardScaler

# 逻辑回归
from sklearn.linear_model import LogisticRegression

# 获取数据

In [2]:
names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# 下载地址 https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data
# names = name 添加列名 或者后期添加都可以
# data = pd.read_csv('../data/breast-cancer-wisconsin.data', names=names)
data = pd.read_csv("../data/breast-cancer-wisconsin.data")
data.columns = names

In [3]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


# 基本数据处理

## 缺失值处理

In [4]:
np.any(data.isna())

False

In [5]:
data.replace(to_replace="?", value=np.NAN, inplace=True)

In [6]:
np.any(data.isna())

True

In [7]:
data.dropna(inplace=True)
np.any(data.isna())

False

## 确定特征值

In [8]:
x = data.iloc[:, 0:-1]
x.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,1002945,5,4,4,5,7,10,3,2,1
1,1015425,3,1,1,1,2,2,3,1,1
2,1016277,6,8,8,1,3,4,3,7,1
3,1017023,4,1,1,3,2,1,3,1,1
4,1017122,8,10,10,8,7,10,9,7,1


In [9]:
y = data["Class"]
y.head()

0    2
1    2
2    2
3    2
4    4
Name: Class, dtype: int64

## 分割数据

In [10]:
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)

In [11]:
x_train.shape, x_val.shape, y_train.shape, y_val.shape

((545, 10), (137, 10), (545,), (137,))

## 特征工程(标准化)

In [12]:
scaler = StandardScaler()

In [13]:
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)

# 逻辑回归

In [14]:
estimator = LogisticRegression()

In [15]:
estimator.fit(x_train_scaled, y_train)

# 模型评估

In [26]:
estimator.score(x_val_scaled, y_val)

0.9635036496350365

In [16]:
# 预测类别
estimator.predict(x_val_scaled)

array([2, 2, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 4, 4, 2, 4, 4, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4,
       2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2, 4, 2, 2,
       4, 2, 2, 2, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 4, 4, 2, 2, 4, 2, 2,
       2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 2, 2, 2, 4, 2, 2,
       2, 2, 4, 2, 2], dtype=int64)

In [19]:
# 预测类型的可能性得分
estimator.predict_proba(x_val_scaled)[:10]

array([[7.71012341e-01, 2.28987659e-01],
       [9.98338038e-01, 1.66196186e-03],
       [8.33074454e-04, 9.99166926e-01],
       [9.98028683e-01, 1.97131724e-03],
       [9.97429558e-01, 2.57044167e-03],
       [4.17430379e-05, 9.99958257e-01],
       [9.81772303e-01, 1.82276970e-02],
       [9.94482496e-01, 5.51750447e-03],
       [9.93972084e-01, 6.02791574e-03],
       [1.39424656e-02, 9.86057534e-01]])

In [20]:
# 预测类型的可能性log得分
estimator.predict_log_proba(x_val_scaled)[:10]

array([[-2.60050899e-01, -1.47408717e+00],
       [-1.66334445e-03, -6.39975653e+00],
       [-7.09038754e+00, -8.33421654e-04],
       [-1.97326285e-03, -6.22905331e+00],
       [-2.57375093e-03, -5.96367754e+00],
       [-1.00839779e+01, -4.17439092e-05],
       [-1.83958682e-02, -4.00481303e+00],
       [-5.53278212e-03, -5.19982961e+00],
       [-6.04615697e-03, -5.11135398e+00],
       [-4.27281602e+00, -1.40405748e-02]])