癌症分类预测-良／恶性乳腺癌肿瘤预测

In [1]:
# 1. Load the dataset
import pandas as pd

# UCI Machine Learning Repository: Breast Cancer Wisconsin (original) data file.
data_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'breast-cancer-wisconsin/breast-cancer-wisconsin.data'
)

# Column labels, in file order: one identifier column, nine feature columns,
# and the target column ('Class') last.
col_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

# Column labels are supplied through `names`, so the first line of the file
# is read as a data row rather than as a header.
data = pd.read_csv(data_url, names=col_names)

In [2]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
# 2. Handle missing values
import numpy as np

# The raw file marks missing entries with '?'. Turn them into NaN, drop the
# affected rows, then cast 'Bare Nuclei' to an integer dtype: the '?' markers
# make pandas read that column as strings, and an explicit cast avoids relying
# on implicit string-to-number coercion in later steps.
# `data` is rebound (no inplace mutation) so re-running this cell is safe.
data = (
    data.replace(to_replace='?', value=np.nan)
        .dropna()
        .astype({'Bare Nuclei': 'int64'})
)

In [4]:
# Split the dataset into training and test sets
from sklearn import model_selection

# Fixed seed: without it every run draws a different split, so the fitted
# coefficients and the evaluation numbers below cannot be reproduced.
RANDOM_STATE = 42

# Features: every column except the first ('Sample code number') and the
# last ('Class'); the target is the 'Class' column.
data_x = data.iloc[:, 1:-1]
data_y = data['Class']

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_x, data_y, random_state=RANDOM_STATE
)

In [5]:
# Standardize the features (remove scale differences), because the model's
# weights are going to be computed from them.
from sklearn import preprocessing

transfer = preprocessing.StandardScaler()

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
transfer.fit(x_train)
x_train = transfer.transform(x_train)
x_test = transfer.transform(x_test)

In [6]:
# Build the logistic-regression estimator and fit it on the training split
from sklearn import linear_model

estimator = linear_model.LogisticRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
estimator.fit(x_train, y_train)

LogisticRegression()

In [7]:
# Fitted logistic-regression parameters: regression coefficients (weights)
# and the intercept (bias).
for label, value in (
    ('回归系数（权重）：', estimator.coef_),
    ('偏置：', estimator.intercept_),
):
    print(label, value)

回归系数（权重）： [[1.06922285 1.4364998  1.427362   0.56178682 0.03211093 1.29343556
  0.57794962 0.51153088 0.37077998]]
偏置： [-1.02482042]


In [8]:
# Model evaluation
from sklearn import metrics

# Predicted class labels for the held-out test split.
predict = estimator.predict(X=x_test)

# For a classifier, `score` is the mean accuracy on the given data.
score = estimator.score(X=x_test, y=y_test)

# Mean squared error computed directly on the class labels.
# NOTE(review): assuming the labels are only 2 and 4, every misclassification
# contributes (4 - 2) ** 2 = 4, so this value is just 4 * (1 - accuracy) and
# carries no information beyond the accuracy itself — confirm whether it
# should be kept.
error_mse = metrics.mean_squared_error(y_true=y_test, y_pred=predict)

print('Score: ', score)
print('MSE 均方误差：', error_mse)

Soure:  0.9532163742690059
MSE 均方误差： 0.1871345029239766


In [9]:
# Precision and recall, reported per class (label 2 -> '良性', label 4 -> '恶性').
result = metrics.classification_report(
    y_true=y_test,
    y_pred=predict,
    labels=(2, 4),
    target_names=['良性', '恶性'],
)
