## Preprocess

In [1]:
import pandas as pd
import numpy as np

column_names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
data_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

In [2]:
data = pd.read_csv(data_uri, names=column_names)
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna(how='any')
data

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


## Train and Test

In [5]:
from sklearn.model_selection import train_test_split
x, y = data[column_names[1:10]], data[column_names[10]]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
ss = StandardScaler()
x_train =  ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

lr = LogisticRegression()
lr.fit(x_train, y_train)
y_predict = lr.predict(x_test)
y_predict

array([4, 4, 2, 4, 4, 4, 2, 2, 4, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2, 4, 2,
       4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 4, 2, 4, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 4, 2,
       4, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 2, 2,
       2, 2, 4, 4, 4], dtype=int64)

In [6]:
from sklearn.metrics import classification_report
print('Accuracy:', lr.score(x_test, y_test))
print(classification_report(y_test, y_predict, target_names=['Benign', 'Malignant']))

Accuracy: 0.9635036496350365
             precision    recall  f1-score   support

     Benign       0.97      0.98      0.97        89
  Malignant       0.96      0.94      0.95        48

avg / total       0.96      0.96      0.96       137

