In [6]:
import pandas as pd
import numpy as np

### read dataset

In [7]:
path="https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']
data=pd.read_csv(path,names=column_name)

In [8]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


missing value handling

In [9]:
#replace np.nan
data=data.replace(to_replace="?",value=np.nan)
#delete lack of samples
data.dropna(inplace=True)

In [10]:
data.isnull().any()

Sample code number             False
Clump Thickness                False
Uniformity of Cell Size        False
Uniformity of Cell Shape       False
Marginal Adhesion              False
Single Epithelial Cell Size    False
Bare Nuclei                    False
Bland Chromatin                False
Normal Nucleoli                False
Mitoses                        False
Class                          False
dtype: bool

# split the dataset

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [13]:
#filter feature and target
x=data.iloc[:,1:-1]
y=data["Class"]

In [14]:
x.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [15]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Class, dtype: int64

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y)

In [17]:
x_train.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
664,3,1,1,3,2,1,2,1,1
19,6,1,1,1,2,1,3,1,1
245,5,1,1,2,2,2,3,1,1
261,5,10,10,6,10,10,10,6,5
360,6,10,10,10,10,10,8,10,10


feature engineering

In [18]:
#standardScaler
from sklearn.preprocessing import StandardScaler

In [19]:
transfer=StandardScaler()
x_train=transfer.fit_transform(x_train)
x_test=transfer.transform(x_test)

In [20]:
x_train

array([[-0.51107572, -0.70620006, -0.74903553, ..., -0.60360963,
        -0.62393037, -0.35326195],
       [ 0.5598824 , -0.70620006, -0.74903553, ..., -0.19373147,
        -0.62393037, -0.35326195],
       [ 0.20289636, -0.70620006, -0.74903553, ..., -0.19373147,
        -0.62393037, -0.35326195],
       ...,
       [-0.51107572, -0.70620006, -0.74903553, ..., -1.01348778,
        -0.62393037, -0.35326195],
       [ 1.27385448,  1.23949195,  0.57226057, ...,  0.62602484,
         0.66939898,  1.35844264],
       [-1.22504779, -0.70620006, -0.74903553, ..., -1.01348778,
        -0.62393037, -0.35326195]])

In [21]:
from sklearn.linear_model import LogisticRegression

estimator

In [22]:
estimator=LogisticRegression()
estimator.fit(x_train,y_train)

In [23]:
#logistic regression
estimator.coef_

array([[1.27388372, 0.11809421, 0.8652993 , 0.83956798, 0.47644625,
        1.3299373 , 0.83290642, 0.48399454, 0.53954601]])

In [24]:
estimator.intercept_

array([-1.15197345])

model evaluation

In [25]:
#6.model evaluation
y_predict = estimator.predict(x_test)
print("y_predict:\n", y_predict)
print("Compare the actual and predicted values:\n", y_test == y_predict)
# method2 Calculate accuracy
score = estimator.score(x_test, y_test)
print("accuracy:\n", score)

y_predict:
 [4 2 4 2 2 4 2 4 2 2 2 2 2 2 2 2 2 4 4 2 4 4 4 2 4 4 2 4 2 4 2 2 2 4 2 4 4
 2 2 2 4 2 2 2 2 4 2 4 2 2 4 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 4 2 4 4 4 4 2 2
 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 4 4 2 2 2 4 4 2 2 2 2 4 4 2 2 4 2 2 4 2
 2 4 2 2 2 2 4 2 4 2 4 2 2 2 4 4 2 2 2 2 2 4 2 2 4 4 4 4 2 2 4 2 4 4 2 2 4
 2 2 4 2 2 2 2 2 4 2 2 2 2 2 4 2 2 2 2 2 4 2 4]
Compare the actual and predicted values:
 391    True
502    True
41     True
398    True
634    True
       ... 
395    True
396    True
612    True
443    True
467    True
Name: Class, Length: 171, dtype: bool
accuracy:
 0.9707602339181286


In [26]:
#accurate rate  recall
from sklearn.metrics import classification_report

In [27]:
report=classification_report(y_test,y_predict,labels=[2,4],target_names=["benign","malignance"])

In [28]:
print(report)

              precision    recall  f1-score   support

      benign       0.97      0.99      0.98       113
  malignance       0.98      0.93      0.96        58

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171



In [29]:
y_test.head()

391    4
502    2
41     4
398    2
634    2
Name: Class, dtype: int64

In [30]:
#y_test transform 0 1
y_true=np.where(y_test>3,1,0)

In [31]:
y_true

array([1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1])

In [32]:
from sklearn.metrics import roc_auc_score


In [33]:
roc_auc_score(y_true,y_predict)

0.9610924626182483