Цель: Реализовать SVM и Logistic Regression для данного датасета: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original) Метрики оценки: Accuracy, Precision, Recall, F1-Score

In [98]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV

In [67]:
df = pd.read_csv('breast-cancer-wisconsin.data', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [68]:
columns = ['Sample code number',
           'Clump Thickness',
           'Uniformity of Cell Size',
           'Uniformity of Cell Shape',
           'Marginal Adhesion',
           'Single Epithelial Cell Size',
           'Bare Nuclei',
           'Bland Chromatin',
           'Normal Nucleoli',
           'Mitoses',
           'Class']
names = dict(zip(df.columns, columns))
names

{0: 'Sample code number',
 1: 'Clump Thickness',
 2: 'Uniformity of Cell Size',
 3: 'Uniformity of Cell Shape',
 4: 'Marginal Adhesion',
 5: 'Single Epithelial Cell Size',
 6: 'Bare Nuclei',
 7: 'Bland Chromatin',
 8: 'Normal Nucleoli',
 9: 'Mitoses',
 10: 'Class'}

In [69]:
df.set_index(0, inplace=True)

In [70]:
df[6].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: 6, dtype: int64

In [74]:
df = df[df[6]!='?']
df[6] = df[6].astype('int')

  result = method(y)


In [72]:
df.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
count,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0,683.0
mean,4.442167,3.150805,3.215227,2.830161,3.234261,3.544656,3.445095,2.869693,1.603221,2.699854
std,2.820761,3.065145,2.988581,2.864562,2.223085,3.643857,2.449697,3.052666,1.732674,0.954592
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,6.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [73]:
df_X = df.drop(10, axis=1)
df_y = df[10]

scaler = StandardScaler()
scaler.fit_transform(df_X, df_y)
X_scaled = scaler.transform(df_X)

In [140]:
transform = PolynomialFeatures(10)
transform.fit_transform(df_X)
X_poly = transform.transform(df_X)

In [141]:
(X_tr_o, X_ts_o, y_tr_o, y_ts_o ) = train_test_split(df_X, df_y, stratify=df_y, test_size= 0.3)
(X_tr_sc, X_ts_sc, y_tr_sc, y_ts_sc) = train_test_split(X_scaled, df_y, stratify = df_y, test_size = 0.30)
(X_tr_p, X_ts_p, y_tr_p, y_ts_p ) = train_test_split(X_poly, df_y, stratify=df_y, test_size= 0.3)

## LogisticRegression

In [142]:
estimator = LogisticRegression()
paramgrid = {'C': [0.01, 0.05, 0.1, 0.5, 1], 'penalty': ['l1','l2']}
optimizer = GridSearchCV(estimator, paramgrid, cv=10)

In [143]:
optimizer.fit(X_tr_o, y_tr_o)
lr_predictions_o = optimizer.best_estimator_.predict(X_ts_o)

In [144]:
optimizer.fit(X_tr_sc, y_tr_sc)
lr_predictions_sc = optimizer.best_estimator_.predict(X_ts_sc)

In [147]:
print('Metrics for Logistic Regression for original input values')
print('Accuracy score', round(accuracy_score(y_ts_o, lr_predictions_o),2))
print(classification_report(y_ts_o, lr_predictions_o), '\n')

print('Metrics for Logistic Regression for scaled input values')
print('Accuracy score', round(accuracy_score(y_ts_o, lr_predictions_sc),2))
print(classification_report(y_ts_sc, lr_predictions_sc), '\n')

Metrics for Logistic Regression for original input values
Accuracy score 0.96
             precision    recall  f1-score   support

          2       0.96      0.97      0.97       133
          4       0.94      0.93      0.94        72

avg / total       0.96      0.96      0.96       205
 

Metrics for Logistic Regression for scaled input values
Accuracy score 0.47
             precision    recall  f1-score   support

          2       0.98      0.94      0.96       133
          4       0.90      0.97      0.93        72

avg / total       0.95      0.95      0.95       205
 



## SVM

In [151]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001]}

grid_o = GridSearchCV(SVC(), param_grid, refit=True, verbose=2)
grid_o.fit(X_tr_o, y_tr_o)
svm_predictions_o = grid_o.best_estimator_.predict(X_ts_o)

grid_sc = GridSearchCV(SVC(),param_grid,refit=True,verbose=2)
grid_sc.fit(X_tr_sc, y_tr_sc)
svm_predictions_sc = grid_sc.best_estimator_.predict(X_ts_sc)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ................................... C=0.1, gamma=1, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ................................. C=0.1, gamma=0.1, total=   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...........

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] .................................. C=10, gamma=0.1, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] .................................. C=10, gamma=0.1, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.01 ................................................
[CV] ................................. C=10, gamma=0.01, total=   0.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] ................................ C=10, gamma=0.001, total=   0.0s
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] ................................. C=1, gamma=0.001, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] .................................... C=10, gamma=1, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] .................................... C=10, gamma=1, total=   0.0s
[CV] C=10, gamma=1 ...................................................
[CV] .................................... C=10, gamma=1, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] .................................. C=10, gamma=0.1, total=   0.0s
[CV] C=10, gamma=0.1 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:    0.2s finished


In [152]:
print('Metrics for SVM for original input values')
print('Accuracy score', round(accuracy_score(y_ts_o, svm_predictions_o),2))
print(classification_report(y_ts_o, svm_predictions_o), '\n')

print('Metrics for SVM for scaled input values')
print('Accuracy score', round(accuracy_score(y_ts_o, svm_predictions_sc),2))
print(classification_report(y_ts_sc, svm_predictions_sc), '\n')

Metrics for SVM for original input values
Accuracy score 0.96
             precision    recall  f1-score   support

          2       0.97      0.96      0.97       133
          4       0.93      0.94      0.94        72

avg / total       0.96      0.96      0.96       205
 

Metrics for SVM for scaled input values
Accuracy score 0.47
             precision    recall  f1-score   support

          2       0.99      0.94      0.97       133
          4       0.90      0.99      0.94        72

avg / total       0.96      0.96      0.96       205
 

