In [61]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
%matplotlib inline

warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE


In [62]:
hcv =pd.read_csv('/Users/tokarev/Documents/programmes/DataScale/Machine Learning/Projet/Data/hcvdat0.csv')

In [63]:
mapping_cat = {'0=Blood Donor' : 0, '1=Hepatitis' : 1, '2=Fibrosis' : 2, '3=Cirrhosis' : 3}
mapping_sex = {'m' : 0, 'f' :1}
hcv = hcv.replace({'Category': mapping_cat})
hcv = hcv.replace({'Sex': mapping_sex})

In [64]:
#HANDLING NAN VALUES
blooddonors = hcv.loc[hcv['Category'] == 0]
hepatitis = hcv.loc[hcv['Category'] == 1]
fibrosis = hcv.loc[hcv['Category'] == 2]
cirrhosis = hcv.loc[hcv['Category'] == 3]
means_bd = []
means_hep = []
means_fib = []
means_cir = []
for i in range(4,len(hcv.columns)) :
    #print("%s : %d " % (hcv.columns[i], i-4))
    means_bd.append(hcv.loc[hcv['Category'] == 0][hcv.columns[i]].mean())
    means_hep.append(hcv.loc[hcv['Category'] == 1][hcv.columns[i]].mean())
    means_fib.append(hcv.loc[hcv['Category'] == 2][hcv.columns[i]].mean())
    means_cir.append(hcv.loc[hcv['Category'] == 3][hcv.columns[i]].mean())
    blooddonors[blooddonors.columns[i]] = blooddonors[blooddonors.columns[i]].fillna(value = means_bd[i-4])
    hepatitis[hepatitis.columns[i]] = hepatitis[hepatitis.columns[i]].fillna(value = means_hep[i-4])
    fibrosis[fibrosis.columns[i]] = fibrosis[fibrosis.columns[i]].fillna(value = means_fib[i-4])
    cirrhosis[cirrhosis.columns[i]] = cirrhosis[cirrhosis.columns[i]].fillna(value = means_cir[i-4])
frames = [blooddonors, hepatitis, fibrosis, cirrhosis]
hcv = pd.concat(frames)

In [65]:
Y = hcv.Category
X = hcv.iloc[:,2:]

In [66]:
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25, random_state=4)

In [67]:
from sklearn import svm

In [68]:
calssifier = svm.SVC(kernel='linear', gamma='auto', C=2)

In [69]:
calssifier.fit(x_train, y_train)
y_predict = calssifier.predict(x_test)

In [70]:
print(accuracy_score(y_test, y_predict))
pd.crosstab(y_test, y_predict)

0.9144736842105263


col_0,0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,133,0,0,1
1,0,3,4,1
2,0,4,1,0
3,1,1,1,2


In [71]:
from sklearn.metrics import classification_report

In [72]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       134
           1       0.38      0.38      0.38         8
           2       0.17      0.20      0.18         5
           3       0.50      0.40      0.44         5

    accuracy                           0.91       152
   macro avg       0.51      0.49      0.50       152
weighted avg       0.92      0.91      0.92       152



In [73]:
smote = SMOTE()
x_smote, y_smote = smote.fit_sample(X, Y)
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(x_smote, y_smote,test_size=0.3, random_state=1)

In [74]:
calssifier2 = svm.SVC(kernel='linear', gamma='auto', C=2)
calssifier2.fit(x_train_smote, y_train_smote)
y_predict2 = calssifier2.predict(x_test_smote)
print(classification_report(y_test_smote, y_predict2))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       156
           1       0.87      0.81      0.84       171
           2       0.87      0.87      0.87       155
           3       1.00      1.00      1.00       158

    accuracy                           0.91       640
   macro avg       0.91      0.91      0.91       640
weighted avg       0.91      0.91      0.91       640



In [75]:
print(accuracy_score(y_test_smote, y_predict2))
pd.crosstab(y_test_smote, y_predict2)

0.9125


col_0,0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,153,1,2,0
1,14,138,19,0
2,0,20,135,0
3,0,0,0,158


In [86]:
from sklearn.model_selection import cross_val_score
calssifier3 = svm.SVC(kernel='linear', gamma='auto', C=1)
scores = cross_val_score(calssifier3, x_smote, y_smote, cv=5)

In [87]:
print(scores)

[0.88290398 0.90163934 0.92957746 0.90140845 0.9342723 ]


In [88]:
from sklearn import metrics
scores = cross_val_score(calssifier3, x_smote, y_smote, cv=5)

In [89]:
scores

array([0.88290398, 0.90163934, 0.92957746, 0.90140845, 0.9342723 ])

In [90]:
#grid
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', 'linear']} 

In [91]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

In [92]:
#grid.fit(x_train_smote, y_train_smote) 

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.251, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.251, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.262, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.275, total=   0.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.285, total=   0.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.923, total=   0.2s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.893, total=   0.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.943, total=   0.1s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.956, total=   0.2s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.930, total=   0.1s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] .

[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.567, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.604, total=   0.1s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.611, total=   0.1s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ....... C=1, gamma=0.1, kernel=linear, score=0.920, total=   1.1s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ....... C=1, gamma=0.1, kernel=linear, score=0.903, total=   0.9s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ....... C=1, gamma=0.1, kernel=linear, score=0.943, total=   1.0s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] ....... C=1, gamma=0.1, kernel=linear, score=0.956, total=   0.9s
[CV] C=1, gamma=0.1, kernel=linear ...................................
[CV] .

[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.879, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.909, total=   0.1s
[CV] C=10, gamma=0.01, kernel=rbf ....................................
[CV] ........ C=10, gamma=0.01, kernel=rbf, score=0.886, total=   0.1s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.923, total=   5.4s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.900, total=   4.2s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.943, total=   8.7s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] ..... C=10, gamma=0.01, kernel=linear, score=0.950, total=  11.0s
[CV] C=10, gamma=0.01, kernel=linear .................................
[CV] .

[CV] ... C=100, gamma=0.001, kernel=linear, score=0.923, total= 1.5min
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.903, total=  39.2s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.936, total=  37.8s
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.956, total= 2.1min
[CV] C=100, gamma=0.001, kernel=linear ...............................
[CV] ... C=100, gamma=0.001, kernel=linear, score=0.919, total= 1.8min
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=1.000, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] ..... C=100, gamma=0.0001, kernel=rbf, score=0.967, total=   0.0s
[CV] C=100, gamma=0.0001, kernel=rbf .................................
[CV] .

[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.906, total= 1.0min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.903, total= 1.3min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.943, total= 1.5min
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.953, total=  49.8s
[CV] C=1000, gamma=0.0001, kernel=linear .............................
[CV] . C=1000, gamma=0.0001, kernel=linear, score=0.909, total=  57.9s


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed: 63.8min finished


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']},
             verbose=3)

In [94]:
# print best parameter after tuning 
print(grid.best_params_) 
  
# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_)

{'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=100, gamma=0.0001)


In [100]:

from sklearn.model_selection import cross_val_score
calssifier4 = svm.SVC(kernel='rbf', gamma=0.0001, C=1)
scores4 = cross_val_score(calssifier4, x_smote, y_smote, cv=5)
print(scores4)


[0.95784543 0.96721311 0.95539906 0.9741784  0.97183099]


              precision    recall  f1-score   support

           0       0.92      0.98      0.95       156
           1       0.87      0.81      0.84       171
           2       0.87      0.87      0.87       155
           3       1.00      1.00      1.00       158

    accuracy                           0.91       640
   macro avg       0.91      0.91      0.91       640
weighted avg       0.91      0.91      0.91       640



In [41]:
smote = SMOTE()
x_smote, y_smote = smote.fit_sample(X, Y)
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(x_smote, y_smote,test_size=0.3, random_state=1)
from collections import Counter
print("Before SMOTE : ", Counter(y_train))
print("After SMOTE : ", Counter(y_train_smote))

Before SMOTE :  Counter({0: 399, 3: 25, 2: 16, 1: 16})
After SMOTE :  Counter({2: 378, 0: 377, 3: 375, 1: 362})
