In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC, LinearSVC

In [3]:
data=pd.read_csv("cancer_data.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null int64
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-

In [5]:
data.describe(include='all')

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [6]:
in_var=data.drop(['id','diagnosis'],axis=1).columns
out_var='diagnosis'
X=data[in_var]
Y=data[out_var]

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2,random_state=100)

In [7]:
def svc_param_linear(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='linear'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [8]:
svc_param_linear(X_train,Y_train,2)

{'C': 1, 'gamma': 0.001}

In [9]:
def svc_param_poly(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    deg_vals=[1,2]
    param_grid = {'C': Cs, 'gamma' : gammas,'degree':deg_vals}
    grid_search = GridSearchCV(svm.SVC(kernel='poly'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [10]:
svc_param_poly(X_train,Y_train,2)

{'C': 0.001, 'degree': 2, 'gamma': 0.01}

In [11]:
def svc_param_rbf(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    #deg_vals=[1,2,3,4]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [12]:
svc_param_rbf(X_train,Y_train,2)

{'C': 1, 'gamma': 0.001}

In [13]:
def svc_param_sigmoid(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    #deg_vals=[1,2,3,4]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='sigmoid'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [14]:
svc_param_sigmoid(X_train,Y_train,2)

{'C': 0.001, 'gamma': 0.001}

In [8]:
from sklearn.externals import joblib
model_linear=svm.SVC(kernel='linear',C=1,gamma=0.001)
model_linear.fit(X_train,Y_train)
model_poly=svm.SVC(kernel='poly',C=0.001,gamma=0.01,degree=2)
model_poly.fit(X_train,Y_train)
model_rbf=svm.SVC(kernel='rbf',C=1,gamma=0.001)
model_rbf.fit(X_train,Y_train)
model_sigmoid=svm.SVC(kernel='sigmoid',C=0.001,gamma=0.001)
model_sigmoid.fit(X_train,Y_train)
joblib.dump(model_linear,'model_linear_obj.sav')
joblib.dump(model_poly,'model_poly_obj.sav')
joblib.dump(model_rbf,'model_rbf_obj.sav')
joblib.dump(model_sigmoid,'model_sigmoid_obj.sav')

['model_sigmoid_obj.sav']

In [23]:
y_pred_linear=model_linear.predict(X_test)
y_pred_poly=model_poly.predict(X_test)
y_pred_rbf=model_rbf.predict(X_test)
y_pred_sigmoid=model_sigmoid.predict(X_test)

In [24]:
print('Accuracy score linear kernel:',accuracy_score(y_pred_linear,Y_test))
print('f1 score linear kernel:',f1_score(y_pred_linear,Y_test))
print('Recall score linear kernel:',recall_score(y_pred_linear,Y_test))
print('Precision score linear kernel:',precision_score(y_pred_linear,Y_test))
print('Accuracy score polynomial kernel:',accuracy_score(y_pred_poly,Y_test))
print('f1 score polynomial kernel:',f1_score(y_pred_poly,Y_test))
print('Recall score polynomial kernel:',recall_score(y_pred_poly,Y_test))
print('Precision score polynomial kernel:',precision_score(y_pred_poly,Y_test))
print('Accuracy score rbf kernel:',accuracy_score(y_pred_rbf,Y_test))
print('f1 score rbf kernel:',f1_score(y_pred_rbf,Y_test))
print('Recall score rbf kernel:',recall_score(y_pred_rbf,Y_test))
print('Precision score rbf kernel:',precision_score(y_pred_rbf,Y_test))
print('Accuracy score sigmoid kernel:',accuracy_score(y_pred_sigmoid,Y_test))
#print('f1 score sigmoid kernel:',f1_score(y_pred_sigmoid,Y_test))
#print('Recall score sigmoid kernel:',recall_score(y_pred_sigmoid,Y_test))
print('Precision score sigmoid kernel:',precision_score(y_pred_sigmoid,Y_test))

Accuracy score linear kernel: 0.956140350877193
f1 score linear kernel: 0.9484536082474228
Recall score linear kernel: 0.9583333333333334
Precision score linear kernel: 0.9387755102040817
Accuracy score polynomial kernel: 0.9824561403508771
f1 score polynomial kernel: 0.98
Recall score polynomial kernel: 0.9607843137254902
Precision score polynomial kernel: 1.0
Accuracy score rbf kernel: 0.9473684210526315
f1 score rbf kernel: 0.9423076923076923
Recall score rbf kernel: 0.8909090909090909
Precision score rbf kernel: 1.0
Accuracy score sigmoid kernel: 0.5701754385964912
Precision score sigmoid kernel: 0.0


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_pred_sigmoid,Y_test)

array([[65, 49],
       [ 0,  0]], dtype=int64)

In [20]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, y_pred_sigmoid))

              precision    recall  f1-score   support

           0       0.57      1.00      0.73        65
           1       0.00      0.00      0.00        49

    accuracy                           0.57       114
   macro avg       0.29      0.50      0.36       114
weighted avg       0.33      0.57      0.41       114



  'precision', 'predicted', average, warn_for)
