### Exploration of SVM algorithm with python

In [24]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for statistical data visualization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
%matplotlib inline

In [2]:
df = pd.read_csv('./data/pulsar_stars.csv')
df.head()

Unnamed: 0,Mean of the integrated profile,Standard deviation of the integrated profile,Excess kurtosis of the integrated profile,Skewness of the integrated profile,Mean of the DM-SNR curve,Standard deviation of the DM-SNR curve,Excess kurtosis of the DM-SNR curve,Skewness of the DM-SNR curve,target_class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [4]:
X = df.drop('target_class',axis=1)
y = df['target_class']
print(X.shape)
print(y.shape)

(17898, 8)
(17898,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape,X_test.shape)

(14318, 8) (3580, 8)


In [8]:
cols = X_train.columns
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=cols)
X_test = pd.DataFrame(scaler.fit_transform(X_test),columns=cols)

### Perform Support Vector Machine with default hyperparameterization

In [11]:
svm1 = SVC()
svm1.fit(X_train,y_train)
y_pred = svm1.predict(X_test)
print("The accuracy of base model is {0:0.6f}".format(accuracy_score(y_test,y_pred)))

The accuracy of base model is 0.979050


### Run SVM with linear kernel and C=1000.0

In [16]:
svm2 = SVC(kernel='linear',C=1000)
svm2.fit(X_train,y_train)
y_pred = svm2.predict(X_test)
print("The accuracy of base model is {0:0.6f}".format(accuracy_score(y_test,y_pred)))

The accuracy of base model is 0.979330


In [17]:
#Check for underfitting or overfitting
y_train_pred = svm2.predict(X_train)
print("The accuracy of base model is {0:0.6f}".format(accuracy_score(y_train,y_train_pred)))
print("The accuracy of base model is {0:0.6f}".format(accuracy_score(y_test,y_pred)))

The accuracy of base model is 0.979397
The accuracy of base model is 0.979330


### Run svm with poly kernel and C=1000.0

In [18]:
svm3 = SVC(kernel='poly',C=1000)
svm3.fit(X_train,y_train)
y_pred = svm3.predict(X_test)
print("The accuracy of base model is {0:0.6f}".format(accuracy_score(y_test,y_pred)))

The accuracy of base model is 0.980168


### Run kfold vaildation

In [23]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
linear_svc = SVC(kernel='linear',C=10)
linear_scores = cross_val_score(linear_svc,X,y,cv=kfold)
print(linear_scores)

[0.97932961 0.97793296 0.97960894 0.9782062  0.98100028]


## Run Grid Search with KFold

In [25]:
svc_grid = SVC()
parameters=[
    {'C':[1,10,100],'kernel':['linear']},
    {'C':[1,10,100],'kernel':['rbf'],'gamma':[0.1,0.2,0.3,0.4,0.5]},
    {'C':[1,10,100],'kernel':['poly'],'degree':[2,3],'gamma':[0.01,0.02,0.03,0.04,0.05]}
]
gridsearch = GridSearchCV(
            estimator=svc_grid,
            param_grid=parameters,
            scoring ='accuracy',
            verbose=0,
            cv=kfold

)

gridsearch.fit(X,y)