In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [33]:
df = pd.read_csv('SurveyLargeDataSet.csv')

In [34]:
df.keys()

Index(['q1', 'q2', 'q3', 'q4', 'q5', 'q8', 'q9', 'q11', 'q12', 'q15', 'q16',
       'q17', 'q18', 'q21', 'q22', 'q23', 'q24', 'q26', 'q34', 'q37', 'q38',
       'q39', 'q44', 'q49', 'q60', 'q64', 'q65', 'q66', 'q81', 'q82', 'q84',
       'q88', 'q89', 'q97'],
      dtype='object')

In [35]:
df.head(2)

Unnamed: 0,q1,q2,q3,q4,q5,q8,q9,q11,q12,q15,...,q60,q64,q65,q66,q81,q82,q84,q88,q89,q97
0,5,2,3,2,3,2,5,1,1,1,...,1,3,2,4,5,7,1,4,1,1
1,5,1,3,1,4,2,5,1,1,1,...,2,1,1,1,-1,1,1,5,0,1


In [36]:
from sklearn.cross_validation import train_test_split

In [37]:
X = df.drop('q89',axis=1)
y = df['q89']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state = 101)

In [38]:
from sklearn.svm import SVC

In [39]:
model = SVC()

In [40]:
model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
predictions = model.predict(X_test)

In [42]:
from sklearn.metrics import classification_report,confusion_matrix

In [43]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[820 283]
 [288 619]]


             precision    recall  f1-score   support

          0       0.74      0.74      0.74      1103
          1       0.69      0.68      0.68       907

avg / total       0.72      0.72      0.72      2010



In [44]:
from sklearn.model_selection import GridSearchCV

In [45]:
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']} 

In [46]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [47]:
grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.554702, total=   2.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.555058, total=   1.9s
[CV] C=0.1, gamma=1, kernel=rbf ......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.8s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=rbf, score=0.555058, total=   1.8s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.554702, total=   1.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.555058, total=   1.4s
[CV] C=0.1, gamma=0.1, kernel=rbf ....................................
[CV] ..... C=0.1, gamma=0.1, kernel=rbf, score=0.555058, total=   1.5s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.680102, total=   1.4s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.676697, total=   1.3s
[CV] C=0.1, gamma=0.01, kernel=rbf ...................................
[CV] .... C=0.1, gamma=0.01, kernel=rbf, score=0.677977, total=   1.3s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  4.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [48]:
grid.best_params_

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

In [49]:
grid.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [50]:
grid_predictions = grid.predict(X_test)

In [51]:
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[820 283]
 [288 619]]


             precision    recall  f1-score   support

          0       0.74      0.74      0.74      1103
          1       0.69      0.68      0.68       907

avg / total       0.72      0.72      0.72      2010

