In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


%matplotlib inline

In [38]:
columns = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance', 'social', 'health', 'classification']
df = pd.read_csv('nursery.data', header=None, names=columns)
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,classification
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [7]:
X = pd.get_dummies(df.drop('classification', axis=1))

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['classification'])

In [9]:
y_df = pd.DataFrame(y, columns=['target'])


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
model = SVC(kernel='linear', C = 1)

In [16]:
model.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [39]:
pred = model.predict(X_test)

In [18]:
model.score(X_test, y_test)

0.9344135802469136

In [20]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1320
          1       0.90      0.90      0.90      1272
          2       0.00      0.00      0.00         2
          3       0.90      0.92      0.91      1190
          4       0.88      0.64      0.74       104

avg / total       0.93      0.93      0.93      3888



  'precision', 'predicted', average, warn_for)


In [21]:
print(confusion_matrix(y_test, pred))

[[1320    0    0    0    0]
 [   0 1148    0  117    7]
 [   0    0    0    0    2]
 [   0   92    0 1098    0]
 [   0   37    0    0   67]]


In [25]:
param_grid = {'C': [0.1, 1, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear']}

In [27]:
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)

In [29]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV]  C=0.1, gamma=1, kernel=linear, score=0.9309090909090909, total=   0.2s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=linear, score=0.9103835978835979, total=   0.3s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s remaining:    0.0s


[CV]  C=0.1, gamma=1, kernel=linear, score=0.9219318557724115, total=   0.3s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.9309090909090909, total=   0.3s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.9103835978835979, total=   0.2s
[CV] C=0.1, gamma=0.1, kernel=linear .................................
[CV]  C=0.1, gamma=0.1, kernel=linear, score=0.9219318557724115, total=   0.2s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9309090909090909, total=   0.3s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9103835978835979, total=   0.2s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV]  C=0.1, gamma=0.001, kernel=linear, score=0.9219318557724115, total=   0.2s
[CV] C=0.1, gamma

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  4.2min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [31]:
svm_best = grid.best_estimator_
svm_best

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [32]:
pred_2 = grid.predict(X_test)

In [33]:
svm_best.score(X_test, y_test)

0.9344135802469136

In [35]:
grid.score(X_test, y_test)

0.9344135802469136

In [36]:


print(classification_report(y_test, pred_2))



             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1320
          1       0.90      0.90      0.90      1272
          2       0.00      0.00      0.00         2
          3       0.90      0.92      0.91      1190
          4       0.88      0.64      0.74       104

avg / total       0.93      0.93      0.93      3888



  'precision', 'predicted', average, warn_for)


In [37]:
print(confusion_matrix(y_test, pred_2))

[[1320    0    0    0    0]
 [   0 1148    0  117    7]
 [   0    0    0    0    2]
 [   0   92    0 1098    0]
 [   0   37    0    0   67]]
