## Support Vector Machines

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm
import sklearn.metrics as metrics

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore")

In [80]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data)*test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [81]:
df = pd.read_csv("datasets/lr_data.csv")
df = df.drop(columns="Unnamed: 0")
# df_train = df.iloc[0:1500]
# df_test = df.iloc[1500:]
df_train, df_test = split_train_test(df, 0.2)

X_train = df_train[["tmv", "demand_supply_ratio", "price", "category_grouped", "month"]]
y_train = df_train["is_booked"]
X_test = df_test[["tmv", "demand_supply_ratio", "price", "category_grouped", "month"]]
y_test = df_test["is_booked"]

print(df.shape)
print(df_train.shape)
print(df_test.shape)
df.head()

(1890, 6)
(1512, 6)
(378, 6)


Unnamed: 0,is_booked,tmv,demand_supply_ratio,price,category_grouped,month
0,0,14569,1.902318,30.0,1,5
1,1,4201,14.622831,39.0,0,5
2,1,5724,8.659708,24.0,1,4
3,1,39102,13.57039,121.0,2,12
4,0,9666,1.297453,42.0,1,8


In [82]:
?svm.LinearSVC

Understanding parameters of LinearSVM <br>
C : This is the <xxxx> term. Smaller C means wider margin but more misclassifications.

In [83]:
model = svm.LinearSVC(C=1.464)
model.fit(X_train, y_train)
print(model.coef_)
print(model.intercept_)

[[ 5.91134897e-05  3.44237244e-02 -5.75643458e-03  3.81952508e-04
   7.28734963e-03]]
[0.00163836]


In [84]:
print(model.predict(X_test))
print(model.score(X_test, y_test))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1]
0.5582010582010583


In [85]:
metrics.confusion_matrix(model.predict(X_test), y_test)

array([[  0,   0],
       [167, 211]])

This is almost like a random guess and I suspect it's because the features in unscaled. Applying scaling might help.

In [86]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [96]:
svm_clf = Pipeline([("scaler", StandardScaler()), 
                   ("linear_svc", svm.LinearSVC(C=1, loss="hinge"))])
svm_clf.fit(X_train, y_train)

print(svm_clf.predict(X_test))
print(svm_clf.score(X_test, y_test))

[1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 0 0
 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0
 1 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0
 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1
 1 0 0 1 0 1 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 1 1
 1 0 0 0 0 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 0 1
 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 0 1 0 0 1 0 1 1 0 1 1 0 1 1 1 0
 1 1 1 1 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 0 1 1 1
 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0
 1 1 0 1 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1
 1 0 0 0 1 1 1 0]
0.6957671957671958


In [97]:
metrics.confusion_matrix(svm_clf.predict(X_test), y_test)

array([[106,  54],
       [ 61, 157]])

### Polynomial Kernel

In [45]:
model = svm.SVC()
model.fit(X_train, y_train)
# print(model.coef_)
# print(model.intercept_)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [46]:
print(model.predict(X_test))
print(model.score(X_test, y_test))

[1 1 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0
 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 0 1 0 1 0
 1 1 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 1 0 0 1 0 0 0 1
 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1
 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1
 0 1 1 1 1 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1
 1 0 1 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 0 0 1
 0 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1
 1 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1
 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 0 1 1 1 0 1 0 1 1 1 1
 1 1 1 1 1 0 1 1]
0.8359788359788359
