In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, auc, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

In [3]:
data = pd.read_csv('data/OJ.csv').drop(columns=['Unnamed: 0'])
data['Purchase_MM'] = [1 if x == 'MM' else 0 for x in data['Purchase']]
X = data.drop(columns=['Purchase_MM', 'Purchase', 'Store7'])
y = data['Purchase_MM']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=800, random_state=1)

In [4]:
svm = SVC(kernel='linear', C=0.01)
svm.fit(X_train, y_train)

print('Train Accuracy: {:.4f}'.format(svm.score(X_train, y_train)))
print('Test  Accuracy: {:.4f}'.format(svm.score(X_test, y_test)))

Train Accuracy: 0.6900
Test  Accuracy: 0.6407


In [5]:
tuned_parameters = [{'C': [0.01, 0.1, 1, 10]}]
clf = GridSearchCV(SVC(kernel='linear'), tuned_parameters, cv=5, scoring='accuracy')
clf.fit(X, y)
clf.best_params_

{'C': 1}

In [6]:
svm = SVC(kernel='linear', C=0.01)
svm.fit(X_train, y_train)

print('Train Accuracy: {:.4f}'.format(svm.score(X_train, y_train)))
print('Test  Accuracy: {:.4f}'.format(svm.score(X_test, y_test)))

Train Accuracy: 0.6900
Test  Accuracy: 0.6407


In [None]:
svm = SVC(kernel='rbf', C=1000, gamma=0.001)
svm.fit(X_train, y_train)

print('Train Accuracy: {:.4f}'.format(svm.score(X_train, y_train)))
print('Test  Accuracy: {:.4f}'.format(svm.score(X_test, y_test)))

In [7]:
tuned_parameters = [{'C': [0.01, 0.1, 1, 5, 10, 100, 1000], 'gamma': [0.001, 0.01, 0.1, 1, 5, 10, 100]}]
clf = GridSearchCV(SVC(kernel='rbf'), tuned_parameters, cv=10, scoring='accuracy')
clf.fit(X, y)
clf.best_params_

{'C': 1000, 'gamma': 0.001}

In [None]:
svm = SVC(kernel='poly', degree=2, C=10000)
svm.fit(X_train, y_train)

print('Train Accuracy: {:.4f}'.format(svm.score(X_train, y_train)))
print('Test  Accuracy: {:.4f}'.format(svm.score(X_test, y_test)))

In [8]:
tuned_parameters = [{'C': [0.01, 0.1, 1, 5, 10, 100, 500, 1000, 5000, 10000]}]
clf = GridSearchCV(SVC(kernel='poly', degree=2), tuned_parameters, cv=10, scoring='accuracy')
clf.fit(X, y)
clf.best_params_

{'C': 10000}

### Best result