# N-Grams + SVM

In [1]:
import json
import scipy
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
import seaborn as sns

In [2]:
# Locations of the raw JSON inputs, relative to the notebook:
# the labelled training reviews and the unlabelled test reviews.
raw_path = "../data/raw/data_train.json"
raw_test_path = "../data/raw/data_test_wo_label.json"

In [3]:
# Load the labelled training records. The context manager guarantees the file
# handle is closed even if parsing fails, and the explicit encoding avoids
# depending on the platform default when decoding the JSON text.
with open(raw_path, 'r', encoding='utf-8') as dataset_f:
    dataset = json.load(dataset_f)

# Review bodies, in the same order as the records in `dataset`.
reviews = [item['text'] for item in dataset]

In [4]:
# Star rating of every record, aligned index-for-index with the records in `dataset`.
ratings = [record['stars'] for record in dataset]

In [5]:
# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at the 1000 highest-ranked terms.
# The corpus is NOT a constructor argument: the first constructor parameter is
# `input` (how to interpret the data passed to fit/transform), and recent
# scikit-learn releases accept keyword arguments only. The reviews are
# supplied later via fit()/transform().
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2),
                             max_features=1000)

In [6]:
# Learn the vocabulary and IDF weights from the training reviews.
# fit() returns the vectorizer itself, not a feature matrix, so the result is
# deliberately not bound to a data variable; the trailing `;` hides the repr.
vectorizer.fit(reviews);

In [7]:
# Turn the training reviews into TF-IDF feature rows using the fitted vectorizer.
X_train = vectorizer.transform(reviews)

In [8]:
# Dense (array) copy of the TF-IDF matrix; this is what the sampling and
# train/validation split cells below operate on.
X_train_a = X_train.toarray()

In [9]:
# Target vector: the star ratings as a NumPy array.
y_train = np.array(ratings)

In [None]:
# Sanity check: show the container type of the dense feature matrix.
type(X_train_a)

Use the first 3342 rows of the training data as a sample for hyperparameter optimization.

In [None]:
# Leading 3342 rows of features and labels, used only for the hyperparameter search.
X_sam, y_sam = X_train_a[:3342], y_train[:3342]

In [None]:
# Search space for the randomized search: C and gamma are drawn from
# exponential distributions (scale 100 and 0.1); the kernel is fixed to RBF.
params_grid = dict(
    C=scipy.stats.expon(scale=100),
    gamma=scipy.stats.expon(scale=0.1),
    kernel=['rbf'],
)

In [None]:
# Number of hyperparameter settings to sample; passed as `n_iter` to RandomizedSearchCV.
MAX_ITER = 20

In [None]:
# Randomized hyperparameter search for the SVC: 5-fold cross-validation,
# scored by accuracy, MAX_ITER sampled candidates, 4 parallel workers.
# A fixed random_state makes the sampled (C, gamma) candidates repeatable,
# so re-running the notebook explores the same settings.
search = RandomizedSearchCV(SVC(), params_grid, cv=5,
                            n_jobs=4,
                            scoring='accuracy',
                            n_iter=MAX_ITER,
                            random_state=42,
                            verbose=10)

In [None]:
# Run the randomized search on the sampled features and labels.
search.fit(X_sam, y_sam)

In [None]:
# Report the winning hyperparameters and their cross-validated score, one per line.
print(search.best_params_, search.best_score_, sep='\n')

In [None]:
# Keep the best hyperparameters found by the search.
# NOTE(review): the following cell rebinds `best` to hard-coded values, so on a
# top-to-bottom run this assignment is immediately overridden.
best = search.best_params_

In [10]:
# Hard-coded hyperparameters used to build the final model.
# presumably copied from an earlier search run; confirm before reuse.
best = dict(
    C=6.350178585169132,
    gamma=0.10966077531932605,
    kernel='rbf',
)

In [None]:
# Alternative hard-coded hyperparameter set.
# presumably from another search run; it is not used elsewhere in the visible cells.
best_2 = dict(
    C=4.473104010959948,
    gamma=0.10438022374558996,
    kernel='rbf',
)

In [None]:
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 20% of the dense features for validation.
# The labels are rebuilt from `ratings` rather than read from `y_train`, so the
# cell is idempotent: re-running it always splits the full label vector instead
# of an already-split `y_train` whose length no longer matches `X_train_a`.
# A fixed random_state makes the split (and the validation score) repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_a, np.array(ratings), test_size=0.2, random_state=42
)

In [11]:
# Build the SVC from the chosen hyperparameters; the verbose setting turns on
# the solver's progress output during fit.
b_model = SVC(**best, verbose = 100)

Verify that the model was built with the intended hyperparameters.

In [None]:
# Display the estimator's repr to inspect its hyperparameters.
b_model

In [12]:
%%timeit

b_model.fit(X_train[:33000], y_train[:33000])

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]3min 38s ± 8.92 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Predict ratings for the held-out validation rows.
y_pred = b_model.predict(X_val)

In [17]:
# Validation accuracy. Arguments follow the documented (y_true, y_pred) order;
# accuracy is symmetric so the value is unchanged, but the order matters as
# soon as this call is swapped for an asymmetric metric.
accuracy_score(y_val, y_pred)

0.6431445280366144

In [12]:
from joblib import dump, load

In [20]:
# Persist the fitted model to disk so it can be reloaded without retraining.
dump(b_model, 'tuned_svm.joblib')

['tuned_svm.joblib']

In [13]:
# Reload the persisted model. The filename must match the one written by
# dump() exactly ('tuned_svm.joblib'); a different letter case only works on
# case-insensitive filesystems.
# NOTE: joblib files are pickle-based; only load files you created or trust.
tuned_svm = load('tuned_svm.joblib')

In [24]:
# Load the unlabelled test records. The context manager guarantees the file
# handle is closed even if parsing fails, and the explicit encoding avoids
# depending on the platform default when decoding the JSON text.
with open(raw_test_path, 'r', encoding='utf-8') as test_dataset_f:
    test_dataset = json.load(test_dataset_f)

# Review bodies of the test set, in file order.
test_reviews = [item['text'] for item in test_dataset]

In [25]:
# TF-IDF features for the test reviews, using the vocabulary learned on the
# training set. The result is densified to match the dense arrays produced by
# the train/validation split: an SVC trained on dense data rejects sparse
# input at predict time, while dense input is accepted either way.
X_test = vectorizer.transform(test_reviews).toarray()

In [28]:
# Display the reloaded model to confirm its hyperparameters.
tuned_svm

SVC(C=6.350178585169132, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.10966077531932605,
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=100)

In [29]:
# Predict ratings for the test reviews.
# NOTE(review): this reuses the name `y_pred`, replacing the validation
# predictions computed earlier in the notebook.
y_pred = tuned_svm.predict(X_test)

In [30]:
# Display the predicted test labels.
y_pred

array([5., 5., 5., ..., 4., 5., 4.])

In [31]:
# Check how many predictions were produced.
y_pred.shape

(50000,)

In [34]:
# Wrap the test predictions in a one-column frame named 'Predictions'.
submission = pd.DataFrame(dict(Predictions=y_pred))

In [36]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,Predictions
0,5.0
1,5.0
2,5.0
3,5.0
4,5.0


In [None]:
# Predict on the training features with the reloaded model.
# NOTE(review): `X_train` is bound twice in this notebook (the TF-IDF matrix,
# then the training part of the train/validation split); confirm which one is
# intended here.
train_predictions_svm = tuned_svm.predict(X_train)

In [None]:
# Write the training-set predictions to a plain-text file.
# The NumPy function is `savetxt` (no underscore) and takes the output
# filename first, then the array.
np.savetxt('train_predictions_svm', train_predictions_svm)

In [37]:
# Write the test predictions to CSV without the DataFrame index column.
submission.to_csv('predictions_svm.csv', index = False)