In [1]:
import sqlite3
import warnings
from contextlib import closing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.exceptions import DataConversionWarning, UndefinedMetricWarning
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DataConversionWarning)
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
%config InlineBackend.figure_format = 'retina'

In [2]:
# List the database files available in ../Databases.
!ls ../Databases

reviewsV1db  reviewsV1.db  reviewsV2.db  reviewsV3.db


In [3]:
# sqlite3's connection context manager only commits or rolls back the open
# transaction; it does not close the connection. Wrapping the connection in
# closing() releases the database handle as soon as the table has been read.
with closing(sqlite3.connect('../Databases/reviewsV1.db')) as conn:
    # Load the whole Review table into memory as a DataFrame.
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [4]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Polarity
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1303862400,Good Quality Dog Food,good stew smells bought looks vitality appreci...,positive
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1346976000,Not as Advertised,small product peanuts arrived labeled jumbo si...,negative
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1219017600,"""Delight"" says it all",treat liberally citrus brother nuts highly cas...,positive
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,1307923200,Cough Medicine,secret ingredient flavor root cherry looking o...,negative
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1350777600,Great taffy,taffy wide delivery assortment great yummy,positive


In [5]:
# Order the reviews chronologically (ascending Time) and renumber the rows
# 0..n-1, so that the split below is time-based: the earliest 70% of the rows
# are used for training and the remainder for testing.
data = data.sort_values(by='Time').reset_index(drop=True)
TRAIN_SIZE = int(len(data) * 0.7)
TEST_SIZE = len(data) - TRAIN_SIZE

In [6]:
# Number of rows in the training split (70% of all rows, rounded down).
TRAIN_SIZE

254883

In [7]:
# Number of rows in the test split (all rows not assigned to training).
TEST_SIZE

109236

In [8]:
# Chronological hold-out split: the first TRAIN_SIZE rows (oldest reviews)
# form the training set, the remaining rows (newest reviews) the test set.
data_train = data.iloc[:TRAIN_SIZE]
data_test = data.iloc[TRAIN_SIZE:]

In [9]:
# Sanity checks on the split: both parts have the expected size and, because
# the data is sorted by Time, the earliest timestamp is the first training row
# and the latest timestamp is the last test row.
assert data_train.shape[0] == TRAIN_SIZE
assert data_test.shape[0] == TEST_SIZE
assert data_test.Time.iloc[-1] == data.Time.max()
assert data_train.Time.iloc[0] == data.Time.min()

## 2. Featurization

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cunt = CountVectorizer()
cunt.fit(data_train.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
Dtrain = cunt.transform(data_train.Text)
Dtrain.get_shape()

(254883, 80740)

In [13]:
Dtest = cunt.transform(data_test.Text)
Dtest.get_shape()

(109236, 80740)

## 3. Scaling the data

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Scale each feature without centering it (with_mean=False).
# NOTE(review): centering is presumably disabled because the bag-of-words
# matrices are sparse — confirm.
sclr = StandardScaler(with_mean=False)

In [16]:
# Learn the per-feature scale from the training matrix and apply it to that same matrix.
Dtrain_scaled = sclr.fit_transform(Dtrain)

In [17]:
# Apply the scaling that was learned from the training data. Calling
# fit_transform here would re-estimate the scale from the test set, leaking
# test statistics and putting train and test features on different scales.
Dtest_scaled = sclr.transform(Dtest)

In [18]:
# Alpha that gave the best F1 score for the bag-of-words features
# (presumably found in an earlier hyper-parameter search — not shown here).
BEST_ALPHA_BOW = 0.167683
# C is an inverse regularization strength, whereas alpha is a direct one,
# so the reciprocal of alpha is used as the SVM's C.
c = 1 / BEST_ALPHA_BOW

In [19]:
from sklearn.svm import SVC

In [36]:
# RBF-kernel SVM. Training is capped at max_iter=1500 solver iterations to keep
# the training time manageable, so the fit may stop before it has converged.
# NOTE(review): gamma=1000 is a very large RBF coefficient. The test metrics
# recorded in this notebook (recall 1.0, precision equal to accuracy) suggest
# that every review is predicted positive — consider revisiting gamma.
clf = SVC(random_state=42, kernel='rbf', verbose=3, C=c, max_iter=1500, gamma=1000)

In [37]:
# Training inputs: the scaled bag-of-words matrix and binary targets
# (1 for 'positive' reviews, 0 for anything else).
features = Dtrain_scaled
labels = (data_train.Polarity == 'positive').astype('int64').values

In [38]:
# Fit the SVM on the scaled training features and their binary labels.
clf.fit(features, labels)

[LibSVM]



SVC(C=5.963633761323449, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1000, kernel='rbf',
  max_iter=1500, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=3)

In [39]:
xTrain = Dtest_scaled
actuals = data_test.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [40]:
preds = clf.predict(xTrain)

In [41]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [42]:
from prettytable import PrettyTable

# Score the test-set predictions against the ground truth; the positive class
# (label 1) is the one precision, recall and F1 are reported for.
acs, pre, rec, f1 = (
    metric(actuals, preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Summarise the scores in a one-row table labelled with the vectorization used.
x = PrettyTable(['Vectorization', 'Accuracy', 'Precision', 'Recall', 'F1'])
x.add_row(['BOW', acs, pre, rec, f1])
print(x)

+---------------+--------------------+--------------------+--------+--------------------+
| Vectorization |      Accuracy      |     Precision      | Recall |         F1         |
+---------------+--------------------+--------------------+--------+--------------------+
|      BOW      | 0.8253689259950932 | 0.8253689259950932 |  1.0   | 0.9043310798611808 |
+---------------+--------------------+--------------------+--------+--------------------+
