In [1]:
import numpy as np
import pandas as pd
import sqlite3

In [2]:
!ls ../Databases

reviewsV1.db  reviewsV1db  reviewsV2.db  reviewsV3.db


## 1. Loading the dataset

In [3]:
with sqlite3.connect('../Databases/reviewsV1.db') as conn:
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [5]:
data.head()

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Time,Summary,Text,Polarity
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1303862400,Good Quality Dog Food,good stew smells bought looks vitality appreci...,positive
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1346976000,Not as Advertised,small product peanuts arrived labeled jumbo si...,negative
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1219017600,"""Delight"" says it all",treat liberally citrus brother nuts highly cas...,positive
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,1307923200,Cough Medicine,secret ingredient flavor root cherry looking o...,negative
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,1350777600,Great taffy,taffy wide delivery assortment great yummy,positive


In [6]:
data.sort_values(by='Time', inplace=True)
data.reset_index(drop=True, inplace=True)
TRAIN_SIZE = int(data.shape[0] * 0.7)
TEST_SIZE = data.shape[0] - TRAIN_SIZE

In [7]:
TRAIN_SIZE

254883

In [8]:
TEST_SIZE

109236

In [9]:
data_train = data[0: TRAIN_SIZE]
data_test = data[TRAIN_SIZE:]

In [10]:
assert(data_train.shape[0] == TRAIN_SIZE)
assert(data_test.shape[0] == TEST_SIZE)
assert(data.Time.max() == data_test.Time.reset_index(drop=True)[TEST_SIZE -1])
assert(data.Time.min() == data_train.Time.reset_index(drop=True)[0])

## 2. Featurization

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cunt = CountVectorizer(max_features=3000)
cunt.fit(data_train.Text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
Dtrain = cunt.transform(data_train.Text)
Dtrain.get_shape()

(254883, 3000)

In [14]:
Dtest = cunt.transform(data_test.Text)
Dtest.get_shape()

(109236, 3000)

## 3. Scaling the data

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
Dtrain = Dtrain.toarray() # Almost 11 GB of RAM

In [17]:
sclr = StandardScaler(copy=True)

In [18]:
Dtrain_scaled = sclr.fit_transform(Dtrain)  # CAUTION: 2x RAM Usage, copy = False seems broken



In [19]:
del Dtrain ## Free up some RAM

In [20]:
Dtest = Dtest.toarray()

In [21]:
Dtest_scaled = sclr.fit_transform(Dtest)  # CAUTION: 2x RAM Usage, copy = False seems broken



In [22]:
del Dtest ## Free up some RAM

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [24]:
params = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
}

In [25]:
estimator = LogisticRegression(random_state=42)

In [26]:
print(estimator)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


In [27]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2)

In [28]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [29]:
features.shape

(254883, 3000)

In [30]:
len(labels)

254883

In [31]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.4min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s


[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.5min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.5min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.4min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.5min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.4min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.5min
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total= 1.5min
[CV] C

[CV] ........................................... C=10.0, total= 3.4min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.2min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.2min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.2min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.2min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.2min
[CV] C=100.0 .........................................................
[CV] .......................................... C=100.0, total= 3.5min
[CV] C=100.0 .........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed: 273.6min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'f1', 'accuracy', 'precision'}, verbose=2)

In [35]:
from collections import defaultdict

In [36]:
def get_summary(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append(f'{typ}_{scoring}')
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [37]:
get_summary(grid)

Unnamed: 0,Iter #,train_recall,test_recall,train_f1,test_f1,train_accuracy,test_accuracy,train_precision,test_precision,fit_time,score_time,C
0,1,0.976441,0.972968,0.946823,0.944122,0.906684,0.902006,0.918949,0.916985,87.309455,0.39395,0.0001
1,2,0.976319,0.973055,0.950039,0.946812,0.912636,0.906981,0.925138,0.921988,129.000631,0.403928,0.001
2,3,0.974651,0.970842,0.950819,0.947103,0.914216,0.90773,0.928124,0.924535,186.766967,0.546892,0.01
3,4,0.974341,0.970519,0.950913,0.947133,0.914417,0.907817,0.928586,0.924885,197.997197,0.532091,0.1
4,5,0.974301,0.970492,0.950921,0.947147,0.914434,0.907844,0.928636,0.924937,200.108753,0.537229,1.0
5,6,0.974299,0.970492,0.950924,0.947149,0.91444,0.907848,0.928644,0.924941,199.290516,0.578361,10.0
6,7,0.974299,0.970487,0.950924,0.947146,0.91444,0.907844,0.928644,0.92494,198.965694,0.571653,100.0
7,8,0.974299,0.970487,0.950924,0.947146,0.91444,0.907844,0.928644,0.92494,200.596844,0.579638,1000.0
8,9,0.974299,0.970487,0.950924,0.947146,0.91444,0.907844,0.928644,0.92494,200.238887,0.587246,10000.0


In [32]:
grid.best_estimator_

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## 4. Saving the best estimator

In [47]:
from sklearn.externals import joblib

In [48]:
joblib.dump(grid.best_estimator_, 'best.model')

['best.model']

## 5. Loading the best estimator

In [49]:
clf = joblib.load('best.model')

In [50]:
clf.coef_

array([[-0.00953235, -0.00435521,  0.04769258, ...,  0.11586617,
        -0.04707746,  0.02314036]])