In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
%config InlineBackend.figure_format = 'retina'

In [4]:
!ls ..

Amazon-Review----Logistic-Regression  cert	 Processed Datasets.zip
Amazon-Review---Naive-Bayes	      Databases  Tools
anaconda3			      __MACOSX


## 1. Loading the dataset

In [43]:
with sqlite3.connect('../Databases/reviewsV1.db') as conn:
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [44]:
data.drop('index', inplace=True, axis=1)
data = data[data.index != 258456]

## 2. Time Based Splitting

In [45]:
data.sort_values(by='Time', inplace=True)
data.reset_index(drop=True, inplace=True)

In [46]:
TRAIN_SIZE = int(data.shape[0] * 0.7)
TEST_SIZE = data.shape[0] - TRAIN_SIZE

In [47]:
TRAIN_SIZE

254882

In [48]:
TEST_SIZE

109236

In [49]:
data_train = data[0: TRAIN_SIZE]
data_test = data[TRAIN_SIZE:]

#### 2.1 Check if the Splitting was performed properly

In [50]:
assert(data_train.shape[0] == TRAIN_SIZE)
assert(data_test.shape[0] == TEST_SIZE)

In [51]:
assert(data.Time.max() == data_test.Time.reset_index(drop=True)[TEST_SIZE -1])

In [52]:
assert(data.Time.min() == data_train.Time.reset_index(drop=True)[0])


## Training Word2Vec Model on data_train

#### Preparing a corpus to create a Word2Vec model

In [53]:
corpus = [review.split() for review in data_train.Text.values]


In [54]:
assert(len(corpus) == TRAIN_SIZE)

In [55]:
print(data.Text.values[0])
print("*****************************************************************")
print(corpus[0])

special acting effects movie well written everything delighted view chose beetlejuice
*****************************************************************
['special', 'acting', 'effects', 'movie', 'well', 'written', 'everything', 'delighted', 'view', 'chose', 'beetlejuice']


#### Training Word2Vec on train data

In [56]:
import multiprocessing
from gensim.models.word2vec import Word2Vec
params = {
    'size': 100,
    'min_count': 5, 
    'workers': max(1, multiprocessing.cpu_count()),}
model = Word2Vec(corpus, **params)

In [57]:
model.wv.most_similar('disappointed')

[('sadly', 0.8150196671485901),
 ('pictured', 0.7997119426727295),
 ('calling', 0.7924827933311462),
 ('description', 0.7873637080192566),
 ('marked', 0.7853572368621826),
 ('expectations', 0.7799867391586304),
 ('earlier', 0.7767242789268494),
 ('writing', 0.7761337757110596),
 ('returning', 0.7696765065193176),
 ('wrote', 0.7684863805770874)]

In [58]:
model.wv.most_similar('love')

[('lots', 0.6477276086807251),
 ('excellent', 0.638221263885498),
 ('enjoy', 0.6336777806282043),
 ('beat', 0.6081674098968506),
 ('enough', 0.6070799827575684),
 ('creamy', 0.6005898714065552),
 ('remember', 0.5996477007865906),
 ('enjoying', 0.5981705188751221),
 ('unlike', 0.5971493721008301),
 ('hooked', 0.5941672325134277)]

## Creating our feature sets

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Fitting TFIDF vectorizer on train data

In [60]:
features = data_train.Text
labels = data_train.Polarity

In [61]:
tfidf = TfidfVectorizer(ngram_range=(1, 2))
tf_idf_vect = tfidf.fit_transform(features.values)
tfidf_feat = tfidf.get_feature_names()

In [62]:
tf_idf_vect.get_shape()

(254882, 1864296)

In [63]:
tfidf_feat[100000:100010]

['auto ryvita',
 'auto said',
 'auto sealed',
 'auto seaseme',
 'auto selection',
 'auto sent',
 'auto separate',
 'auto service',
 'auto setting',
 'auto ship']

In [64]:
features[0]


'special acting effects movie well written everything delighted view chose beetlejuice'

In [65]:
tf_idf_vect[0, tfidf_feat.index('special')]

0.118903489844112

#### Creating a hashmap to find the index of the tfidf vector for a review

In [66]:
z = pd.DataFrame(tfidf_feat).reset_index()
z.columns = ['index', 'word']
z = z.set_index('word')
zd = z.to_dict()['index']

In [67]:
%%timeit
zd['well']

45.5 ns ± 1.16 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [68]:
from tqdm import tqdm_notebook as tqdm

In [69]:
def process_tfidf_word2vec(corpus, table):
    global review_vec
    global rejected
    row = 0
    for doc in tqdm(corpus):
        review = [model.wv[word] for word in doc 
                  if model.wv.__contains__(word)]
        review = np.array(review)
        if review.shape[0] == 0:
#             print(doc)
#             print(review)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        tfidfs = [tf_idf_vect[row, table[word]] for word in doc 
                  if model.wv.__contains__(word)]
        tfidfs = np.array(tfidfs)
        if tfidfs.shape[0] == 0:
#             print(doc)
#             print(tfidfs)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        denominator = tfidfs.sum()
        if denominator == 0.0:
#             print(doc)
#             print(tfidfs)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        numerator = review * tfidfs.reshape(review.shape[0], 1)

        all_vecs = numerator / denominator
        avg = np.sum(review, axis=0)
        avg_list = avg.tolist()
        review_vec.append(avg_list)
        row += 1

#### Creating training data

In [70]:
review_vec = []
rejected = []

In [71]:
process_tfidf_word2vec(corpus, zd)

HBox(children=(IntProgress(value=0, max=254882), HTML(value='')))




In [72]:
rejected

[]

In [73]:
assert(len(review_vec) == TRAIN_SIZE - len(rejected))

In [74]:
Dtrain = pd.DataFrame(review_vec)

In [75]:
Dtrain['Label'] = data_train.Polarity.values

In [76]:
Dtrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,-3.919083,6.698974,-1.606934,2.167706,-1.569327,-3.441705,-2.344378,3.434829,-3.354669,-6.088552,...,3.079075,0.838742,2.306767,6.72586,-3.526942,13.108291,-10.133601,5.196177,3.032531,positive
1,-2.563767,0.373244,2.188727,0.788589,0.456519,0.843408,-5.964007,-1.396787,-1.616609,2.659016,...,-0.127668,0.461477,-0.251318,0.438959,1.208,4.025536,-4.070363,-2.498171,-0.258417,positive
2,-4.769782,15.742468,2.814626,0.801279,5.544626,1.04307,-2.063164,4.071217,-5.034811,-0.271394,...,4.492488,-6.571737,-5.140901,3.777081,5.010118,17.393679,-9.799546,1.904413,-0.260088,positive
3,-3.174931,1.728552,3.800104,0.411368,8.071153,0.47303,-3.728433,-1.721413,-7.12508,-3.926273,...,5.894567,-3.641558,-7.267379,3.75165,5.668237,13.071115,-6.745919,-1.734539,-1.152124,positive
4,-11.398147,4.173595,17.720469,5.839493,11.931543,-0.80351,-4.28941,-2.61689,-13.540527,-4.709617,...,9.43185,-7.322816,-16.493114,11.26676,5.345065,17.641626,-7.568055,0.206682,-6.126608,negative


#### Fitting TFIDF vectorizer on test data

In [77]:
features = data_test.Text.reset_index(drop=True)
labels = data_test.Polarity.reset_index(drop=True)

tfidf = TfidfVectorizer(ngram_range=(1, 2))
tf_idf_vect = tfidf.fit_transform(features.values)
tfidf_feat = tfidf.get_feature_names()

tf_idf_vect.get_shape()

(109236, 1036883)

In [78]:
tfidf_feat[100000:100010]

['bowl overjoyed',
 'bowl packets',
 'bowl pains',
 'bowl pair',
 'bowl papaya',
 'bowl paranoia',
 'bowl parents',
 'bowl peanut',
 'bowl peculiar',
 'bowl peterson']

In [79]:
features[0]

'stop semi could worth chips extra like husband taste ghirardelli sweet eating chocolate'

In [80]:
tf_idf_vect[0, tfidf_feat.index('chips')]

0.135775381865707

#### Creating a hashmap to find the index of the tfidf vector for a review

In [81]:
z = pd.DataFrame(tfidf_feat).reset_index()
z.columns = ['index', 'word']
z = z.set_index('word')
zd = z.to_dict()['index']

#### Creating testing data

In [82]:
review_vec = []
rejected = []
corpus = [review.split() for review in features]
process_tfidf_word2vec(corpus, zd)

HBox(children=(IntProgress(value=0, max=109236), HTML(value='')))




In [83]:
rejected

[]

In [84]:
assert(len(review_vec) == TEST_SIZE)

In [85]:
Dtest = pd.DataFrame(review_vec)
Dtest['Label'] = labels

In [86]:
Dtest.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,2.488526,20.701178,-8.581582,0.195872,4.09567,4.767016,-3.030686,-5.08141,8.403255,-2.266223,...,3.838959,-5.953161,-8.748277,0.449905,10.146443,1.17659,0.427269,-5.266908,0.472882,positive
1,-5.746807,22.576071,-11.535794,3.042543,2.957444,12.293008,12.64228,-4.397332,12.586325,-0.635215,...,6.59942,7.905776,-36.465202,3.938663,15.171597,13.696743,-14.31051,0.122235,-7.225564,positive
2,6.782385,19.200939,-1.147379,8.869946,-2.341001,10.69712,-3.938172,-1.970506,-3.708237,-9.499657,...,2.247544,-6.149432,-7.272695,-0.844343,5.66786,-1.665368,-7.349421,-6.404699,8.484426,positive
3,3.630759,19.67063,1.481296,8.434503,-3.86217,-0.274631,-1.422816,5.987234,-7.538827,-9.865144,...,-2.878778,-11.124446,4.355464,7.803609,1.964467,6.491103,-13.555345,8.2985,12.504223,positive
4,-5.435309,37.83865,-8.913232,21.145014,4.484749,8.219103,-2.97543,9.300751,-8.307211,-5.729055,...,17.301378,-4.348619,-23.882622,14.147071,16.473051,37.749058,-21.590834,5.346241,18.967165,positive


## Scaling the data

In [87]:
from sklearn.preprocessing import StandardScaler

In [89]:
sclr = StandardScaler(copy=True)

In [91]:
Dtrain.drop(['Label'], inplace=True, axis=1)

In [92]:
Dtest.drop(['Label'], inplace=True, axis=1)

In [93]:
Dtrain_scaled = sclr.fit_transform(Dtrain)

In [94]:
Dtest_scaled = sclr.fit_transform(Dtest)

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [96]:
params = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
}

In [98]:
data_train.Polarity.value_counts()

positive    216859
negative     38023
Name: Polarity, dtype: int64

As we can see that it is highly imbalanced dataset, we can use class_weights to manage that

In [99]:
216859 // 38023

5

In [100]:
estimator = LogisticRegression(random_state=42, class_weight={0: 5, 1: 1}, solver='saga')

In [101]:
estimator

LogisticRegression(C=1.0, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

In [102]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [103]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [104]:
features.shape


(254882, 100)

In [105]:
len(labels)


254882

In [106]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.6s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.5s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   8.3s
[CV] C=0.001 ...

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   49.6s


[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  14.9s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  16.7s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  14.7s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  16.5s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  18.6s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  17.3s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  16.6s
[CV] C

[CV] ........................................ C=10000.0, total=  15.1s
[CV] ........................................ C=10000.0, total=  15.9s
[CV] ........................................ C=10000.0, total=  16.0s
[CV] ........................................ C=10000.0, total=  13.3s
[CV] ........................................ C=10000.0, total=  14.7s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  3.1min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'f1', 'recall', 'precision', 'accuracy'}, verbose=2)

In [107]:
from collections import defaultdict

In [108]:
def get_summary(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append(f'{typ}_{scoring}')
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [109]:
get_summary(grid)

Unnamed: 0,Iter #,train_f1,test_f1,train_recall,test_recall,train_precision,test_precision,train_accuracy,test_accuracy,fit_time,score_time,C
0,1,0.897138,0.896777,0.842333,0.841865,0.95957,0.959531,0.835658,0.83519,8.903398,0.119493,0.0001
1,2,0.896312,0.895947,0.839583,0.839209,0.961262,0.961065,0.834727,0.834221,12.243578,0.067481,0.001
2,3,0.895802,0.89547,0.838566,0.838227,0.961423,0.961252,0.834021,0.833566,15.762055,0.073786,0.01
3,4,0.895741,0.895359,0.838446,0.838019,0.961441,0.961269,0.833936,0.833409,16.89213,0.073409,0.1
4,5,0.895735,0.89535,0.838432,0.838001,0.961446,0.961273,0.833929,0.833397,16.810234,0.088442,1.0
5,6,0.895733,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,16.364698,0.07331,10.0
6,7,0.895733,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,17.083408,0.066737,100.0
7,8,0.895732,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,16.663428,0.098894,1000.0
8,9,0.895732,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,15.791569,0.05804,10000.0


In [110]:
grid.best_estimator_

LogisticRegression(C=0.0001, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

Stange the class weights didn't imporove the recall and f1 score but it drastically improved the precision. Lets try decresing the penalty of negative class see if it improves

In [111]:
estimator = LogisticRegression(random_state=42, class_weight={0: 3, 1: 1}, solver='saga')

In [112]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.9s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.5s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.8s
[CV] C=0.001 ...

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   47.7s


[CV] ........................................... C=0.01, total=  16.6s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  16.2s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  16.5s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  15.9s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  14.4s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  16.8s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  16.8s
[CV] C=0.1 ...........................................................
[CV] .

[CV] ........................................ C=10000.0, total=  16.4s
[CV] ........................................ C=10000.0, total=  16.1s
[CV] ........................................ C=10000.0, total=  16.2s
[CV] ........................................ C=10000.0, total=  15.9s
[CV] ........................................ C=10000.0, total=  14.7s
[CV] ........................................ C=10000.0, total=  13.5s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  3.1min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'f1', 'recall', 'precision', 'accuracy'}, verbose=2)

In [113]:
get_summary(grid)

Unnamed: 0,Iter #,train_f1,test_f1,train_recall,test_recall,train_precision,test_precision,train_accuracy,test_accuracy,fit_time,score_time,C
0,1,0.897138,0.896777,0.842333,0.841865,0.95957,0.959531,0.835658,0.83519,8.935793,0.118431,0.0001
1,2,0.896312,0.895947,0.839583,0.839209,0.961262,0.961065,0.834727,0.834221,12.219735,0.070869,0.001
2,3,0.895802,0.89547,0.838566,0.838227,0.961423,0.961252,0.834021,0.833566,15.906968,0.077983,0.01
3,4,0.895741,0.895359,0.838446,0.838019,0.961441,0.961269,0.833936,0.833409,16.322985,0.068721,0.1
4,5,0.895735,0.89535,0.838432,0.838001,0.961446,0.961273,0.833929,0.833397,16.294483,0.082232,1.0
5,6,0.895733,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,16.095374,0.078157,10.0
6,7,0.895733,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,16.489736,0.066706,100.0
7,8,0.895732,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,16.680834,0.080976,1000.0
8,9,0.895732,0.895358,0.838429,0.83801,0.961444,0.961279,0.833925,0.833409,15.692518,0.069746,10000.0


In [114]:
grid.best_estimator_

LogisticRegression(C=0.0001, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

Let's also try with no class weights but straitified cross validations

In [115]:
estimator = LogisticRegression(random_state=42, solver='saga')

In [116]:
from sklearn.model_selection import StratifiedKFold

In [118]:
skf = StratifiedKFold(n_splits=10)

In [119]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=skf,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [120]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [121]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.4s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.4s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   8.3s
[CV] ...........

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  1.0min


[CV] ........................................... C=0.01, total=  29.6s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  28.4s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  28.9s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  25.6s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  33.4s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  32.3s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  33.3s
[CV] C=0.1 ...........................................................
[CV] .

[CV] ........................................ C=10000.0, total=  35.8s
[CV] ........................................ C=10000.0, total=  32.9s
[CV] ........................................ C=10000.0, total=  32.0s
[CV] ........................................ C=10000.0, total=  31.0s
[CV] ........................................ C=10000.0, total=  29.8s
[CV] ........................................ C=10000.0, total=  28.0s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  5.6min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='saga', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'f1', 'recall', 'precision', 'accuracy'}, verbose=2)

In [122]:
get_summary(grid)

Unnamed: 0,Iter #,train_f1,test_f1,train_recall,test_recall,train_precision,test_precision,train_accuracy,test_accuracy,fit_time,score_time,C
0,1,0.931444,0.931336,0.985193,0.985101,0.883257,0.883154,0.87661,0.876409,8.069294,0.153777,0.0001
1,2,0.936629,0.936528,0.975514,0.975376,0.900725,0.900693,0.887688,0.887509,15.173029,0.080283,0.001
2,3,0.937527,0.93724,0.972654,0.972259,0.904849,0.904695,0.88971,0.889211,29.016176,0.072158,0.01
3,4,0.937629,0.937447,0.972175,0.971927,0.905454,0.90537,0.889956,0.889639,33.197467,0.07476,0.1
4,5,0.937637,0.937458,0.97212,0.971899,0.905517,0.905415,0.889978,0.889663,33.639132,0.055022,1.0
5,6,0.937637,0.937457,0.972113,0.971894,0.905522,0.905418,0.889979,0.889663,33.517958,0.097441,10.0
6,7,0.937637,0.937457,0.972112,0.971894,0.905522,0.905418,0.889978,0.889663,33.617094,0.068962,100.0
7,8,0.937636,0.937457,0.972112,0.971894,0.905522,0.905418,0.889977,0.889663,33.722761,0.074197,1000.0
8,9,0.937636,0.937457,0.972112,0.971894,0.905522,0.905418,0.889977,0.889663,32.952096,0.083683,10000.0


In [123]:
grid.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

Finally we can see for C = 1, we get the best estimator with 0.937458 as the F1 score

## Saving the best estimator

In [124]:
from sklearn.externals import joblib

In [125]:
joblib.dump(grid.best_estimator_, 'best_w2v_tfidf.model')

['best_w2v_tfidf.model']

## Loading the best estimator

In [126]:
clf = joblib.load('best_w2v_tfidf.model')

In [128]:
xTest = Dtest_scaled
actuals = data_test.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [129]:
preds = clf.predict(xTest)

## Reporting Evaluation metrics for best estimator accoring to F1 score

In [130]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [134]:
acs = accuracy_score(actuals, preds)
pre = precision_score(actuals, preds)
rec = recall_score(actuals, preds)
f1 = f1_score(actuals, preds)

from prettytable import PrettyTable

x = PrettyTable()

x.field_names = ['Featurisation', 'test_accuracy', 'test_precision', 'test_recall', 'test_f1']
x.add_row(['BOW', 0.8963253872349775, 0.9077152225405725, 0.9733473824312333, 0.9393863165613174])
x.add_row(['TFIDF', 0.899932256765169, 0.9132666367610086, 0.9709738243123336, 0.9412365536483224])
x.add_row(['w2v-TFIDF', acs, pre, rec, f1])

print(x)


+---------------+--------------------+--------------------+--------------------+--------------------+
| Featurisation |   test_accuracy    |   test_precision   |    test_recall     |      test_f1       |
+---------------+--------------------+--------------------+--------------------+--------------------+
|      BOW      | 0.8963253872349775 | 0.9077152225405725 | 0.9733473824312333 | 0.9393863165613174 |
|     TFIDF     | 0.899932256765169  | 0.9132666367610086 | 0.9709738243123336 | 0.9412365536483224 |
|   w2v-TFIDF   | 0.873347614339595  | 0.8840710993014876 | 0.9742992468359345 | 0.9269947811954176 |
+---------------+--------------------+--------------------+--------------------+--------------------+


## Sparsity and Latency

In [135]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [137]:
estimators = []

In [138]:
for c in [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]:
    clf = LogisticRegression(penalty='l1', C=c, solver='saga', verbose=2)
    clf.fit(features, labels)
    estimators.append(clf)
    print(f"Done for c={c}")

convergence after 15 epochs took 10 seconds
Done for c=0.0001


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s finished


convergence after 20 epochs took 14 seconds
Done for c=0.001


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s finished


convergence after 34 epochs took 23 seconds
Done for c=0.01


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s finished


convergence after 37 epochs took 26 seconds
Done for c=0.1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.0s finished


convergence after 34 epochs took 25 seconds
Done for c=1.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.2s finished


convergence after 31 epochs took 22 seconds
Done for c=10.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s finished


convergence after 30 epochs took 23 seconds
Done for c=100.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.5s finished


convergence after 30 epochs took 24 seconds
Done for c=1000.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.0s finished


convergence after 30 epochs took 24 seconds
Done for c=10000.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.8s finished


In [139]:
latency = pd.DataFrame(columns=['C', 'sparsity', 'latency'])


In [140]:
xTest = Dtest_scaled
actuals = data_test.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [141]:
spr = [np.nonzero(estimator.coef_.ravel())[0].shape[0] for estimator in estimators]
C = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

In [142]:
import time

In [143]:
latn = []

In [144]:
for estimator in estimators:
    start = time.clock()
    estimator.predict(xTrain)
    latn.append(time.clock() - start)

In [145]:
latency['C'] = C
latency['sparsity'] = spr
latency['latency'] = latn

In [146]:
latency

Unnamed: 0,C,sparsity,latency
0,0.0001,9,0.739715
1,0.001,49,0.319007
2,0.01,92,0.163876
3,0.1,100,0.167309
4,1.0,100,0.163344
5,10.0,100,0.16215
6,100.0,100,0.164987
7,1000.0,100,0.165414
8,10000.0,100,0.164082


The behavious is bit strange. The sparsity(non zero elements) is incresing and latency is also increasing. This is not expected.