In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
%config InlineBackend.figure_format = 'retina'

In [19]:
!ls ..

Amazon-Review----Logistic-Regression  cert	 Processed Datasets.zip
Amazon-Review---Naive-Bayes	      Databases  Tools
anaconda3			      __MACOSX


## 1. Loading the dataset

In [20]:
with sqlite3.connect('../Databases/reviewsV1.db') as conn:
    data = pd.read_sql_query('SELECT * FROM Review', conn)

In [21]:
data.drop('index', inplace=True, axis=1)
data = data[data.index != 258456]

## 2. Time Based Splitting

In [22]:
data.sort_values(by='Time', inplace=True)
data.reset_index(drop=True, inplace=True)

In [23]:
TRAIN_SIZE = int(data.shape[0] * 0.7)
TEST_SIZE = data.shape[0] - TRAIN_SIZE

In [24]:
TRAIN_SIZE

254882

In [25]:
TEST_SIZE

109236

In [26]:
data_train = data[0: TRAIN_SIZE]
data_test = data[TRAIN_SIZE:]

#### 2.1 Check if the Splitting was performed properly

In [27]:
assert(data_train.shape[0] == TRAIN_SIZE)
assert(data_test.shape[0] == TEST_SIZE)

In [28]:
assert(data.Time.max() == data_test.Time.reset_index(drop=True)[TEST_SIZE -1])

In [29]:
assert(data.Time.min() == data_train.Time.reset_index(drop=True)[0])


## Training Word2Vec Model on data_train

#### Preparing a corpus to create a Word2Vec model

In [30]:
corpus = [review.split() for review in data_train.Text.values]


In [31]:
assert(len(corpus) == TRAIN_SIZE)

In [32]:
print(data.Text.values[0])
print("*****************************************************************")
print(corpus[0])

special acting effects movie well written everything delighted view chose beetlejuice
*****************************************************************
['special', 'acting', 'effects', 'movie', 'well', 'written', 'everything', 'delighted', 'view', 'chose', 'beetlejuice']


#### Training Word2Vec on train data

In [33]:
import multiprocessing
from gensim.models.word2vec import Word2Vec
params = {
    'size': 100,
    'min_count': 5, 
    'workers': max(1, multiprocessing.cpu_count()),}
model = Word2Vec(corpus, **params)

In [34]:
model.wv.most_similar('disappointed')

[('sadly', 0.8381236791610718),
 ('calling', 0.7918726205825806),
 ('earlier', 0.7881958484649658),
 ('writing', 0.7853830456733704),
 ('identical', 0.7839192152023315),
 ('description', 0.781782329082489),
 ('pictured', 0.7806835174560547),
 ('filled', 0.7772977352142334),
 ('suppliers', 0.7768746018409729),
 ('returning', 0.7756742238998413)]

In [35]:
model.wv.most_similar('love')

[('lots', 0.6638856530189514),
 ('enjoy', 0.6352674961090088),
 ('excellent', 0.6208349466323853),
 ('unlike', 0.6117261052131653),
 ('beat', 0.6106811761856079),
 ('enjoying', 0.6009395122528076),
 ('enough', 0.5983511209487915),
 ('skeptical', 0.5975606441497803),
 ('gotten', 0.5942325592041016),
 ('everyone', 0.592284619808197)]

## Creating our feature sets

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Fitting TFIDF vectorizer on train data

In [37]:
features = data_train.Text
labels = data_train.Polarity

In [40]:
tfidf = TfidfVectorizer(ngram_range=(1, 1))
tf_idf_vect = tfidf.fit_transform(features.values)
tfidf_feat = tfidf.get_feature_names()

In [41]:
tf_idf_vect.get_shape()

(254882, 80733)

In [43]:
tfidf_feat[80000:80010]

['yard',
 'yardbird',
 'yardbirds',
 'yardin',
 'yards',
 'yardstick',
 'yare',
 'yarfing',
 'yarks',
 'yarn']

In [44]:
features[0]


'special acting effects movie well written everything delighted view chose beetlejuice'

In [45]:
tf_idf_vect[0, tfidf_feat.index('special')]

0.23001252750185927

#### Creating a hashmap to find the index of the tfidf vector for a review

In [46]:
z = pd.DataFrame(tfidf_feat).reset_index()
z.columns = ['index', 'word']
z = z.set_index('word')
zd = z.to_dict()['index']

In [47]:
%%timeit
zd['well']

54.8 ns ± 0.675 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [48]:
from tqdm import tqdm_notebook as tqdm

In [49]:
def process_tfidf_word2vec(corpus, table):
    global review_vec
    global rejected
    row = 0
    for doc in tqdm(corpus):
        review = [model.wv[word] for word in doc 
                  if model.wv.__contains__(word)]
        review = np.array(review)
        if review.shape[0] == 0:
#             print(doc)
#             print(review)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        tfidfs = [tf_idf_vect[row, table[word]] for word in doc 
                  if model.wv.__contains__(word)]
        tfidfs = np.array(tfidfs)
        if tfidfs.shape[0] == 0:
#             print(doc)
#             print(tfidfs)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        denominator = tfidfs.sum()
        if denominator == 0.0:
#             print(doc)
#             print(tfidfs)
#             print(row)
            rejected.append(row)
            row += 1
            continue
        numerator = review * tfidfs.reshape(review.shape[0], 1)

        all_vecs = numerator / denominator
        avg = np.sum(review, axis=0)
        avg_list = avg.tolist()
        review_vec.append(avg_list)
        row += 1

#### Creating training data

In [50]:
review_vec = []
rejected = []

In [51]:
process_tfidf_word2vec(corpus, zd)

HBox(children=(IntProgress(value=0, max=254882), HTML(value='')))




In [52]:
rejected

[]

In [53]:
assert(len(review_vec) == TRAIN_SIZE - len(rejected))

In [54]:
Dtrain = pd.DataFrame(review_vec)

In [55]:
Dtrain['Label'] = data_train.Polarity.values

In [56]:
Dtrain.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,0.124132,0.690627,0.186075,-2.972061,0.559061,-1.947294,-0.898517,-3.268192,0.033878,-6.211611,...,-0.556352,-3.736009,6.305069,2.817507,-2.745073,-2.30224,-7.083508,-6.108217,2.363632,positive
1,-1.297451,-3.487176,-1.900977,2.923317,4.703219,1.343819,-3.89817,1.089229,-3.194784,0.130931,...,2.290709,2.017211,0.717947,-3.253783,-0.422405,-1.484561,-0.250539,2.674775,-0.388883,positive
2,-1.409702,2.63023,-0.170995,-3.193642,0.842995,-0.501407,-5.79572,-2.877341,7.527938,-0.784455,...,-2.659028,2.648695,8.567437,-0.111613,1.176608,-2.064354,2.99944,-11.611585,3.155114,positive
3,-0.027507,-0.289149,0.363027,0.346066,-0.557217,1.13109,-2.861841,-0.892604,-3.300861,-3.033913,...,-0.456559,-1.709518,2.656567,-6.408174,4.982338,-4.777662,-0.461821,-3.159812,1.508133,positive
4,1.1917,9.968606,-2.060853,4.582136,-2.336024,6.745582,-1.528853,-11.381527,-0.101736,0.389369,...,-3.930562,-3.888461,3.900736,-4.855195,7.468403,-13.492609,-1.323302,-8.756943,2.113569,negative


#### Fitting TFIDF vectorizer on test data

In [58]:
features = data_test.Text.reset_index(drop=True)
labels = data_test.Polarity.reset_index(drop=True)

tfidf = TfidfVectorizer(ngram_range=(1, 1))
tf_idf_vect = tfidf.fit_transform(features.values)
tfidf_feat = tfidf.get_feature_names()

tf_idf_vect.get_shape()

(109236, 53046)

In [59]:
tfidf_feat[53000:53010]

['ziyad',
 'zoid',
 'zojirushi',
 'zojorushi',
 'zojurishi',
 'zoku',
 'zola',
 'zombie',
 'zombies',
 'zomg']

In [60]:
features[0]

'stop semi could worth chips extra like husband taste ghirardelli sweet eating chocolate'

In [61]:
tf_idf_vect[0, tfidf_feat.index('chips')]

0.29243409746031235

#### Creating a hashmap to find the index of the tfidf vector for a review

In [62]:
z = pd.DataFrame(tfidf_feat).reset_index()
z.columns = ['index', 'word']
z = z.set_index('word')
zd = z.to_dict()['index']

#### Creating testing data

In [63]:
review_vec = []
rejected = []
corpus = [review.split() for review in features]
process_tfidf_word2vec(corpus, zd)

HBox(children=(IntProgress(value=0, max=109236), HTML(value='')))




In [64]:
rejected

[]

In [65]:
assert(len(review_vec) == TEST_SIZE)

In [66]:
Dtest = pd.DataFrame(review_vec)
Dtest['Label'] = labels

In [67]:
Dtest.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,-6.67402,-9.341356,-11.094802,-2.116736,-2.97314,-9.272035,-4.50983,0.355996,4.096109,-8.820267,...,-3.871864,-4.205803,-7.584278,-7.21555,-2.532655,6.771099,11.56401,-4.61953,3.81234,positive
1,-19.934109,-3.250811,-1.080076,5.138292,-6.326931,3.563781,0.180514,-18.293318,-3.321959,-1.347359,...,-8.987338,-3.428568,4.603112,-17.550989,-3.623773,7.290879,5.91129,-15.283457,-2.971651,positive
2,-6.213966,6.137169,-2.930566,-5.255229,-7.922285,-2.661392,-2.43847,-7.05676,12.457644,-2.482828,...,-12.839907,-2.039152,-2.395099,0.164565,1.596511,6.288093,11.616546,-2.045568,-7.658936,positive
3,-0.904325,5.048011,-8.070699,-3.871047,-7.269782,-7.707406,1.666549,1.454569,11.073346,2.123173,...,-8.268493,-4.135805,4.004866,12.495537,2.461176,1.826005,0.449489,-15.097639,4.947555,positive
4,-17.84609,14.703689,-12.341253,-0.058647,-4.254901,-17.472908,-19.519054,-15.043742,13.908079,-13.646329,...,-16.335865,3.962137,17.734367,0.080899,0.832742,9.426176,9.454819,-19.845797,23.77376,positive


## Scaling the data

In [68]:
from sklearn.preprocessing import StandardScaler

In [69]:
sclr = StandardScaler(copy=True)

In [70]:
Dtrain.drop(['Label'], inplace=True, axis=1)

Dtest.drop(['Label'], inplace=True, axis=1)

Dtrain_scaled = sclr.fit_transform(Dtrain)

Dtest_scaled = sclr.fit_transform(Dtest)

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [72]:
params = {
    'C': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]
}

In [73]:
data_train.Polarity.value_counts()

positive    216859
negative     38023
Name: Polarity, dtype: int64

As we can see that it is highly imbalanced dataset, we can use class_weights to manage that

In [74]:
216859 // 38023

5

## Fitting the Grid

In [101]:
estimator = LogisticRegression(random_state=42, class_weight={0: 5, 1: 1}, solver='saga')

In [102]:
estimator

LogisticRegression(C=1.0, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

In [103]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [104]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [105]:
features.shape


(254882, 100)

In [106]:
len(labels)


254882

In [107]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.2s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   8.2s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.8s
[CV] C=0.001 ...

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   45.2s


[CV] ........................................... C=0.01, total=  14.8s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  14.2s
[CV] ........................................... C=0.01, total=  16.3s
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  15.1s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  14.6s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  15.4s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  16.6s
[CV] C=0.1 ...........................................................
[CV] .

[CV] ........................................ C=10000.0, total=  17.5s
[CV] ........................................ C=10000.0, total=  15.8s
[CV] ........................................ C=10000.0, total=  15.6s
[CV] ........................................ C=10000.0, total=  15.4s
[CV] ........................................ C=10000.0, total=  15.3s
[CV] ........................................ C=10000.0, total=  14.2s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  3.0min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'accuracy', 'f1', 'precision'}, verbose=2)

In [108]:
from collections import defaultdict

In [109]:
def get_summary(grid):
    parms = list(grid.param_grid.keys())
    columns = ['Iter #']
    scorings = grid.scoring
    for scoring in scorings:
        for typ in ['train', 'test']:
            columns.append(f'{typ}_{scoring}')
    columns.extend(['fit_time', 'score_time'])
    for param in grid.param_grid.keys():
        columns.append(param)
    df = pd.DataFrame(columns=columns)
    cv_res = grid.cv_results_
    for col in columns[1:-len(parms)]:
        df[col] = cv_res['mean_' + col]
    items = 1
    for k, v in grid.param_grid.items():
        items *= len(v)
    df['Iter #'] = np.array(range(items)) + 1
    res = defaultdict(list)
    for each in grid.cv_results_['params']:
        for p in parms:
            res[p].append(each[p])
    for k, v in res.items():
        df[k] = v
    return df

In [110]:
get_summary(grid)

Unnamed: 0,Iter #,train_recall,test_recall,train_accuracy,test_accuracy,train_f1,test_f1,train_precision,test_precision,fit_time,score_time,C
0,1,0.841958,0.841893,0.835128,0.834998,0.896799,0.896672,0.959283,0.959249,8.67393,0.113013,0.0001
1,2,0.839364,0.838942,0.834608,0.834111,0.896221,0.895852,0.96134,0.961205,12.27789,0.068625,0.001
2,3,0.838597,0.838162,0.834298,0.833692,0.895961,0.895532,0.96175,0.961487,14.886162,0.062169,0.01
3,4,0.838481,0.837978,0.834223,0.833582,0.895906,0.895451,0.961774,0.96154,16.083658,0.066572,0.1
4,5,0.838462,0.837978,0.834211,0.833582,0.895897,0.89545,0.96178,0.96154,15.715493,0.075494,1.0
5,6,0.83846,0.837982,0.83421,0.833582,0.895896,0.895451,0.961781,0.961535,15.964543,0.068058,10.0
6,7,0.83846,0.837982,0.83421,0.833582,0.895897,0.895451,0.961781,0.961535,16.473803,0.057097,100.0
7,8,0.83846,0.837982,0.83421,0.833582,0.895897,0.895451,0.961781,0.961535,15.984305,0.076914,1000.0
8,9,0.83846,0.837982,0.83421,0.833582,0.895897,0.895451,0.961781,0.961535,15.655957,0.059825,10000.0


In [111]:
grid.best_estimator_

LogisticRegression(C=0.0001, class_weight={0: 5, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

Stange the class weights didn't imporove the recall and f1 score but it drastically improved the precision. Lets try decresing the penalty of negative class see if it improves

In [113]:
estimator = LogisticRegression(random_state=42, class_weight={0: 3, 1: 1}, solver='saga')

In [114]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [115]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.3s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.3s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.9s
[CV] C=0.001 ...

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   50.1s


[CV] ........................................... C=0.01, total=  17.4s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  18.0s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  18.2s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  18.2s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  18.1s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  18.5s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  20.1s
[CV] C=0.1 ...........................................................
[CV] .

[CV] ........................................ C=10000.0, total=  19.1s
[CV] ........................................ C=10000.0, total=  18.7s
[CV] ........................................ C=10000.0, total=  20.1s
[CV] ........................................ C=10000.0, total=  18.0s
[CV] ........................................ C=10000.0, total=  16.6s
[CV] ........................................ C=10000.0, total=  18.1s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  3.5min finished


GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight={0: 3, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'accuracy', 'f1', 'precision'}, verbose=2)

In [116]:
get_summary(grid)

Unnamed: 0,Iter #,train_recall,test_recall,train_accuracy,test_accuracy,train_f1,test_f1,train_precision,test_precision,fit_time,score_time,C
0,1,0.916767,0.9166,0.876595,0.876358,0.926694,0.926528,0.936838,0.936789,8.529412,0.128212,0.0001
1,2,0.905421,0.905077,0.873878,0.873408,0.924334,0.924021,0.944054,0.943892,13.508412,0.075851,0.001
2,3,0.902713,0.902508,0.872734,0.872376,0.923488,0.923248,0.945243,0.945078,17.967196,0.065255,0.01
3,4,0.902276,0.901946,0.872497,0.872047,0.923323,0.923021,0.945374,0.945218,19.205624,0.077475,0.1
4,5,0.902232,0.901927,0.872474,0.872043,0.923307,0.923017,0.945389,0.945231,19.375408,0.075823,1.0
5,6,0.902222,0.901932,0.872466,0.872047,0.923301,0.92302,0.945389,0.945231,19.536902,0.105512,10.0
6,7,0.902221,0.901927,0.872465,0.872043,0.923301,0.923017,0.945389,0.945231,19.664502,0.083058,100.0
7,8,0.902221,0.901927,0.872465,0.872043,0.923301,0.923017,0.945389,0.945231,19.985861,0.077017,1000.0
8,9,0.902221,0.901927,0.872465,0.872043,0.923301,0.923017,0.945389,0.945231,18.953875,0.060605,10000.0


In [117]:
grid.best_estimator_

LogisticRegression(C=0.0001, class_weight={0: 3, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=42,
          solver='saga', tol=0.0001, verbose=0, warm_start=False)

Let's also try with no class weights but straitified cross validations

In [118]:
estimator = LogisticRegression(random_state=42, solver='saga')

In [119]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=10,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [120]:
from sklearn.model_selection import StratifiedKFold

In [122]:
skf = StratifiedKFold(n_splits=10)  # For better distrbution of class label among each cv folds

In [123]:
grid = GridSearchCV(estimator=estimator,
                    param_grid=params,
                    scoring={'accuracy', 'precision', 'f1', 'recall'},
                    refit='f1', # Because we are using multiple evaluation metrics
                    cv=skf,
                    return_train_score=True,
                    verbose=2,
                    n_jobs=8)  ## Using only 8 cores out of 12 cores

In [124]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [125]:
grid.fit(features, labels)

Fitting 10 folds for each of 9 candidates, totalling 90 fits
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.1s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.4s
[CV] C=0.0001 ........................................................
[CV] ......................................... C=0.0001, total=   7.7s
[CV] C=0.001 ...

[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:  1.0min


[CV] ........................................... C=0.01, total=  28.5s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  25.9s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  27.0s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  26.3s
[CV] C=0.1 ...........................................................
[CV] ........................................... C=0.01, total=  26.3s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  31.1s
[CV] C=0.1 ...........................................................
[CV] ............................................ C=0.1, total=  30.6s
[CV] C=0.1 ...........................................................
[CV] .

[CV] ........................................ C=10000.0, total=  30.9s
[CV] ........................................ C=10000.0, total=  32.8s
[CV] ........................................ C=10000.0, total=  29.6s
[CV] ........................................ C=10000.0, total=  30.2s
[CV] ........................................ C=10000.0, total=  28.0s
[CV] ........................................ C=10000.0, total=  31.8s


[Parallel(n_jobs=8)]: Done  90 out of  90 | elapsed:  5.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='saga', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=8,
       param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]},
       pre_dispatch='2*n_jobs', refit='f1', return_train_score=True,
       scoring={'recall', 'accuracy', 'f1', 'precision'}, verbose=2)

In [126]:
get_summary(grid)

Unnamed: 0,Iter #,train_recall,test_recall,train_accuracy,test_accuracy,train_f1,test_f1,train_precision,test_precision,fit_time,score_time,C
0,1,0.984919,0.984797,0.876393,0.876249,0.931314,0.931233,0.883243,0.883214,8.061984,0.132092,0.0001
1,2,0.975908,0.975846,0.887593,0.887462,0.936603,0.936532,0.900341,0.900299,15.20497,0.060756,0.001
2,3,0.973016,0.972743,0.890005,0.889706,0.937705,0.937532,0.904867,0.904822,26.974514,0.051922,0.01
3,4,0.972543,0.972295,0.890389,0.890075,0.937881,0.9377,0.905605,0.905524,30.949615,0.067455,0.1
4,5,0.972506,0.97224,0.890437,0.890118,0.937904,0.93772,0.905681,0.905608,31.201063,0.063763,1.0
5,6,0.972501,0.972231,0.890445,0.890114,0.937908,0.937717,0.905692,0.905611,32.059676,0.089651,10.0
6,7,0.972502,0.972231,0.890446,0.890114,0.937909,0.937717,0.905692,0.905611,31.45845,0.07756,100.0
7,8,0.972502,0.972231,0.890446,0.890114,0.937909,0.937717,0.905692,0.905611,31.561316,0.05203,1000.0
8,9,0.972502,0.972231,0.890446,0.890114,0.937909,0.937717,0.905692,0.905611,30.872852,0.046854,10000.0


In [127]:
grid.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

Finally we can see for C = 1, we get the best estimator with 0.937904 as the F1 score

## Saving the best estimator

In [128]:
from sklearn.externals import joblib

In [129]:
joblib.dump(grid.best_estimator_, 'best_w2v_tfidf.model')

['best_w2v_tfidf.model']

## Loading the best estimator

In [130]:
clf = joblib.load('best_w2v_tfidf.model')

In [131]:
xTest = Dtest_scaled
actuals = data_test.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [132]:
preds = clf.predict(xTest)

## Reporting Evaluation metrics for best estimator accoring to F1 score

In [133]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)

In [134]:
!ls

Avg W2V.ipynb	      BOW -- Logistic Regression.ipynb	TFIDF.ipynb
best.model	      README.md				TFIDF W2v.ipynb
best_w2v_tfidf.model  tfidf_best.model


In [136]:
acs = accuracy_score(actuals, preds)
pre = precision_score(actuals, preds)
rec = recall_score(actuals, preds)
f1 = f1_score(actuals, preds)

from prettytable import PrettyTable

x = PrettyTable()

x.field_names = ['Vectorization', 'Accuracy', 'Precision', 'Recall', 'F1']
x.add_row(['BOW', 0.8963253872349775, 0.9077152225405725, 0.9733473824312333, 0.9393863165613174])
x.add_row(['TFIDF', 0.899932256765169, 0.9132666367610086, 0.9709738243123336, 0.9412365536483224])
x.add_row(['Avg. W2v', 0.88492804569922, 0.9978506179473401, 1.0, 0.9320202582549146])
x.add_row(['w2v-TFIDF', acs, pre, rec, f1])

print(x)


+---------------+--------------------+--------------------+--------------------+--------------------+
| Vectorization |      Accuracy      |     Precision      |       Recall       |         F1         |
+---------------+--------------------+--------------------+--------------------+--------------------+
|      BOW      | 0.8963253872349775 | 0.9077152225405725 | 0.9733473824312333 | 0.9393863165613174 |
|     TFIDF     | 0.899932256765169  | 0.9132666367610086 | 0.9709738243123336 | 0.9412365536483224 |
|    Avg. W2v   |  0.88492804569922  | 0.9978506179473401 |        1.0         | 0.9320202582549146 |
|   w2v-TFIDF   | 0.8726518730088981 | 0.883704076497232  | 0.973855556664781  | 0.9265921911526467 |
+---------------+--------------------+--------------------+--------------------+--------------------+


## Sparsity and Latency

In [135]:
features = Dtrain_scaled
labels = data_train.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [137]:
estimators = []

In [138]:
for c in [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]:
    clf = LogisticRegression(penalty='l1', C=c, solver='saga', verbose=2)
    clf.fit(features, labels)
    estimators.append(clf)
    print(f"Done for c={c}")

convergence after 15 epochs took 10 seconds
Done for c=0.0001


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.4s finished


convergence after 20 epochs took 14 seconds
Done for c=0.001


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   13.5s finished


convergence after 34 epochs took 23 seconds
Done for c=0.01


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.2s finished


convergence after 37 epochs took 26 seconds
Done for c=0.1


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.0s finished


convergence after 34 epochs took 25 seconds
Done for c=1.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   25.2s finished


convergence after 31 epochs took 22 seconds
Done for c=10.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.0s finished


convergence after 30 epochs took 23 seconds
Done for c=100.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.5s finished


convergence after 30 epochs took 24 seconds
Done for c=1000.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.0s finished


convergence after 30 epochs took 24 seconds
Done for c=10000.0


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   23.8s finished


In [139]:
latency = pd.DataFrame(columns=['C', 'sparsity', 'latency'])


In [140]:
xTest = Dtest_scaled
actuals = data_test.Polarity.apply(lambda x: 1 if x == 'positive' else 0).values

In [141]:
spr = [np.nonzero(estimator.coef_.ravel())[0].shape[0] for estimator in estimators]
C = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4]

In [142]:
import time

In [143]:
latn = []

In [144]:
for estimator in estimators:
    start = time.clock()
    estimator.predict(xTrain)
    latn.append(time.clock() - start)

In [145]:
latency['C'] = C
latency['sparsity'] = spr
latency['latency'] = latn

In [146]:
latency

Unnamed: 0,C,sparsity,latency
0,0.0001,9,0.739715
1,0.001,49,0.319007
2,0.01,92,0.163876
3,0.1,100,0.167309
4,1.0,100,0.163344
5,10.0,100,0.16215
6,100.0,100,0.164987
7,1000.0,100,0.165414
8,10000.0,100,0.164082


The behavious is bit strange. The sparsity(non zero elements) is incresing and latency is also increasing. This is not expected.