In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import pickle
import copy

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
"""Word2Vec experiment"""
from gensim.models import KeyedVectors

# Location of the pretrained word2vec binary, relative to the notebook's
# working directory.
W2V_PATH = './GoogleNews-vectors-negative300.bin'

# Load the pretrained embedding matrix in word2vec binary format; the result
# is indexed by word further down to look up each token's vector.
google_w2v = KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [9]:
"""Loading the data"""
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load './label_tag_data.p' if it was produced by a trusted pipeline.
# The context manager guarantees the file handle is closed once loading is
# done (the previous `pickle.load(open(...))` form left it open).
with open('./label_tag_data.p', 'rb') as data_file:
    data = pickle.load(data_file)

# Target labels, materialised as a plain Python list so they can be indexed
# by integer position.
# NOTE(review): `data` is presumably a pandas DataFrame with `pi` and `tag_df`
# columns (it exposes `.shape`, `.pi` and `.tag_df`) — confirm against the
# code that wrote the pickle.
y = list(data.pi)

In [10]:
"""Computing the average embedding of the tweets"""
# Tags of the tokens that contribute to a tweet's embedding.
# NOTE(review): presumably adjective / adverb / noun / verb / @-mention tags of
# the tweet POS tagger — confirm against the tagger's tag set.
KEPT_TAGS = ['A', 'R', 'N', 'V', '@']

# Dimensionality of the loaded vectors (300 for the GoogleNews model); read
# from the model so the all-zero fallback always matches the real embeddings.
EMBEDDING_DIM = google_w2v.vector_size

# X[k] is the feature vector of the k-th tweet: the element-wise mean of the
# word vectors of its kept tokens, or a zero vector when none of them is in the
# embedding vocabulary.
X = []
for k in tqdm(range(data.shape[0])):
    tagged = data.tag_df.iloc[k]
    kept = tagged[tagged.tag.isin(KEPT_TAGS)]

    vectors = []
    for word in kept.word:
        try:
            vectors.append(google_w2v[word])
        except KeyError:
            # Out-of-vocabulary word: it simply does not contribute to the
            # average. Any other exception is a real problem and is no longer
            # silently swallowed.
            continue

    if vectors:
        embedding = np.mean(vectors, axis=0)
    else:
        embedding = np.zeros(EMBEDDING_DIM)
    X.append(list(embedding))

# One feature vector per row of `data`.
assert len(X) == data.shape[0]

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [11]:
"""Balanced split for cross validation"""

"""Retrieving the positive and negative indexes"""
def _chunk_into_folds(indices, n_folds):
    """Partition ``indices`` into ``n_folds`` consecutive, disjoint chunks.

    The first ``n_folds - 1`` chunks each hold ``len(indices) // n_folds``
    items; the last chunk additionally absorbs the remainder, so every index
    ends up in exactly one chunk.

    Parameters
    ----------
    indices : list
        Sequence to partition (already shuffled by the caller).
    n_folds : int
        Number of chunks to produce; must be at least 1.

    Returns
    -------
    list of list
        The ``n_folds`` chunks, in order.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError('n_folds must be at least 1, got {}'.format(n_folds))
    fold_size = len(indices) // n_folds
    folds = [indices[i * fold_size:(i + 1) * fold_size] for i in range(n_folds - 1)]
    folds.append(indices[(n_folds - 1) * fold_size:])
    # Completeness check: no index lost or duplicated by the slicing.
    assert sum(len(fold) for fold in folds) == len(indices)
    return folds


# Positions of the negative (label 0) and positive (label 1) samples.
neg_index = [k for k, label in enumerate(y) if label == 0]
pos_index = [k for k, label in enumerate(y) if label == 1]

# Shuffle each class separately. The global NumPy seed is set (rather than a
# private generator) so that the shuffles, and anything downstream that draws
# from the global generator, stay reproducible exactly as before.
np.random.seed(seed=0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of cross-validation folds.
cv = 10

# Chunk each class on its own so every fold keeps roughly the overall
# positive/negative ratio.
neg_index_list = _chunk_into_folds(neg_index, cv)
pos_index_list = _chunk_into_folds(pos_index, cv)

In [12]:
"""Performing cross-validation for logistic regression on Word2Vec embedding representation"""

"""Initializing the list of outputs, predictions and probabilities to computed CV-ly"""
# Ground truth, hard predictions and positive-class probabilities pooled over
# all folds, kept so global cross-validated performance can be computed after
# the loop.
Y = []
Y_Pred = []
Y_Proba = []

# Per-fold metric values (averaged after the loop).
precision_list = []
recall_list = []
roc_auc_list = []
f1_list = []

# Per-fold fitted parameters, kept for stability evaluation.
biases = []
weights = []

# Loop-invariant: the full index set of each class, from which the held-out
# fold is removed to obtain the training indices.
all_neg = set(neg_index)
all_pos = set(pos_index)

# Caveat carried over from the original notes: the feature selection was
# performed on the whole dataset, which may be a little biased towards choosing
# the right features. This effect is neglected during this test.
# NOTE(review): no feature-selection step is visible in this notebook (the
# features are averaged word embeddings) — confirm whether the caveat applies.
for k in range(cv):
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    # Fold k of each class is held out for testing; the rest of that class is
    # used for training.
    neg_test = [X[i] for i in neg_index_list[k]]
    pos_test = [X[i] for i in pos_index_list[k]]
    neg_train = [X[i] for i in all_neg.difference(neg_index_list[k])]
    pos_train = [X[i] for i in all_pos.difference(pos_index_list[k])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    # Labels are rebuilt from the class each sample came from.
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    # Large C means very weak regularisation.
    model = LogisticRegression(C=1e4)
    model.fit(X_train, y_train)

    print('Evaluation and storage of model parameters\n')
    # Keep this fold's parameters. The problem is binary, so the model has a
    # single intercept and a single row of coefficients. Previously these two
    # lists were declared but never filled.
    biases.append(float(model.intercept_[0]))
    weights.append(model.coef_[0].copy())

    # Probability of the positive class (column 1) and hard predictions.
    y_score = model.predict_proba(X_test)[:,1]
    y_pred = model.predict(X_test)

    # Fold-level metrics.
    f1_list.append(f1_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Pool this fold's outputs for the global evaluation.
    Y += list(y_test)
    Y_Pred += list(y_pred)
    Y_Proba += list(y_score)

# Performance summary: metrics averaged over the folds.
print('Averaged scores')
print('Average {}-fold F1 score : {}'.format(cv, np.mean(f1_list)))
print('Average {}-fold precision : {}'.format(cv, np.mean(precision_list)))
print('Average {}-fold recall : {}'.format(cv, np.mean(recall_list)))
print('Average {}-fold ROC AUC : {}'.format(cv, np.mean(roc_auc_list)))
print('\n')
print('\n')

1-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

Averaged scores
Average 10-fold F1 score : 0.39154198881607316
Average 10-fold precision 