## NeighLoR notebook
In this notebook, we evaluate the following models on the McDonald's data set:
1. NeighLoR
2. Logistic regression
3. NeighLoR + pretrained Word2Vec
4. NeighLoR + custom Word2Vec
5. Logistic regression + pretrained Word2Vec
6. Logistic regression + custom Word2Vec

In [21]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
import multiprocessing
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import re
import pickle
import nltk.data
from multiprocessing import Pool
import enchant
from spellchecker import SpellChecker
from copy import deepcopy
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

In [23]:
import pickle

# Load the pickled data set. The context manager guarantees the file handle
# is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./mcd_data.b', 'rb') as data_file:
    data = pickle.load(data_file)

In [24]:
class MaxLog:
    """Logistic regression over bags of feature vectors with max-pooling.

    Every observation is a 2-D array with one row per instance and ``d``
    columns. The probability of an observation is the maximum over its rows
    of ``expit(b + row . w)``. Training is batch gradient descent on the
    log-loss, where the gradient only flows through the row that attains the
    maximum.

    The class relies on ``np``, ``expit`` (``scipy.special``) and, when
    ``fit`` is called with ``n_iter``, ``tqdm`` being available in the
    enclosing namespace at call time.

    Parameters
    ----------
    d : int
        Number of features of every instance.
    fit_intercept : bool
        Whether a bias term is learnt. When False the bias stays at 0.
    init_w : numpy.ndarray or None
        Initial weights of length ``d``; drawn from a standard normal if None.
    init_b : float or None
        Initial bias; drawn from a standard normal if None (only used when
        ``fit_intercept`` is True).
    alpha : float
        Learning rate of the gradient descent.
    epsilon : float
        Gradient-norm threshold below which the descent stops.

    Raises
    ------
    ValueError
        If ``init_w`` has the wrong type/length or if ``init_b``, ``alpha``
        or ``epsilon`` is not a float.
    """

    def __init__(self, d, fit_intercept=True, init_w=None, init_b=None, alpha=1e-3, epsilon=1e-2):
        # Dimension and intercept settings
        self.dim = d
        self.grad = np.zeros(self.dim)
        self.fit_intercept = fit_intercept

        # Weights
        if init_w is None:
            self.w = np.random.normal(size=self.dim)
        elif isinstance(init_w, np.ndarray) and init_w.shape[0]==self.dim:
            self.w = init_w
        else:
            raise ValueError('Wrong dimension in intial parameters.')

        # Bias
        self.b = 0.
        if fit_intercept:
            if init_b is None:
                self.b = np.random.normal()
            elif isinstance(init_b, (float, np.float64)):
                self.b = init_b
            else:
                raise ValueError('Bias initialization must be a float type.')

        # Learning rate
        if isinstance(alpha, (float, np.float64)):
            self.alpha = alpha
        else:
            raise ValueError('Learning rate must be a float type.')

        # Stopping criterion of the gradient descent
        if isinstance(epsilon, (float, np.float64)):
            self.epsilon = epsilon
        else:
            raise ValueError('Tolerance must be a float type.')

        # Log-loss recorded at every forward pass
        self.errors = []

    def check_dim(self, X):
        """Raise ValueError unless every bag in X is 2-D with ``self.dim`` columns."""
        # The rank test turns what used to be an IndexError on 1-D input into
        # the ValueError this method is meant to raise.
        if any(len(x.shape) != 2 or x.shape[1] != self.dim for x in X):
            raise ValueError('Wrong dimension in observations')

    def check_attr(self, attr, message=''):
        """Raise ValueError (with ``message`` if given) when the instance lacks ``attr``."""
        if not hasattr(self, attr):
            if message=='':
                raise ValueError("Instance doesn't have attribute '{}'".format(attr))
            else:
                raise ValueError(message)

    def _instance_scores(self, X):
        """Return, for every bag in X, the per-row probabilities expit(b + x.w)."""
        return [expit(self.b + x.dot(self.w)) for x in X]

    def predict(self, X):
        """Return a 0/1 label per bag: 1 when the max-pooled probability is >= 0.5."""
        return np.array([int(p >= 0.5) for p in self.predict_proba(X)])

    def predict_proba(self, X):
        """Return the max-pooled probability of every bag in X as a 1-D array."""
        self.check_dim(X)
        return np.array([np.max(scores) for scores in self._instance_scores(X)])

    def fit(self, X, y, parallel=False, n_iter=None):
        """Fit the model by gradient descent.

        Parameters
        ----------
        X : sequence of 2-D arrays
            One bag per observation.
        y : sequence of 0/1 labels
            One label per bag.
        parallel : bool
            Accepted for interface compatibility; not used.
        n_iter : int or None
            Maximum number of iterations. When None the descent runs until
            the gradient norm drops below ``epsilon`` (no iteration cap).
        """
        self.check_dim(X)

        # Reset the training trace
        self.errors = []
        self.grad = 10

        if n_iter is None:
            error = np.inf
            while error >= self.epsilon:
                self.forward(X, y)
                self.backward(X, y)
                error = self.grad
        else:
            for _ in tqdm(range(n_iter), desc='Fitting the model'):
                self.forward(X, y)
                self.backward(X, y)
                if self.grad<=self.epsilon:
                    break

    def forward(self, X, y):
        """Forward pass: store arg-max rows, pooled probabilities and the log-loss."""
        probas = self._instance_scores(X)
        self.max_ind = [np.argmax(v) for v in probas]
        y_pred = np.array([np.max(v) for v in probas])
        self.eta = y_pred

        # Clip only for the loss: a probability that saturates at exactly 0
        # or 1 would otherwise put log(0) = -inf (or 0 * inf = nan) in the
        # error trace. The gradient keeps using the unclipped values.
        tiny = np.finfo(float).eps
        clipped = np.clip(y_pred, tiny, 1 - tiny)
        labels = np.array(y)
        self.errors.append(- np.mean(labels * np.log(clipped) + (1-labels) * np.log(1-clipped)))

    def backward(self, X, y):
        """Backward pass: take one gradient step and store the gradient norm in ``self.grad``."""
        grad_w, grad_b = self.gradient(X, y)
        self.w = self.w - self.alpha * grad_w
        self.b = self.b - self.alpha * grad_b
        self.grad = np.sqrt(np.sum(grad_w**2)+grad_b**2)

    def gradient(self, X, y):
        """Return ``(grad_w, grad_b)`` of the mean log-loss at the current parameters.

        Requires a previous call to ``forward`` (raises ValueError otherwise).
        ``grad_b`` is 0 when ``fit_intercept`` is False.
        """
        self.check_attr('errors', message='Need to compute forward pass to have error')
        self.check_attr('max_ind', message='Need to compute forward pass to have maximum index')
        self.check_attr('eta', message='Need to compute forward pass to have current estimate probabilities')

        # Only the row attaining the maximum of each bag contributes.
        mi = self.max_ind
        current_X = np.vstack([X[k][mi[k]].reshape((1,-1)) for k in range(len(X))])
        residual = self.eta - np.array(y)
        grad_w = current_X.T.dot(residual)/current_X.shape[0]
        grad_b = 0.
        if self.fit_intercept:
            grad_b = np.mean(residual)
        return grad_w, grad_b

In [55]:
# Display the raw text of the first row as a quick sanity check of the loaded data.
data['text'].iloc[0]

" McDonald 's is trash, in this case I mean literally. They are not worth the money. UberEats definitely needs to refund you."

In [25]:
data.rename(columns={'tagged_data':'tag_df'}, inplace=True)


def _normalize_word(word):
    """Strip dashes, slashes, backslashes, apostrophes and spaces, then lowercase."""
    return word.replace('-','').replace('/','').replace('\\','').replace("'",'').replace(' ','').lower()


# Build the normalised frames positionally and assign the whole column once.
# The previous loop read rows by position (.iloc[k]) but wrote them back by
# label through a chained `.at[k]`, which misplaces rows whenever the index is
# not 0..n-1 and may not write back to `data` at all.
_normalized_frames = np.empty(data.shape[0], dtype=object)
for k, tag_df in enumerate(tqdm(data['tag_df'])):
    _normalized_frames[k] = tag_df.assign(word=tag_df['word'].apply(_normalize_word))
data['tag_df'] = _normalized_frames

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [26]:
"""Collecting verbs, adverbs, and adjectives"""
from nltk import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
unique_verbs, unique_adverbs, unique_adjectives = set(), set(), set()


def _is_plain_token(token):
    """True for tokens with at most one '-' and one '/', no digit and no '#'."""
    if token.count('-') > 1 or token.count('/') > 1:
        return False
    if any(digit in token for digit in '0123456789'):
        return False
    return '#' not in token


def _strip_separators(token):
    """Remove dashes, slashes and backslashes from a token."""
    return token.replace('-','').replace('/','').replace('\\','')


for row in tqdm(range(data.shape[0])):
    tagged = data.tag_df.iloc[row]
    tagged = tagged[tagged.word.apply(_is_plain_token)]

    # Verbs are stored lemmatised; adverbs and adjectives are stored as-is.
    verbs = tagged.loc[tagged.tag=='V', 'word']
    adverbs = tagged.loc[tagged.tag=='R', 'word']
    adjectives = tagged.loc[tagged.tag=='A', 'word']

    unique_verbs.update(lmtzr.lemmatize(_strip_separators(tok), 'v') for tok in verbs)
    unique_adverbs.update(_strip_separators(tok) for tok in adverbs)
    unique_adjectives.update(_strip_separators(tok) for tok in adjectives)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [27]:
"""Lemmatizing all verbs"""
# Map every collected word to its lemma: verbs and adjectives go through
# WordNet, adverbs are mapped to themselves.
lem_verbs = {}
for verb in unique_verbs:
    lem_verbs[verb] = lmtzr.lemmatize(verb, 'v')

lem_adj = {}
for adj in unique_adjectives:
    lem_adj[adj] = lmtzr.lemmatize(adj, 'a')

lem_adv = dict(zip(unique_adverbs, unique_adverbs))

# Later sources win on key clashes: adverbs < adjectives < verbs.
lem_words = {**lem_adv, **lem_adj, **lem_verbs}

In [28]:
"""Building full representation"""
# One token list per row of `data`, in row order; empty tagged frames yield [].
full_representation = []
for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    if local.shape[0]>0:
        # NOTE(review): `local` looks like the very frame stored in data.tag_df, so the
        # two assignments below presumably rewrite the stored words in place and later
        # cells reading data.tag_df would see the lemmatised tokens — confirm.
        # Dashes and slashes become spaces; backslashes and apostrophes are dropped.
        local['word'] = local.word.apply(lambda x: x.replace('-',' ').replace('/',' ').replace('\\','').replace("'",''))
        # Row-wise pass (apply over the transposed frame): words tagged V/R/A/N are
        # lemmatised with the lowercased tag as WordNet POS, other string words are
        # lowercased, and non-string words become ''.
        local['word'] = local.T.apply(lambda x: (lmtzr.lemmatize(x['word'].replace("'",''), x['tag'].lower()) if x['tag'] in ['V','R','A','N'] else x['word'].lower()) if type(x['word'])==str else '')
        full_representation.append(list(local.word))
    else:
        full_representation.append([])

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [29]:
"""Building vector representations based on full representation"""
# For every tweet keep only the tokens that have an entry in `lem_words`,
# replaced by their lemma. Unknown tokens are skipped through an explicit
# membership test rather than a bare `except`, so unrelated errors surface.
vec_reps = []
for tokens in tqdm(full_representation):
    lemmas = []
    for w in tokens:
        t = w.replace('-',' ').replace('/',' ').replace('\\','')
        if t in lem_words:
            lemmas.append(lem_words[t])
    vec_reps.append(lemmas)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [30]:
"""Feature extraction"""
data['pi'] = data['neg_pi']
ys = list(data.pi)

# Per-lemma accumulators: `*_count` is the number of occurrences and
# `*_cond` the sum of the labels of the tweets the lemma occurred in.
verb_cond = {}
adverb_cond = {}
adj_cond = {}
verb_count = {}
adverb_count = {}
adj_count = {}

# Disjoint vocabularies with priority verbs > adverbs > adjectives.
unique_verb = list(lem_verbs.keys())
unique_adv = list(set(list(lem_adv)).difference(unique_verbs))
unique_adj = list(set(lem_adj).difference(unique_adverbs).difference(unique_verbs))

# Set views of the vocabularies: membership is tested once per token, and
# testing against the lists made the loop below accidentally quadratic.
_verb_lookup = set(unique_verb)
_adv_lookup = set(unique_adv)
_adj_lookup = set(unique_adj)


def _tally(counts, label_sums, key, label):
    """Add one occurrence of `key` with tweet label `label` to both accumulators."""
    counts[key] = counts.get(key, 0) + 1
    label_sums[key] = label_sums.get(key, 0) + label


for k in tqdm(range(data.shape[0])):
    # Loop over the tweets
    label = data.pi.iloc[k]
    for w in full_representation[k]:
        # Skip tokens with several separators, a digit or a hashtag sign.
        if w.count('-')>1 or w.count('/')>1 or any(digit in w for digit in '0123456789') or '#' in w:
            continue
        w = w.replace('-',' ').replace('/',' ').replace('\\','')
        if w in _verb_lookup:
            _tally(verb_count, verb_cond, lem_verbs[w], label)
        elif w in _adv_lookup:
            _tally(adverb_count, adverb_cond, w, label)
        elif w in _adj_lookup:
            _tally(adj_count, adj_cond, lem_adj[w], label)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [31]:
"""Computing the probabilities"""
def _build_cond_table(cond_sums, counts):
    """Turn per-word label sums and counts into a table of conditional probabilities.

    Parameters
    ----------
    cond_sums : mapping word -> sum of the labels of the tweets containing it
    counts : mapping word -> number of occurrences

    Returns
    -------
    (table, counts) where `counts` is the counts as a Series and `table` is a
    DataFrame restricted to words with count >= 10, with columns
      cond_proba        label sum / count
      count             number of occurrences
      lower_bound       cond_proba - 1.96 * sqrt(p(1-p)/count), floored at 0
      cond_proba_no_pi  1 - cond_proba
      lower_bound_no_pi same bound for cond_proba_no_pi (not floored)
    """
    counts = pd.Series(counts)
    proba = pd.Series(cond_sums)/counts
    proba.sort_values(ascending=False, inplace=True)

    table = pd.DataFrame([proba, counts], index=['cond_proba','count']).T
    table['lower_bound'] = (table['cond_proba'] - 1.96*np.sqrt(table['cond_proba']*(1-table['cond_proba'])/table['count'])).apply(lambda x: max(x,0))
    table['cond_proba_no_pi'] = 1. - table['cond_proba']
    table['lower_bound_no_pi'] = table['cond_proba_no_pi'] - 1.96 * np.sqrt(table['cond_proba_no_pi'] * (1-table['cond_proba_no_pi'])/table['count'])
    return table[table['count']>=10], counts


def _select_features(table, tenths):
    """Return the top `tenths`/10 of words by each lower bound, concatenated.

    The words ranked highest on `lower_bound` come first, followed by the
    words ranked highest on `lower_bound_no_pi`.
    """
    n_top = table.shape[0] * tenths//10
    top_pi = table.sort_values('lower_bound', ascending=False).index[:n_top]
    top_no_pi = table.sort_values('lower_bound_no_pi', ascending=False).index[:n_top]
    return list(top_pi) + list(top_no_pi)


verb_cond, verb_count = _build_cond_table(verb_cond, verb_count)
adverb_cond, adverb_count = _build_cond_table(adverb_cond, adverb_count)
adj_cond, adj_count = _build_cond_table(adj_cond, adj_count)

verb_features_10 = _select_features(verb_cond, 1)
verb_features_20 = _select_features(verb_cond, 2)
verb_features_30 = _select_features(verb_cond, 3)

adverb_features_10 = _select_features(adverb_cond, 1)
adverb_features_20 = _select_features(adverb_cond, 2)
adverb_features_30 = _select_features(adverb_cond, 3)

adj_features_10 = _select_features(adj_cond, 1)
adj_features_20 = _select_features(adj_cond, 2)
adj_features_30 = _select_features(adj_cond, 3)

# Full feature lists: verbs, then adverbs, then adjectives.
features_10 = verb_features_10 + adverb_features_10 + adj_features_10
features_20 = verb_features_20 + adverb_features_20 + adj_features_20
features_30 = verb_features_30 + adverb_features_30 + adj_features_30

In [37]:
"""Getting feature representation for logistic regression"""
# One 0/1 indicator per feature of `features_20` for every tweet. Each tweet is
# turned into a set once so that the per-feature membership test does not scan
# the token list again for every feature.
feat_reps = np.array([
    [int(w in tokens) for w in features_20]
    for tokens in (set(fr) for fr in tqdm(full_representation))
])

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [33]:
"""Collecting the negative bigrams"""
import spacy

# Pipeline called by get_neg_pairs to obtain POS tags and dependency labels.
# NOTE(review): the 'en' shortcut name presumably only resolves on older spaCy
# installs — confirm against the installed version.
nlp = spacy.load('en')

def get_neg_pairs(text):
    """Return the [negation token, verb lemma] pairs found in `text`.

    A pair is produced for every token whose dependency label is 'neg' and
    whose head is tagged 'VERB'; the head's text is lemmatised as a verb with
    WordNet. An empty list is returned when there is no such token.

    The tokens are filtered directly instead of going through a temporary
    DataFrame, and the lemmatizer class comes from the notebook's imports.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        [token.text, lemmatizer.lemmatize(token.head.text, 'v')]
        for token in nlp(text)
        if token.dep_ == 'neg' and token.head.pos_ == 'VERB'
    ]

neg_pairs_data = []
N = data.shape[0]

# Process the texts in batches of 500 so the progress bar advances.
# The context manager shuts the 40 worker processes down once all batches are
# done; previously the pool was never closed and its workers leaked.
with Pool(40) as p:
    for k in tqdm(range(N//500+1)):
        neg_pairs_data += p.map(get_neg_pairs, list(data.text.iloc[k*500:min(N,(k+1)*500)]))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))




In [34]:
"""Recording the verb bigrams"""
# Count how often each selected verb (from verb_features_30) is the head of a
# negation pair. A set makes the membership test constant-time, and dict.get
# replaces the bare `except` that was used to initialise the counters.
_selected_verbs = set(verb_features_30)
verb_bigrams = {}
for tweet_pairs in neg_pairs_data:
    for pair in tweet_pairs:
        verb = pair[1]
        if verb in _selected_verbs:
            verb_bigrams[verb] = verb_bigrams.get(verb, 0) + 1

# Keep the verbs negated at least 5 times, as a list.
verb_bigrams = pd.Series(verb_bigrams)
verb_bigrams = verb_bigrams[verb_bigrams>=5]
verb_bigrams = list(verb_bigrams.index)

In [35]:
"""Building the negative bigrams representations for neighborhoods"""
neg_presence = []
_negated_per_tweet = []
for k in tqdm(range(data.shape[0])):
    tweet_tokens = set(full_representation[k])
    negated = []
    for neg_pair in neg_pairs_data[k]:
        # Both members of the pair must occur among the tweet's tokens.
        if len(tweet_tokens.intersection(neg_pair)) != 2:
            continue
        # Verb first, then the negation token, each kept only if it is a
        # retained negated verb.
        for member in (neg_pair[1], neg_pair[0]):
            if member in verb_bigrams:
                negated.append(member)
    _negated_per_tweet.append(negated)

# One 0/1 indicator per retained negated verb for every tweet.
neg_reps = np.array([[int(v in nr) for v in verb_bigrams] for nr in tqdm(_negated_per_tweet)])

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [38]:
# Shape of the feature matrix built for logistic regression.
feat_reps.shape

(23102, 610)

In [39]:
# Shape of the negated-verb indicator matrix.
neg_reps.shape

(23102, 86)

## Simple logistic regression

In [40]:
from scipy.special import expit, logit
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

# Design matrix: word indicators followed by negated-verb indicators.
n = len(ys)
X_neg_1 = np.hstack([feat_reps, neg_reps])
d = X_neg_1.shape[1]

# Positive and negative row indexes, shuffled reproducibly.
neg_index = [k for k in range(len(ys)) if ys[k]==0]
pos_index = [k for k in range(len(ys)) if ys[k]==1]

np.random.seed(seed=0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

cv = 10


def _split_into_folds(indexes, n_folds):
    """Cut `indexes` into `n_folds` consecutive folds; the last one takes the remainder."""
    size = len(indexes)//n_folds
    folds = [indexes[i*size:(i+1)*size] for i in range(n_folds-1)]
    folds.append(indexes[(n_folds-1)*size:])
    assert sum(len(fold) for fold in folds)==len(indexes)  # every index lands in exactly one fold
    return folds


# Folds are built per class so that each test fold holds both classes.
neg_index_list = _split_into_folds(neg_index, cv)
pos_index_list = _split_into_folds(pos_index, cv)

# Outputs, predictions and probabilities collected across folds.
Y = ys
Y_Pred = []
Y_Proba = []

# Per-fold metrics.
precision_list = []
recall_list = []
roc_auc_list = []
f1_list = []

_all_neg = set(neg_index)
_all_pos = set(pos_index)

# Cross-validation of the logistic regression on the features built from
# `features_20` plus the negated-verb indicators.
# Caveat: the feature selection was performed on the whole data set, which may
# bias the scores upwards. This effect is neglected in this test.
for k in range(cv):
    # Splitting the data into train and test
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    # Integer-array indexing picks the same rows, in the same order, as
    # stacking one-row slices did, without building one array per row.
    neg_test = X_neg_1[list(neg_index_list[k])]
    pos_test = X_neg_1[list(pos_index_list[k])]
    neg_train = X_neg_1[list(_all_neg.difference(neg_index_list[k]))]
    pos_train = X_neg_1[list(_all_pos.difference(pos_index_list[k]))]

    X_train = np.vstack([pos_train, neg_train])
    X_test = np.vstack([pos_test, neg_test])

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    # Fitting the model (C=1e6 makes the penalty negligible)
    print('Fitting the model')
    model = LogisticRegression(fit_intercept=True, C=1e6)
    model.fit(X_train, y_train)

    print('Evaluation and storage of model parameters\n')
    # Predictions and probability scores of the positive class
    y_score = model.predict_proba(X_test)[:,1]
    y_pred = model.predict(X_test)

    # Metrics of the fold
    f1_list.append(f1_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep predictions and scores for a global evaluation after the loop
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(np.mean(precision_list), np.std(precision_list),
                                                                                          np.mean(recall_list), np.std(recall_list),
                                                                                          np.mean(f1_list), np.std(f1_list),
                                                                                          np.mean(roc_auc_list), np.std(roc_auc_list)))

1-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

Precision : 0.622349282185821 +- 0.028318278755104835
Recall : 0.422443621965915 +- 0.031

## NeighLoR approach

In [41]:
"""Building the neighborhoods for the NeighLoR model"""
# Every collected verb, adverb and adjective in a single lookup set.
unique_corpus = set().union(unique_verbs, unique_adverbs, unique_adjectives)

def build_neighborhood(local):
    """Extract the word windows centred on the sentiment-bearing tokens of a tweet.

    Parameters
    ----------
    local : tagged tweet exposing aligned `word` and `tag` columns.

    Returns
    -------
    (windows_5, windows_7, windows_9): three lists of token lists. A window
    is produced for every kept token that is in `unique_corpus` or tagged
    'V', 'R' or 'A', and spans up to 2, 3 and 4 kept tokens on each side.
    ([], [], []) is returned when nothing is kept or when the tweet cannot
    be processed.

    Tokens are dropped when they contain more than one '-' or '/', a digit or
    a '#', or when their tag is one of 'U', '&', ',', '$', '!', '^', '#'.

    Side effect: verbs without '&' are added to the global `unique_verbs`
    (when run in a worker process this only affects that worker's copy).
    """
    excluded_tags = {'U', '&', ',', '$', '!', '^', '#'}
    try:
        kept = [
            (word, tag)
            for word, tag in zip(local.word, local.tag)
            if word.count('-') <= 1 and word.count('/') <= 1
            and not any(digit in word for digit in '0123456789')
            and '#' not in word
            and tag not in excluded_tags
        ]
        if not kept:
            return ([], [], [])

        # Separators become spaces; backslashes and apostrophes are removed.
        words = [word.replace('-',' ').replace('/',' ').replace('\\','').replace("'",'') for word, _ in kept]
        tags = [tag for _, tag in kept]

        local_words_5 = []
        local_words_7 = []
        local_words_9 = []
        for i, (w, tag) in enumerate(zip(words, tags)):
            if w in unique_corpus or tag in ('V', 'R', 'A'):
                # Slices are clamped at the end of the list automatically.
                local_words_5.append(words[max(0, i-2):i+3])
                local_words_7.append(words[max(0, i-3):i+4])
                local_words_9.append(words[max(0, i-4):i+5])
            # Digits, '#', '-' and '/' cannot occur in `w` any more at this
            # point, so only the '&' test remains meaningful.
            if tag == 'V' and '&' not in w:
                unique_verbs.add(w)
        return (local_words_5, local_words_7, local_words_9)
    except Exception:
        # Best effort: a tweet that cannot be processed yields no window.
        # `Exception` (not a bare except) lets KeyboardInterrupt through.
        return ([], [], [])

neighborhoods = []
# Batches of 500 tagged tweets keep the progress bar moving. The context
# manager shuts the 40 workers down once every batch is done; previously the
# pool was never closed and its workers leaked.
with Pool(40) as p:
    for k in tqdm(range(N//500+1)):
        neighborhoods += p.map(build_neighborhood, list(data.tag_df.iloc[500*k:min(500*(k+1),N)]))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))




In [42]:
# Lowercase every token of the second window list of each tweet and replace it
# by its lemma when `lem_words` has one.
actual_words_7 = [
    [[lem_words.get(w.lower(), w.lower()) for w in v] for v in neighborhoods[k][1]]
    for k in tqdm(range(len(neighborhoods)))
]

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [60]:
# Show the raw text of the first tweet, then the windows stored for it in actual_words_7.
print(data.text.iloc[0])
actual_words_7[0]

 McDonald 's is trash, in this case I mean literally. They are not worth the money. UberEats definitely needs to refund you.


[['s', 'is', 'trash', 'in'],
 ['s', 'is', 'trash', 'in', 'this'],
 ['s', 'is', 'trash', 'in', 'this', 'case'],
 ['trash', 'in', 'this', 'case', 'i', 'mean', 'literally'],
 ['this', 'case', 'i', 'mean', 'literally', 'they', 'are'],
 ['case', 'i', 'mean', 'literally', 'they', 'are', 'not'],
 ['mean', 'literally', 'they', 'are', 'not', 'worth', 'the'],
 ['literally', 'they', 'are', 'not', 'worth', 'the', 'money'],
 ['they', 'are', 'not', 'worth', 'the', 'money', 'definitely'],
 ['worth', 'the', 'money', 'definitely', 'needs', 'to', 'refund'],
 ['the', 'money', 'definitely', 'needs', 'to', 'refund', 'you'],
 ['money', 'definitely', 'needs', 'to', 'refund', 'you'],
 ['definitely', 'needs', 'to', 'refund', 'you']]

In [43]:
"""Building representations"""
d = len(features_20)


def represent(x):
    """Return one 0/1 indicator per word of `features_20`: 1 when the word is in `x`."""
    return [1 if feature in x else 0 for feature in features_20]


# One list of indicator vectors (one per window) for every tweet...
X_basic = []
for tweet in tqdm(actual_words_7):
    X_basic.append([represent(pattern) for pattern in tweet])

# ...converted to one array per tweet.
X_basic = [np.array(tweet) for tweet in tqdm(X_basic)]

# Tweets without any window get a single all-zero row.
for k in tqdm(range(len(X_basic))):
    if len(X_basic[k]) == 0:
        X_basic[k] = np.zeros((1,d))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [44]:
"""Building the negative bigrams representations for neighborhoods"""
neg_reps = []
neg_presence = []
for k in tqdm(range(data.shape[0])):
    windows = actual_words_7[k]
    hits_per_window = []
    for window in windows:
        window_tokens = set(window)
        hits = []
        for neg_pair in neg_pairs_data[k]:
            # Both members of the pair must fall inside the window.
            if len(window_tokens.intersection(neg_pair)) != 2:
                continue
            # Verb first, then the negation token, each kept only if it is a
            # retained negated verb.
            for member in (neg_pair[1], neg_pair[0]):
                if member in verb_bigrams:
                    hits.append(member)
        hits_per_window.append(hits)

    neg_reps.append(hits_per_window)
    # Column vector flagging the windows that hold at least one negated verb.
    flags = np.array([1 if hits else 0 for hits in hits_per_window])
    neg_presence.append(flags.reshape((len(windows),1)))

# One 0/1 indicator per retained negated verb, for every window of every tweet.
neg_reps = [np.array([[int(v in hits) for v in verb_bigrams] for hits in tweet]) for tweet in neg_reps]

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [45]:
"""Correcting the defects of negation representations"""
d_neg = len(verb_bigrams)

# Tweets without any window get a single all-zero row in both structures.
for k, rep in enumerate(neg_reps):
    if len(rep) == 0:
        neg_reps[k] = np.zeros((1,d_neg))
    if len(neg_presence[k]) == 0:
        neg_presence[k] = np.zeros((1,1))

In [46]:
"""Complementing the representations for both methods of handling negations"""
# Per tweet, append the negated-verb indicators to the word indicators of every window.
X_neg_1 = [np.hstack([basic, neg_reps[k]]) for k, basic in enumerate(X_basic)]

# Feature names in column order, and their number.
feat_1 = features_20 + ['not_' + w for w in verb_bigrams]
d1 = len(features_20) + len(verb_bigrams)

In [92]:
from scipy.special import expit, logit
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

"""Building indexes"""
from scipy.special import logit

# Number of samples and number of columns of the negation-augmented representation
n = len(ys)
d = X_neg_1[0].shape[1]

# Positions of the samples labelled 0 (negative) and 1 (positive)
neg_index = [row for row in range(n) if ys[row] == 0]
pos_index = [row for row in range(n) if ys[row] == 1]

# Fixed seed, then one in-place shuffle per class (negatives first)
np.random.seed(0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of folds. Within each class the first cv-1 folds hold `s` positions
# each and the last fold receives everything that is left.
cv = 10

s = len(neg_index) // cv
neg_index_list = [neg_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
neg_index_list += [neg_index[(cv - 1) * s:]]
assert sum(map(len, neg_index_list)) == len(neg_index)  # fold sizes add up to the class size

s = len(pos_index) // cv
pos_index_list = [pos_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
pos_index_list += [pos_index[(cv - 1) * s:]]
assert sum(map(len, pos_index_list)) == len(pos_index)  # fold sizes add up to the class size

# Labels, plus the per-fold predictions and scores
Y = ys
Y_Pred, Y_Proba = [], []

# One value per fold for each metric
precision_list, recall_list, roc_auc_list, f1_list = [], [], [], []

# Fitted bias and weights of every fold
biases, weights = [], []

# Cross-validation loop: fold k is held out, the other folds are used for fitting.
for k in range(cv):
    print('{}-th fold'.format(k + 1))
    print('Splitting the data')

    # Held-out samples of each class
    pos_test = [X_neg_1[idx] for idx in pos_index_list[k]]
    neg_test = [X_neg_1[idx] for idx in neg_index_list[k]]
    # Every sample of the class that is not in the held-out fold
    pos_train = [X_neg_1[idx] for idx in set(pos_index).difference(pos_index_list[k])]
    neg_train = [X_neg_1[idx] for idx in set(neg_index).difference(neg_index_list[k])]

    # Positives are listed before negatives in both splits
    X_train, X_test = pos_train + neg_train, pos_test + neg_test
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    d = X_train[0].shape[1]
    # Zero initial weights; initial bias set to the logit of the training positive rate
    model = MaxLog(d=d,
                   fit_intercept=True,
                   init_w=np.zeros(d),
                   init_b=logit(np.mean(y_train)),
                   alpha=10.,
                   epsilon=1e-4)
    model.fit(X_train, y_train, n_iter=5000, parallel=False)

    print('Evaluation and storage of model parameters\n')
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    # Label-based metrics use the predictions, ROC AUC uses the scores
    for fold_scores, metric in ((f1_list, f1_score),
                                (precision_list, precision_score),
                                (recall_list, recall_score)):
        fold_scores.append(metric(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep this fold's outputs and fitted parameters
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))
    biases.append(model.b)
    weights.append(model.w)

# Mean and standard deviation over the folds, metric by metric
print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(
    *[stat(values)
      for values in (precision_list, recall_list, f1_list, roc_auc_list)
      for stat in (np.mean, np.std)]))

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=5000), HTML(value='')))

Evaluation and storage of model parameters

Precision : 0.6790144704884488 +- 0.03880330055449566
Recall : 0.5267085556894474 +- 0.02634014347925252
F1 : 0.5928325669676058 +- 0.02749710994785614
ROC AUC : 0.9485173815476425 +- 0.007414074217384614


## Pretrained Word2Vec + Logistic regression

In [61]:
"""Loading W2V matrix and building representation"""
import os

from gensim.models import KeyedVectors

# Location of the binary word2vec file. The default is the path originally
# hard-coded in this notebook; set the GOOGLE_W2V_PATH environment variable to
# point elsewhere without editing the cell.
GOOGLE_W2V_PATH = os.environ.get(
    'GOOGLE_W2V_PATH',
    '/home/qrg-researchlab/Downloads/GoogleNews-vectors-negative300.bin',
)

google_w2v = KeyedVectors.load_word2vec_format(GOOGLE_W2V_PATH, binary=True)

def get_embed(x):
    """Sum the `google_w2v` vectors of the V/A/R/N-tagged words of `x`.

    `x` is accessed through `x.tag` and `x.word`. Words that are not in
    `google_w2v.vocab` are ignored. When no word is kept, a zero vector of
    length 300 is returned.
    """
    kept_words = x.loc[x.tag.isin(['V', 'A', 'R', 'N']), 'word']
    vectors = [google_w2v[w] for w in kept_words if w in google_w2v.vocab]
    if not vectors:
        return np.zeros(300)
    return np.vstack(vectors).sum(axis=0)

# One embedding row per entry of data.tag_df, stacked in order
X_pt_w2v = np.concatenate(
    [get_embed(data.tag_df.iloc[row]).reshape((1, -1)) for row in tqdm(range(data.shape[0]))],
    axis=0,
)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [62]:
from scipy.special import expit, logit
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

"""Building indexes"""
# Number of samples and number of embedding columns
n = len(ys)
d = X_pt_w2v.shape[1]

# Positions of the samples labelled 0 (negative) and 1 (positive)
neg_index = [row for row in range(n) if ys[row] == 0]
pos_index = [row for row in range(n) if ys[row] == 1]

# Fixed seed, then one in-place shuffle per class (negatives first)
np.random.seed(0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of folds. Within each class the first cv-1 folds hold `s` positions
# each and the last fold receives everything that is left.
cv = 10

s = len(neg_index) // cv
neg_index_list = [neg_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
neg_index_list += [neg_index[(cv - 1) * s:]]
assert sum(map(len, neg_index_list)) == len(neg_index)  # fold sizes add up to the class size

s = len(pos_index) // cv
pos_index_list = [pos_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
pos_index_list += [pos_index[(cv - 1) * s:]]
assert sum(map(len, pos_index_list)) == len(pos_index)  # fold sizes add up to the class size

# Labels, plus the per-fold predictions and scores
Y = ys
Y_Pred, Y_Proba = [], []

# One value per fold for each metric
precision_list, recall_list, roc_auc_list, f1_list = [], [], [], []

# Cross-validation loop: fold k is held out, the other folds are used for fitting.
for k in range(cv):
    print('{}-th fold'.format(k + 1))
    print('Splitting the data')

    # Rows of the held-out fold, class by class
    pos_test = np.concatenate([X_pt_w2v[idx:idx + 1] for idx in pos_index_list[k]], axis=0)
    neg_test = np.concatenate([X_pt_w2v[idx:idx + 1] for idx in neg_index_list[k]], axis=0)
    # Rows of every sample of the class that is not in the held-out fold
    pos_train = np.concatenate(
        [X_pt_w2v[idx:idx + 1] for idx in set(pos_index).difference(pos_index_list[k])], axis=0)
    neg_train = np.concatenate(
        [X_pt_w2v[idx:idx + 1] for idx in set(neg_index).difference(neg_index_list[k])], axis=0)

    # Positives are stacked before negatives in both splits
    X_train = np.concatenate((pos_train, neg_train), axis=0)
    X_test = np.concatenate((pos_test, neg_test), axis=0)
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    d = X_train.shape[1]
    model = LogisticRegression(C=1e4, fit_intercept=True)
    model.fit(X_train, y_train)

    print('Evaluation and storage of model parameters\n')
    # Column 1 of predict_proba is used as the score of the positive class
    y_score = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Label-based metrics use the predictions, ROC AUC uses the scores
    for fold_scores, metric in ((f1_list, f1_score),
                                (precision_list, precision_score),
                                (recall_list, recall_score)):
        fold_scores.append(metric(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep this fold's outputs
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

# Mean and standard deviation over the folds, metric by metric
print("Cross-validation evaluation of logistic regression + pretrained Word2Vec:")
print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(
    *[stat(values)
      for values in (precision_list, recall_list, f1_list, roc_auc_list)
      for stat in (np.mean, np.std)]))

1-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

Cross-validation evaluation of logistic regression + pretrained Word2Vec:
Precision : 0.6

## Pretrained Word2Vec + NeighLoR

In [63]:
"""Building representations for word embeddings"""
# Length of the vectors produced by `represent` when no word is found
d = 300

def represent(x):
    """Sum the `google_w2v` vectors of the items of `x`.

    Parameters
    ----------
    x : iterable
        Items looked up one by one in `google_w2v`.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the vectors that were found, or a zero vector of
        length 300 when none was found.

    Notes
    -----
    Only `KeyError` (item missing from the embedding) is skipped. The
    previous bare `except` also hid every other failure, e.g. a typo in the
    model name; such errors now propagate.
    """
    rep = []
    for w in x:
        try:
            rep.append(google_w2v[w])
        except KeyError:
            # Item without an embedding: leave it out of the sum
            continue
    if rep:
        return np.vstack(rep).sum(axis=0)
    return np.zeros(300)

# Step 1: per tweet, one row vector per pattern; a tweet without any pattern
# is directly given a single zero row of width 300.
X_basic = [
    np.zeros((1, 300)) if len(tweet) == 0
    else [represent(pattern).reshape((1, -1)) for pattern in tweet]
    for tweet in tqdm(actual_words_7)
]

# Step 2: stack the rows of each tweet into one 2-D array
X_basic = [np.vstack(rows) for rows in tqdm(X_basic)]

# Step 3: any array left with zero rows is replaced by one zero row of width d
for k in tqdm(range(len(X_basic))):
    if not X_basic[k].shape[0]:
        X_basic[k] = np.zeros((1, d))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [64]:
"""Building indexes"""
from scipy.special import logit

# Number of samples and number of columns of the per-tweet representation
n = len(ys)
d = X_basic[0].shape[1]

# Positions of the samples labelled 0 (negative) and 1 (positive)
neg_index = [row for row in range(n) if ys[row] == 0]
pos_index = [row for row in range(n) if ys[row] == 1]

# Fixed seed, then one in-place shuffle per class (negatives first)
np.random.seed(0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of folds. Within each class the first cv-1 folds hold `s` positions
# each and the last fold receives everything that is left.
cv = 10

s = len(neg_index) // cv
neg_index_list = [neg_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
neg_index_list += [neg_index[(cv - 1) * s:]]
assert sum(map(len, neg_index_list)) == len(neg_index)  # fold sizes add up to the class size

s = len(pos_index) // cv
pos_index_list = [pos_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
pos_index_list += [pos_index[(cv - 1) * s:]]
assert sum(map(len, pos_index_list)) == len(pos_index)  # fold sizes add up to the class size

# Labels, plus the per-fold predictions and scores
Y = ys
Y_Pred, Y_Proba = [], []

# One value per fold for each metric
precision_list, recall_list, roc_auc_list, f1_list = [], [], [], []

# Fitted bias and weights of every fold
biases, weights = [], []

# Cross-validation loop: fold k is held out, the other folds are used for fitting.
for k in range(cv):
    print('{}-th fold'.format(k + 1))
    print('Splitting the data')

    # Held-out samples of each class
    pos_test = [X_basic[idx] for idx in pos_index_list[k]]
    neg_test = [X_basic[idx] for idx in neg_index_list[k]]
    # Every sample of the class that is not in the held-out fold
    pos_train = [X_basic[idx] for idx in set(pos_index).difference(pos_index_list[k])]
    neg_train = [X_basic[idx] for idx in set(neg_index).difference(neg_index_list[k])]

    # Positives are listed before negatives in both splits
    X_train, X_test = pos_train + neg_train, pos_test + neg_test
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    d = X_train[0].shape[1]
    # Zero initial weights; initial bias set to the logit of the training positive rate
    model = MaxLog(d=d,
                   fit_intercept=True,
                   init_w=np.zeros(d),
                   init_b=logit(np.mean(y_train)),
                   alpha=10.,
                   epsilon=1e-4)
    model.fit(X_train, y_train, n_iter=3000, parallel=False)

    print('Evaluation and storage of model parameters\n')
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    # Label-based metrics use the predictions, ROC AUC uses the scores
    for fold_scores, metric in ((f1_list, f1_score),
                                (precision_list, precision_score),
                                (recall_list, recall_score)):
        fold_scores.append(metric(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep this fold's outputs and fitted parameters
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))
    biases.append(model.b)
    weights.append(model.w)

# Mean and standard deviation over the folds, metric by metric
print("Cross-validation evaluation of the pretrained Word2Vec + NeighLoR approach:")
print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(
    *[stat(values)
      for values in (precision_list, recall_list, f1_list, roc_auc_list)
      for stat in (np.mean, np.std)]))

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

Cross-validation evaluation of the pretrained Word2Vec + NeighLoR approach:
Precision : 0.47646366130020257 +- 0.20170974089467342
Recall : 0.5338268204510243 +- 0.3686147533978055
F1 : 0.40309420669281104 +- 0.2370649924487037
ROC AUC : 0.9160995513875927 +- 0.05478500654908393


## Training custom Word2Vec

In [72]:
"""Loading large BA data to train our own specialized embedding"""
from gensim.models import Word2Vec
import os
from nltk import WordNetLemmatizer

# Load every pickled chunk (file names ending in '.b') found in
# ./processed_data/mcdonalds/.
# NOTE(review): the original comment called this "British Airways data",
# while the folder read here is the McDonald's one -- confirm the wording.
# NOTE(review): os.listdir returns entries in arbitrary order, so the row
# order of data_df may vary between machines -- confirm whether that matters.
big_data = []

for file in tqdm(os.listdir('./processed_data/mcdonalds/')):
    if file.endswith('.b'):
        # The context manager closes the handle right after unpickling; the
        # original `pickle.load(open(...))` never closed it explicitly.
        # SECURITY: pickle.load can run arbitrary code -- only load trusted files.
        with open(os.path.join('./processed_data/mcdonalds/', file), 'rb') as handle:
            big_data.append(pickle.load(handle))

# Stack the chunks into one frame and renumber its rows 0 .. len-1
data_df = pd.concat(big_data, axis=0, sort=False)
data_df.index = list(range(data_df.shape[0]))

"""Collecting verbs, adverbs, and adjectives"""
def _is_plain_word(x):
    """True when `x` has at most one '-', at most one '/', no digit 0-9 and no '#'."""
    return (x.count('-') <= 1
            and x.count('/') <= 1
            and not any(str(digit) in x for digit in range(10))
            and '#' not in x)


def _strip_separators(x):
    """Return `x` with every '-', '/' and backslash removed."""
    return x.replace('-', '').replace('/', '').replace('\\', '')


lmtzr = WordNetLemmatizer()
unique_verbs, unique_adverbs, unique_adjectives = set(), set(), set()

# Only the first third of the rows is scanned
N = data_df.shape[0]//3

for k in tqdm(range(N)):
    local = data_df.tag_df.iloc[k]
    local = local[local.word.apply(_is_plain_word)]
    # Verbs are stored already lemmatized; adverbs and adjectives are stored as cleaned surface forms
    unique_verbs.update([lmtzr.lemmatize(_strip_separators(w), 'v') for w in local[local.tag == 'V'].word])
    unique_adverbs.update([_strip_separators(w) for w in local[local.tag == 'R'].word])
    unique_adjectives.update([_strip_separators(w) for w in local[local.tag == 'A'].word])

# Word -> replacement dictionaries, one per part of speech.
# NOTE(review): the keys of lem_verbs are the lemmatized forms collected
# above, so inflected verb forms are not keys of this mapping -- confirm
# that this is intended.
lem_verbs = {verb: lmtzr.lemmatize(verb, 'v') for verb in unique_verbs}
lem_adj = {adj: lmtzr.lemmatize(adj, 'a') for adj in unique_adjectives}
lem_adv = {adv: adv for adv in unique_adverbs}

# Merge order matters for duplicate keys: adjectives override adverbs, verbs override both
lem_words = dict(lem_adv)
lem_words.update(lem_adj)
lem_words.update(lem_verbs)

HBox(children=(IntProgress(value=0, max=221), HTML(value='')))




HBox(children=(IntProgress(value=0, max=329236), HTML(value='')))




In [73]:
"""Applying lemmatization to the text"""
# Replace, in each of the first N tagged tweets, every word that is a key of
# lem_words by its mapped value; other words are left unchanged.
for k in tqdm(range(N)):
    # NOTE(review): `local` is presumably the very DataFrame object stored in
    # the `tag_df` column, so the column assignment below updates the stored
    # tweet in place -- the sentence extraction that follows relies on it.
    local = data_df.tag_df.iloc[k]
    local['word'] = local.word.apply(lambda x: lem_words.get(x, x))
    # The former `data_df.tag_df['word'] = deepcopy(local.word)` was removed:
    # indexing the column Series with 'word' addresses a *label* named 'word',
    # not row k, so it never wrote the tweet back and only inserted a stray
    # entry into the Series.

"""Training the embedding"""
def _content_words(tagged):
    """Words of one tagged tweet whose tag is V, R, A or N, as a list."""
    return list(tagged[tagged.tag.isin(['V', 'R', 'A', 'N'])].word)


# One token list per tweet, for the first N tweets
sentences = [_content_words(data_df.tag_df.iloc[row]) for row in tqdm(range(N))]

# Model settings, passed unchanged to the Word2Vec constructor
w2v_settings = dict(
    size=50,
    window=2,
    min_count=20,
    negative=5,
    alpha=0.05,
    min_alpha=0.001,
    workers=40,
)
w2v_model = Word2Vec(**w2v_settings)
w2v_model.build_vocab(sentences, progress_per=10000)
# Kept as the last expression so that its return value is displayed by the cell
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

HBox(children=(IntProgress(value=0, max=329236), HTML(value='')))




HBox(children=(IntProgress(value=0, max=329236), HTML(value='')))




(75682287, 90847260)

## Custom Word2Vec + logistic regression

In [74]:
"""Building the representation"""
def get_embed(x):
    """Sum the custom Word2Vec vectors of the V/A/R/N-tagged words of `x`.

    Parameters
    ----------
    x : table accessed through `x.tag` and `x.word`
        One tagged tweet.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the vectors of the kept words that belong to the
        model vocabulary, or a zero vector of length 50 when there is none.

    Notes
    -----
    Vectors are read through `w2v_model.wv`, the same object used for the
    vocabulary test. Indexing the model itself (`w2v_model[w]`) is deprecated
    in gensim and emitted a warning on every call.
    """
    vectors = w2v_model.wv
    local = list(x[x.tag.isin(['V', 'A', 'R', 'N'])].word)
    embed = [vectors[w].reshape((1, -1)) for w in local if w in vectors.vocab]
    if embed:
        return np.vstack(embed).sum(axis=0)
    return np.zeros(50)

# One embedding row per entry of data.tag_df, stacked in order
X_pt_w2v = np.concatenate(
    [get_embed(data.tag_df.iloc[row]).reshape((1, -1)) for row in tqdm(range(data.shape[0]))],
    axis=0,
)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

  after removing the cwd from sys.path.





In [75]:
"""Building indexes"""
from scipy.special import logit

# Number of samples and number of embedding columns
n = len(ys)
d = X_pt_w2v.shape[1]

# Positions of the samples labelled 0 (negative) and 1 (positive)
neg_index = [row for row in range(n) if ys[row] == 0]
pos_index = [row for row in range(n) if ys[row] == 1]

# Fixed seed, then one in-place shuffle per class (negatives first)
np.random.seed(0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of folds. Within each class the first cv-1 folds hold `s` positions
# each and the last fold receives everything that is left.
cv = 10

s = len(neg_index) // cv
neg_index_list = [neg_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
neg_index_list += [neg_index[(cv - 1) * s:]]
assert sum(map(len, neg_index_list)) == len(neg_index)  # fold sizes add up to the class size

s = len(pos_index) // cv
pos_index_list = [pos_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
pos_index_list += [pos_index[(cv - 1) * s:]]
assert sum(map(len, pos_index_list)) == len(pos_index)  # fold sizes add up to the class size

# Labels, plus the per-fold predictions and scores
Y = ys
Y_Pred, Y_Proba = [], []

# One value per fold for each metric
precision_list, recall_list, roc_auc_list, f1_list = [], [], [], []

# Cross-validation loop: fold k is held out, the other folds are used for fitting.
for k in range(cv):
    print('{}-th fold'.format(k + 1))
    print('Splitting the data')

    # Rows of the held-out fold, class by class
    pos_test = np.concatenate([X_pt_w2v[idx:idx + 1] for idx in pos_index_list[k]], axis=0)
    neg_test = np.concatenate([X_pt_w2v[idx:idx + 1] for idx in neg_index_list[k]], axis=0)
    # Rows of every sample of the class that is not in the held-out fold
    pos_train = np.concatenate(
        [X_pt_w2v[idx:idx + 1] for idx in set(pos_index).difference(pos_index_list[k])], axis=0)
    neg_train = np.concatenate(
        [X_pt_w2v[idx:idx + 1] for idx in set(neg_index).difference(neg_index_list[k])], axis=0)

    # Positives are stacked before negatives in both splits
    X_train = np.concatenate((pos_train, neg_train), axis=0)
    X_test = np.concatenate((pos_test, neg_test), axis=0)
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    d = X_train.shape[1]
    model = LogisticRegression(C=1e4, fit_intercept=True)
    model.fit(X_train, y_train)

    print('Evaluation and storage of model parameters\n')
    # Column 1 of predict_proba is used as the score of the positive class
    y_score = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    # Label-based metrics use the predictions, ROC AUC uses the scores
    for fold_scores, metric in ((f1_list, f1_score),
                                (precision_list, precision_score),
                                (recall_list, recall_score)):
        fold_scores.append(metric(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep this fold's outputs
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

# Mean and standard deviation over the folds, metric by metric
print("Cross-validation evaluation of custom Word2Vec + logistic regression:")
print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(
    *[stat(values)
      for values in (precision_list, recall_list, f1_list, roc_auc_list)
      for stat in (np.mean, np.std)]))

1-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model
Evaluation and storage of model parameters

Cross-validation evaluation of custom Word2Vec + logistic regression:
Precision : 0.59899

## Custom Word2Vec + NeighLoR

In [76]:
"""Building representations for word embeddings"""
# Length of the vectors produced by `represent` when no word is found
d = 50

def represent(x):
    """Sum the custom Word2Vec vectors of the items of `x`.

    Parameters
    ----------
    x : iterable
        Items looked up one by one in `w2v_model.wv`.

    Returns
    -------
    numpy.ndarray
        Element-wise sum of the vectors that were found, or a zero vector of
        length `d` (module-level, read at call time) when none was found.

    Notes
    -----
    Only `KeyError` (item missing from the embedding) is skipped; the previous
    bare `except` hid every other failure as well. Vectors are read through
    `w2v_model.wv` because indexing the model itself is deprecated in gensim.
    """
    vectors = w2v_model.wv
    rep = []
    for w in x:
        try:
            rep.append(vectors[w])
        except KeyError:
            # Item without an embedding: leave it out of the sum
            continue
    if rep:
        return np.vstack(rep).sum(axis=0)
    return np.zeros(d)

# Step 1: per tweet, one row vector per pattern; a tweet without any pattern
# is directly given a single zero row of width d.
X_basic = [
    np.zeros((1, d)) if len(tweet) == 0
    else [represent(pattern).reshape((1, -1)) for pattern in tweet]
    for tweet in tqdm(actual_words_7)
]

# Step 2: stack the rows of each tweet into one 2-D array
X_basic = [np.vstack(X_basic[pos]) for pos in tqdm(range(len(X_basic)))]

# Step 3: any array left with zero rows is replaced by one zero row of width d
for k in tqdm(range(len(X_basic))):
    if not X_basic[k].shape[0]:
        X_basic[k] = np.zeros((1, d))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

  





HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))




In [77]:
from scipy.special import expit, logit
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

"""Building indexes"""
from scipy.special import logit

# Number of samples and number of columns of the per-tweet representation
n = len(ys)
d = X_basic[0].shape[1]

# Positions of the samples labelled 0 (negative) and 1 (positive)
neg_index = [row for row in range(n) if ys[row] == 0]
pos_index = [row for row in range(n) if ys[row] == 1]

# Fixed seed, then one in-place shuffle per class (negatives first)
np.random.seed(0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Number of folds. Within each class the first cv-1 folds hold `s` positions
# each and the last fold receives everything that is left.
cv = 10

s = len(neg_index) // cv
neg_index_list = [neg_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
neg_index_list += [neg_index[(cv - 1) * s:]]
assert sum(map(len, neg_index_list)) == len(neg_index)  # fold sizes add up to the class size

s = len(pos_index) // cv
pos_index_list = [pos_index[fold * s:(fold + 1) * s] for fold in range(cv - 1)]
pos_index_list += [pos_index[(cv - 1) * s:]]
assert sum(map(len, pos_index_list)) == len(pos_index)  # fold sizes add up to the class size

# Labels, plus the per-fold predictions and scores
Y = ys
Y_Pred, Y_Proba = [], []

# One value per fold for each metric
precision_list, recall_list, roc_auc_list, f1_list = [], [], [], []

# Fitted bias and weights of every fold
biases, weights = [], []

# Cross-validation loop: fold k is held out, the other folds are used for fitting.
for k in range(cv):
    print('{}-th fold'.format(k + 1))
    print('Splitting the data')

    # Held-out samples of each class
    pos_test = [X_basic[idx] for idx in pos_index_list[k]]
    neg_test = [X_basic[idx] for idx in neg_index_list[k]]
    # Every sample of the class that is not in the held-out fold
    pos_train = [X_basic[idx] for idx in set(pos_index).difference(pos_index_list[k])]
    neg_train = [X_basic[idx] for idx in set(neg_index).difference(neg_index_list[k])]

    # Positives are listed before negatives in both splits
    X_train, X_test = pos_train + neg_train, pos_test + neg_test
    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    print('Fitting the model')
    d = X_train[0].shape[1]
    # Zero initial weights; initial bias set to the logit of the training positive rate
    model = MaxLog(d=d,
                   fit_intercept=True,
                   init_w=np.zeros(d),
                   init_b=logit(np.mean(y_train)),
                   alpha=10.,
                   epsilon=1e-4)
    model.fit(X_train, y_train, n_iter=3000, parallel=False)

    print('Evaluation and storage of model parameters\n')
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    # Label-based metrics use the predictions, ROC AUC uses the scores
    for fold_scores, metric in ((f1_list, f1_score),
                                (precision_list, precision_score),
                                (recall_list, recall_score)):
        fold_scores.append(metric(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    # Keep this fold's outputs and fitted parameters
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))
    biases.append(model.b)
    weights.append(model.w)

# Mean and standard deviation over the folds, metric by metric
print("Precision : {} +- {}\nRecall : {} +- {}\nF1 : {} +- {}\nROC AUC : {} +- {}".format(
    *[stat(values)
      for values in (precision_list, recall_list, f1_list, roc_auc_list)
      for stat in (np.mean, np.std)]))

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))




Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=3000), HTML(value='')))


Evaluation and storage of model parameters

Precision : 0.23245150909030662 +- 0.21292063669519432
Recall : 0.21756756756756754 +- 0.2671919285116916
F1 : 0.1985917900177823 +- 0.21676004093296022
ROC AUC : 0.6127302845459981 +- 0.2395909525406043
