In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
import multiprocessing
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm_notebook as tqdm
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
import matplotlib.pyplot as plt
import re
import pickle

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
class MaxLog:
    
    def __init__(self, d, fit_intercept=True, init_w=None, init_b=None, alpha=1e-3, epsilon=1e-2):
        """Defining the dimension and fit_intercept parameters"""
        self.dim = d
        self.grad = np.zeros(self.dim)
        self.fit_intercept = fit_intercept
        
        """Initializing the weights"""
        self.w = 0.
        if init_w is None:
            self.w = np.random.normal(size=self.dim)
        elif isinstance(init_w, np.ndarray) and init_w.shape[0]==self.dim:
            self.w = init_w
        else:
            raise ValueError('Wrong dimension in intial parameters.')
        
        """Initializing the bias"""
        self.b = 0.
        if fit_intercept:
            if init_b is None:
                self.b = np.random.normal()
            elif isinstance(init_b, (float, np.float64)):
                self.b = init_b
            else:
                raise ValueError('Bias initialization must be a float type.')
                
        """Setting the learning rate"""
        if isinstance(alpha, (float, np.float64)):
            self.alpha = alpha
        else:
            raise ValueError('Learning rate must be a float type.')
        
        """Setting the stopping criterion for the gradient descent"""
        if isinstance(epsilon, (float, np.float64)):
            self.epsilon = epsilon
        else:
            raise ValueError('Tolerance must be a float type.')
        
        """Initializing error sequence"""
        self.errors = []
        
        pass
    
    
    def check_dim(self, X):
        """Computing check on dimension of observations"""
        dim_check = any(x.shape[1]!=self.dim for x in X)
        if dim_check:
            raise ValueError('Wrong dimension in observations')
        pass
    
    
    def check_attr(self, attr, message=''):
        """Function to check whether the instance has an attribute"""
        if not hasattr(self, attr):
            if message=='':
                raise ValueError("Instance doesn't have attribute '{}'".format(attr))
            else:
                raise ValueError(message)
        pass
    
    
    def predict(self, X):
        """Computing check on dimension of observations"""
        self.check_dim(X)
            
        """Computing the predictions"""
        pred = [np.max(expit(self.b + x.dot(self.w))) for x in X]
        return np.array([int(p>=0.5) for p in pred])
    
    
    def predict_proba(self, X):
        """Computing check on dimension of observations"""
        self.check_dim(X)
            
        """Computing the probability"""
        pred = [np.max(expit(self.b + x.dot(self.w))) for x in X]
        return np.array(pred)
    
    
    def fit(self, X, y, parallel=False, n_iter=None):
        """Function to fit the model
        Needs : compute the forward propagation, compute the backward propagation"""
        """Computing check on dimension of observations"""
        self.check_dim(X)
        
        """Initiating the error sequence"""
        self.errors = []
        self.grad = 10
        
        """Performing the gradient descent"""
        error = np.inf
        if n_iter==None:
            while error >= self.epsilon:
                self.forward(X, y)
                self.backward(X, y)
                error = self.grad
        else:
            for k in tqdm(range(n_iter), desc='Fitting the model'):
                self.forward(X, y)
                self.backward(X, y)
                if self.grad<=self.epsilon:
                    break
        pass
    
    
    def forward(self, X, y):
        """Performing the forward pass of the model : registering the maximum indexes, the current proba scores, and the error"""
        probas = [expit(self.b + x.dot(self.w)) for x in X]
        self.max_ind = [np.argmax(v) for v in probas]
        y_pred = np.array([np.max(v) for v in probas])
        self.eta = y_pred
        self.errors.append(- np.mean(np.array(y) * np.log(y_pred) + (1-np.array(y)) * np.log(1-y_pred)))
        pass
    
    
    def backward(self, X, y):
        grad_w, grad_b = self.gradient(X, y)
        self.w = self.w - self.alpha * grad_w
        self.b = self.b - self.alpha * grad_b
        self.grad = np.sqrt(np.sum(grad_w**2)+grad_b**2)
        pass
    
    
    def gradient(self, X, y):
        """Check on error and maximum index for gradient computation"""
        self.check_attr('errors', message='Need to compute forward pass to have error')
        self.check_attr('max_ind', message='Need to compute forward pass to have maximum index')
        self.check_attr('eta', message='Need to compute forward pass to have current estimate probabilities')
        
        """Computing the gradient for every observation"""
        mi = self.max_ind
        current_X = np.vstack([X[k][mi[k]].reshape((1,-1)) for k in range(len(X))])
        grad_w = current_X.T.dot(self.eta - np.array(y))/current_X.shape[0]
        grad_b = 0.
        if self.fit_intercept:
            grad_b = np.mean(self.eta - np.array(y))
        return grad_w, grad_b

In [3]:
"""Loading the features"""
top_verbs_20 = pickle.load(open('../features_20/top_verbs_20.p', 'rb'))
bot_verbs_20 = pickle.load(open('../features_20/bot_verbs_20.p', 'rb'))
top_adverbs_20 = pickle.load(open('../features_20/top_adverbs_20.p', 'rb'))
bot_adverbs_20 = pickle.load(open('../features_20/bot_adverbs_20.p', 'rb'))

In [4]:
"""Loading the neighborhoods and the target"""
actual_words_5 = pickle.load(open('../neighborhoods/actual_words_hashtag_free_5.p','rb'))
hashtag_words = pickle.load(open('../neighborhoods/hashtag_words.p','rb'))
data = pd.read_csv('../basic_data.csv', sep='\t')
ys = list(data.pi)

In [5]:
def represent(pattern):
    tv = [int(v in pattern) for v in top_verbs_20]
    bv = [int(v in pattern) for v in bot_verbs_20]
    ta = [int(v in pattern) for v in top_adverbs_20]
    ba = [int(v in pattern) for v in bot_adverbs_20]
    return tv+bv+ta+ba

In [6]:
X_basic = [[represent(pattern) for pattern in tweet] for tweet in tqdm(actual_words_5)]
X_hashtag = [[represent(pattern) for pattern in tweet] for tweet in tqdm(hashtag_words)]

X_full = [X_basic[k] + X_hashtag[k] for k in tqdm(range(len(ys)))]

X_basic = [np.array(tweet) for tweet in tqdm(X_basic)]
X_full = [np.array(tweet) for tweet in tqdm(X_full)]

d = len(top_adverbs_20) + len(top_verbs_20) + len(bot_adverbs_20) + len(bot_verbs_20)
for k in tqdm(range(len(X_basic))):
    if X_basic[k].shape[0]==0:
        X_basic[k] = np.zeros((1,d))
    if X_full[k].shape[0]==0:
        X_full[k] = np.zeros((1,d))

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [7]:
"""Building indexes"""

n = len(ys)
d = X_basic[0].shape[1]

"""Retrieving the positive and negative indexes"""
neg_index = [k for k in range(len(ys)) if ys[k]==0]
pos_index = [k for k in range(len(ys)) if ys[k]==1]

"""Shuffling both the positive and negative indexes"""
np.random.seed(seed=0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

"""Computing train and test index sets for the chosen number of folds"""
cv = 10

s = len(neg_index)//cv
neg_index_list = [neg_index[k*s:(k+1)*s] for k in range(cv-1)]
neg_index_list.append(neg_index[(cv-1)*s:])
assert(np.sum([len(e) for e in neg_index_list])==len(neg_index)) # Check on negative index completion

s = len(pos_index)//cv
pos_index_list = [pos_index[k*s:(k+1)*s] for k in range(cv-1)]
pos_index_list.append(pos_index[(cv-1)*s:])
assert(np.sum([len(e) for e in pos_index_list])==len(pos_index))# Check on positive index completion

In [26]:
"""Cross-validation on 20% features, without hashtags"""
from scipy.special import logit, expit

"""Initializing the list of outputs, predictions and probabilities to computed CV-ly"""
Y = ys
Y_Pred = []
Y_Proba = []

"""Intializing the list of cross-validated features"""
precision_list = []
recall_list = []
roc_auc_list = []
f1_list = []

"""Storing coefficients and biases for stability evaluation"""
biases = []
weights = []

"""Performing the cross-validation of the model using the features of interest
A caveat to be mentioned is that the feature selection was performed on the whole dataset, which may be a little biased towards choosing the right features.
This effect will be neglected during this test."""
for k in range(cv):
    """Splitting the data into train and test"""
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    neg_test = [X_basic[i] for i in neg_index_list[k]]
    pos_test = [X_basic[i] for i in pos_index_list[k]]
    neg_train = [X_basic[i] for i in set(neg_index).difference(neg_index_list[k])]
    pos_train = [X_basic[i] for i in set(pos_index).difference(pos_index_list[k])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    """Fitting the model"""
    print('Fitting the model')
    d = X_train[0].shape[1]
    model = MaxLog(d=d, fit_intercept=True, alpha=10., epsilon=1e-4, init_b=logit(np.mean(y_train)), init_w=np.zeros(d))
    model.fit(X_train, y_train, n_iter=4000, parallel=False)

    print('Evaluation and storage of model parameters\n')
    """Outputting the predictions and the probability scores"""
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    """Computing the various metrics"""
    f1_list.append(f1_score(y_test, y_pred))
    precision_list.append(precision_score(y_test, y_pred))
    recall_list.append(recall_score(y_test, y_pred))
    roc_auc_list.append(roc_auc_score(y_test, y_score))

    """Adding predictions and scores to computed global cross-validated performance after the end of the process"""
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

    """Storing the model's coefficients"""
    biases.append(model.b)
    weights.append(model.w)

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters



In [27]:
print('Average precision : {}'.format(np.mean(precision_list)))
print('Average recall : {}'.format(np.mean(recall_list)))
print('Average F1 score : {}'.format(np.mean(f1_list)))
print('Average ROC score : {}'.format(np.mean(roc_auc_list)))

Average precision : 0.8146451048474352
Average recall : 0.5867777326290124
Average F1 score : 0.6811499928947253
Average ROC score : 0.9032620218140874


In [28]:
"""Cross-validation on 5% features, with hashtags"""
from scipy.special import logit

"""Initializing the list of outputs, predictions and probabilities to computed CV-ly"""
Y = ys
Y_Pred = []
Y_Proba = []

"""Intializing the list of cross-validated features"""
precision_list_full = []
recall_list_full = []
roc_auc_list_full = []
f1_list_full = []

"""Storing coefficients and biases for stability evaluation"""
biases_full = []
weights_full = []

"""Performing the cross-validation of the model using the features of interest
A caveat to be mentioned is that the feature selection was performed on the whole dataset, which may be a little biased towards choosing the right features.
This effect will be neglected during this test."""
for k in range(cv):
    """Splitting the data into train and test"""
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    neg_test = [X_full[i] for i in neg_index_list[k]]
    pos_test = [X_full[i] for i in pos_index_list[k]]
    neg_train = [X_full[i] for i in set(neg_index).difference(neg_index_list[k])]
    pos_train = [X_full[i] for i in set(pos_index).difference(pos_index_list[k])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    """Fitting the model"""
    print('Fitting the model')
    d = X_train[0].shape[1]
    model = MaxLog(d=d, fit_intercept=True, alpha=10., epsilon=1e-4, init_b=logit(np.mean(y_train)), init_w=np.zeros(d))
    model.fit(X_train, y_train, n_iter=4000, parallel=False)

    print('Evaluation and storage of model parameters\n')
    """Outputting the predictions and the probability scores"""
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    """Computing the various metrics"""
    f1_list_full.append(f1_score(y_test, y_pred))
    precision_list_full.append(precision_score(y_test, y_pred))
    recall_list_full.append(recall_score(y_test, y_pred))
    roc_auc_list_full.append(roc_auc_score(y_test, y_score))

    """Adding predictions and scores to computed global cross-validated performance after the end of the process"""
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

    """Storing the model's coefficients"""
    biases_full.append(model.b)
    weights_full.append(model.w)
    
print('Average precision : {}'.format(np.mean(precision_list_full)))
print('Average recall : {}'.format(np.mean(recall_list_full)))
print('Average F1 score : {}'.format(np.mean(f1_list_full)))
print('Average ROC score : {}'.format(np.mean(roc_auc_list_full)))

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, description='Fitting the model', max=4000), HTML(value='')))

Evaluation and storage of model parameters

Average precision : 0.83263319828413
Average recall : 0.737659488013003
Average F1 score : 0.7817010148990152
Average ROC score : 0.9650264821003756


In [8]:
"""Benchmarking against plain logistic regression and Naive Bayes"""
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

"""Initializing the list of outputs, predictions and probabilities to computed CV-ly"""
Y = ys
Y_Pred = []
Y_Proba = []

"""Intializing the list of cross-validated metrics"""
precision_log = []
recall_log = []
roc_log = []
f1_log = []

precision_mnb = []
recall_mnb = []
roc_mnb = []
f1_mnb = []

"""Storing coefficients and biases for stability evaluation"""
biases_full = []
weights_full = []

"""Performing the cross-validation of the model using the features of interest
A caveat to be mentioned is that the feature selection was performed on the whole dataset, which may be a little biased towards choosing the right features.
This effect will be neglected during this test."""
for k in range(cv):
    """Splitting the data into train and test"""
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    neg_test = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in neg_index_list[k]]
    pos_test = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in pos_index_list[k]]
    neg_train = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in set(neg_index).difference(neg_index_list[k])]
    pos_train = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in set(pos_index).difference(pos_index_list[k])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)
    
    logistic = LogisticRegression(C=1e5, fit_intercept=True)
    logistic.fit(X_train, y_train)
    y_pred_log = logistic.predict(X_test)
    y_score_log = logistic.predict_proba(X_test)[:,1]
    precision_log.append(precision_score(y_test, y_pred_log))
    recall_log.append(recall_score(y_test, y_pred_log))
    f1_log.append(f1_score(y_test, y_pred_log))
    roc_log.append(roc_auc_score(y_test, y_score_log))
    
    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    y_pred_mnb = mnb.predict(X_test)
    y_score_mnb = mnb.predict_proba(X_test)[:,1]
    precision_mnb.append(precision_score(y_test, y_pred_mnb))
    recall_mnb.append(recall_score(y_test, y_pred_mnb))
    f1_mnb.append(f1_score(y_test, y_pred_mnb))
    roc_mnb.append(roc_auc_score(y_test, y_score_mnb))

1-th fold
Splitting the data
2-th fold
Splitting the data
3-th fold
Splitting the data
4-th fold
Splitting the data
5-th fold
Splitting the data
6-th fold
Splitting the data
7-th fold
Splitting the data
8-th fold
Splitting the data
9-th fold
Splitting the data
10-th fold
Splitting the data


In [9]:
"""Results for plain logisitic regression"""
print('Average precision : {}'.format(np.mean(precision_log)))
print('Average recall : {}'.format(np.mean(recall_log)))
print('Average F1 score : {}'.format(np.mean(f1_log)))
print('Average ROC AUC : {}'.format(np.mean(roc_log)))

Average precision : 0.7937333697627043
Average recall : 0.668955709061357
Average F1 score : 0.725195659533626
Average ROC AUC : 0.9556700765355094


In [10]:
"""Results for plain Naive Bayes"""
print('Average precision : {}'.format(np.mean(precision_mnb)))
print('Average recall : {}'.format(np.mean(recall_mnb)))
print('Average F1 score : {}'.format(np.mean(f1_mnb)))
print('Average ROC AUC : {}'.format(np.mean(roc_mnb)))

Average precision : 0.80758090544623
Average recall : 0.5548476229175131
Average F1 score : 0.6569602171630267
Average ROC AUC : 0.9128124049343714


In [11]:
"""Benchmarking against L1 logistic regression"""
import copy

"""Initializing the list of outputs, predictions and probabilities to computed CV-ly"""
Y = ys
Y_Pred = []
Y_Proba = []

"""Intializing the list of cross-validated metrics"""
precision_l1 = []
recall_l1 = []
roc_l1 = []
f1_l1 = []

lamdas = np.arange(-3,3.5,0.5)

"""Performing the cross-validation of the model using the features of interest
A caveat to be mentioned is that the feature selection was performed on the whole dataset, which may be a little biased towards choosing the right features.
This effect will be neglected during this test."""
for k in range(cv):
    """Splitting the data into train and test"""
    print('{}-th fold'.format(k+1))
    print('Splitting the data')
    neg_test = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in neg_index_list[k]]
    pos_test = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in pos_index_list[k]]
    neg_train = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in set(neg_index).difference(neg_index_list[k])]
    pos_train = [np.minimum(np.sum(X_full[i] ,axis=0), np.ones(d)) for i in set(pos_index).difference(pos_index_list[k])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)
    
    f1_reg = []
    for C in 10**(-lamdas):
        logistic = LogisticRegression(C=C, penalty='l1', fit_intercept=True)
        logistic.fit(X_train, y_train)
        f1_reg.append(f1_score(y_train, logistic.predict(X_train)))
        
    C_max = 10**(-lamdas[np.argmax(f1_reg)])
    logistic = LogisticRegression(C=C_max, penalty='l1', fit_intercept=True)
    logistic.fit(X_train, y_train)
    y_pred = logistic.predict(X_test)
    y_score = logistic.predict_proba(X_test)[:,1]
    
    precision_l1.append(precision_score(y_test, y_pred))
    recall_l1.append(recall_score(y_test, y_pred))
    f1_l1.append(f1_score(y_test, y_pred))
    roc_l1.append(roc_auc_score(y_test, y_score))

1-th fold
Splitting the data


  'precision', 'predicted', average, warn_for)


2-th fold
Splitting the data
3-th fold
Splitting the data
4-th fold
Splitting the data
5-th fold
Splitting the data
6-th fold
Splitting the data
7-th fold
Splitting the data
8-th fold
Splitting the data
9-th fold
Splitting the data
10-th fold
Splitting the data


In [12]:
"""Computing average performance over best model"""
print('Average precision : {}'.format(np.mean(precision_l1)))
print('Average recall : {}'.format(np.mean(recall_l1)))
print('Average F1 score : {}'.format(np.mean(f1_l1)))
print('Average ROC AUC : {}'.format(np.mean(roc_l1)))

Average precision : 0.7911679840866783
Average recall : 0.6662819991873221
Average F1 score : 0.7224706724753356
Average ROC AUC : 0.957470212811726
