In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from scipy.special import expit, logit
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score
from tqdm import tqdm_notebook as tqdm

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [None]:
class MaxLog:
    """Max-pooled logistic regression over bags of feature rows.

    Every observation is a 2-D array with ``d`` columns. Each row is scored
    with ``expit(b + row . w)`` and the observation's probability is the
    maximum of its row scores. Training is full-batch gradient descent on the
    mean binary cross-entropy, where the gradient of each observation is taken
    through its arg-max row only.

    NOTE: the class expects ``np`` (numpy) and ``expit`` (scipy.special) to be
    available in the enclosing namespace, plus a ``tqdm`` callable when
    ``fit`` is called with ``n_iter``.
    """

    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logs so that
    # a saturated sigmoid cannot produce an infinite or NaN loss value.
    _EPS = 1e-15

    @staticmethod
    def _is_real(value):
        """Return True for Python/numpy real scalars (booleans excluded)."""
        return (isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, bool))

    def __init__(self, d, fit_intercept=True, init_w=None, init_b=None, alpha=1e-3, epsilon=1e-2):
        """Set up the model parameters and hyper-parameters.

        Parameters
        ----------
        d : int
            Number of columns expected in every observation.
        fit_intercept : bool
            When False the bias stays at 0 and is never updated.
        init_w : np.ndarray or None
            Initial weights whose first dimension equals ``d``; drawn from a
            standard normal when None. The array is copied, so later updates
            of the model never modify the caller's array.
        init_b : real scalar or None
            Initial bias; drawn from a standard normal when None. Ignored
            when ``fit_intercept`` is False.
        alpha : real scalar
            Learning rate of the gradient descent.
        epsilon : real scalar
            Gradient-norm threshold used as stopping criterion in ``fit``.

        Raises
        ------
        ValueError
            If ``init_w`` has the wrong type/dimension, or if ``init_b``,
            ``alpha`` or ``epsilon`` is not a real scalar.
        """
        self.dim = d
        self.grad = np.zeros(self.dim)
        self.fit_intercept = fit_intercept

        # Weights
        if init_w is None:
            self.w = np.random.normal(size=self.dim)
        elif isinstance(init_w, np.ndarray) and init_w.ndim >= 1 and init_w.shape[0] == self.dim:
            # astype returns a fresh float array: no aliasing with the caller.
            self.w = init_w.astype(float)
        else:
            raise ValueError('Wrong dimension in intial parameters.')

        # Bias
        self.b = 0.
        if fit_intercept:
            if init_b is None:
                self.b = np.random.normal()
            elif self._is_real(init_b):
                self.b = float(init_b)
            else:
                raise ValueError('Bias initialization must be a float type.')

        # Learning rate (integers such as ``alpha=10`` are accepted)
        if not self._is_real(alpha):
            raise ValueError('Learning rate must be a float type.')
        self.alpha = float(alpha)

        # Stopping criterion of the gradient descent
        if not self._is_real(epsilon):
            raise ValueError('Tolerance must be a float type.')
        self.epsilon = float(epsilon)

        # Loss value recorded at every forward pass
        self.errors = []

    def check_dim(self, X):
        """Raise ValueError unless every observation in X is 2-D with ``self.dim`` columns."""
        if any(x.ndim != 2 or x.shape[1] != self.dim for x in X):
            raise ValueError('Wrong dimension in observations')

    def check_attr(self, attr, message=''):
        """Raise ValueError (with ``message`` when given) if the instance lacks ``attr``."""
        if not hasattr(self, attr):
            if message == '':
                raise ValueError("Instance doesn't have attribute '{}'".format(attr))
            raise ValueError(message)

    def predict(self, X):
        """Return a 0/1 label per observation, thresholding the probability at 0.5."""
        return np.array([int(p >= 0.5) for p in self.predict_proba(X)])

    def predict_proba(self, X):
        """Return, for each observation, the maximum sigmoid score over its rows."""
        self.check_dim(X)
        return np.array([np.max(expit(self.b + x.dot(self.w))) for x in X])

    def fit(self, X, y, parallel=False, n_iter=None):
        """Fit the model by alternating forward and backward passes.

        Parameters
        ----------
        X : sequence of 2-D arrays with ``self.dim`` columns.
        y : sequence of 0/1 labels aligned with X.
        parallel : currently unused.
        n_iter : int or None
            Maximum number of iterations. When None, iterate until the
            gradient norm drops below ``self.epsilon`` (no iteration cap).
        """
        self.check_dim(X)

        # Restart the loss history; the gradient norm is unknown until the
        # first backward pass.
        self.errors = []
        self.grad = np.inf

        if n_iter is None:
            error = np.inf
            while error >= self.epsilon:
                self.forward(X, y)
                self.backward(X, y)
                error = self.grad
        else:
            for _ in tqdm(range(n_iter), desc='Fitting the model'):
                self.forward(X, y)
                self.backward(X, y)
                if self.grad <= self.epsilon:
                    break

    def forward(self, X, y):
        """Forward pass: store arg-max rows, current probabilities and the loss.

        Sets ``self.max_ind`` (index of the best-scoring row per observation),
        ``self.eta`` (the corresponding probabilities) and appends the mean
        binary cross-entropy to ``self.errors``.
        """
        probas = [expit(self.b + x.dot(self.w)) for x in X]
        self.max_ind = [np.argmax(v) for v in probas]
        y_pred = np.array([np.max(v) for v in probas])
        self.eta = y_pred

        # Clip only for the loss so that log(0) never occurs; self.eta keeps
        # the raw probabilities used by the gradient.
        targets = np.array(y)
        clipped = np.clip(y_pred, self._EPS, 1 - self._EPS)
        self.errors.append(-np.mean(targets * np.log(clipped) + (1 - targets) * np.log(1 - clipped)))

    def backward(self, X, y):
        """Take one gradient step and store the gradient norm in ``self.grad``."""
        grad_w, grad_b = self.gradient(X, y)
        self.w = self.w - self.alpha * grad_w
        self.b = self.b - self.alpha * grad_b
        self.grad = np.sqrt(np.sum(grad_w**2) + grad_b**2)

    def gradient(self, X, y):
        """Return ``(grad_w, grad_b)`` of the mean loss at the current parameters.

        Requires a previous call to ``forward``. Only the arg-max row of each
        observation contributes to the weight gradient. ``grad_b`` is 0 when
        the intercept is not fitted.
        """
        self.check_attr('errors', message='Need to compute forward pass to have error')
        self.check_attr('max_ind', message='Need to compute forward pass to have maximum index')
        self.check_attr('eta', message='Need to compute forward pass to have current estimate probabilities')

        # Stack the arg-max row of every observation into one design matrix.
        mi = self.max_ind
        current_X = np.vstack([X[k][mi[k]].reshape((1, -1)) for k in range(len(X))])
        residual = self.eta - np.array(y)
        grad_w = current_X.T.dot(residual) / current_X.shape[0]
        grad_b = 0.
        if self.fit_intercept:
            grad_b = np.mean(residual)
        return grad_w, grad_b

In [11]:
"""Examining bigrams that are highly correlated with the negative PI class"""
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# produced by a trusted source.
with open('./label_tag_data.p', 'rb') as fh:
    data = pickle.load(fh)
lmtzr = WordNetLemmatizer()

# Only bigrams whose two tokens both carry one of these tags are counted.
KEPT_TAGS = ('P', 'V', 'R', 'A', 'N', '@', 'D')


def _normalise(word):
    """Strip apostrophes and hyphens from `word`, then lemmatize it as a verb."""
    return lmtzr.lemmatize(word.replace("'", '').replace('-', ''), 'v')


bigram_count = {}  # bigram -> number of occurrences
bigram_class = {}  # bigram -> sum of the `pi` labels of the rows it occurs in

for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    label = data.pi.iloc[k]
    # Plain lists avoid repeated (slow) positional lookups inside the inner loop.
    tags = local.tag.tolist()
    words = local.word.tolist()
    for i in range(len(tags) - 1):
        if tags[i] in KEPT_TAGS and tags[i + 1] in KEPT_TAGS:
            # Build the key once instead of re-lemmatizing for every dict access.
            key = '_'.join(_normalise(w) for w in words[i:i + 2])
            bigram_count[key] = bigram_count.get(key, 0) + 1
            bigram_class[key] = bigram_class.get(key, 0) + label

# Computing conditional probabilities
bigram_count = pd.Series(bigram_count)
bigram_class = pd.Series(bigram_class)
cond_proba = bigram_class / bigram_count
proba_df = pd.DataFrame([bigram_count, cond_proba], index=['count', 'proba']).T
# score_1 = proportion minus one binomial standard error, which penalises
# bigrams observed only a few times.
proba_df['score_1'] = cond_proba - np.sqrt(cond_proba * (1 - cond_proba) / bigram_count)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [14]:
# Show the 100 best-scoring bigrams among those seen at least 10 times.
(
    proba_df
    .loc[proba_df['count'] >= 10]
    .sort_values(by='score_1', ascending=False)
    .head(100)
)

Unnamed: 0,count,proba,score_1
be_pst,15.0,1.000000,1.000000
daysno_resolution,13.0,1.000000,1.000000
resolution_from,14.0,1.000000,1.000000
seek_further,10.0,1.000000,1.000000
further_legal,10.0,1.000000,1.000000
in_vacation,17.0,1.000000,1.000000
baggage_in,16.0,1.000000,1.000000
@alex_cruz_tweet,17.0,1.000000,1.000000
department_of,16.0,1.000000,1.000000
week_ruin,14.0,1.000000,1.000000


In [12]:
"""Loading the relevant features"""
def _load_pickle(path):
    """Return the object unpickled from `path`; the file is closed on exit.

    NOTE(security): pickle.load can execute arbitrary code -- only load files
    produced by a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


top_verbs_10 = _load_pickle('./features_10/top_verbs_10.p')
bot_verbs_10 = _load_pickle('./features_10/bot_verbs_10.p')
top_adverbs_10 = _load_pickle('./features_10/top_adverbs_10.p')
bot_adverbs_10 = _load_pickle('./features_10/bot_adverbs_10.p')
hashtag_words = _load_pickle('./hashtags/hashtag_words.p')

actual_words_5 = _load_pickle('./neighborhoods/actual_words_hashtag_free_5.p')
# Labels, in dataset order
ys = list(data.pi)

In [13]:
"""Building bigrams neighborhoods"""
# For every observation, turn each word window into its list of consecutive
# word pairs joined by '_'.
bigram_neighborhoods = []
for neighborhood in tqdm(actual_words_5):
    pairs_per_window = []
    for window in neighborhood:
        pairs_per_window.append(['_'.join(window[j:j + 2]) for j in range(len(window) - 1)])
    bigram_neighborhoods.append(pairs_per_window)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [15]:
"""Building one-hot encodings for bigram neighborhoods"""
# Feature vocabulary: the 100 best-scoring bigrams seen at least 10 times.
bigram_feat = list(
    proba_df[proba_df['count'] >= 10]
    .sort_values('score_1', ascending=False)
    .index[:100]
)

# One indicator row per window; observations without any window get a single
# all-zero row so that every observation has at least one row.
ohe_bn = []
for windows in tqdm(bigram_neighborhoods):
    encoded = np.array([[int(feat in window) for feat in bigram_feat] for window in windows])
    if encoded.shape[0] == 0:
        encoded = np.zeros((1, len(bigram_feat)))
    ohe_bn.append(encoded)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [16]:
"""Building one-hot encodings for normal neighborhoods"""
# Word vocabulary: concatenation of the four loaded verb/adverb lists.
features = top_verbs_10 + bot_verbs_10 + top_adverbs_10 + bot_adverbs_10

# One indicator row per window; observations without any window get a single
# all-zero row.
ohe_norm = []
for windows in tqdm(actual_words_5):
    encoded = np.array([[int(feat in window) for feat in features] for window in windows])
    if encoded.shape[0] == 0:
        encoded = np.zeros((1, len(features)))
    ohe_norm.append(encoded)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [17]:
"""Building one-hot encodings for hashtag neighborhoods"""
# Same word vocabulary as for the plain neighborhoods.
features = top_verbs_10 + bot_verbs_10 + top_adverbs_10 + bot_adverbs_10


def _encode_or_blank(windows):
    """Indicator matrix of `features` per window, or one zero row if there is no window."""
    matrix = np.array([[int(feat in window) for feat in features] for window in windows])
    return matrix if matrix.shape[0] != 0 else np.zeros((1, len(features)))


ohe_hash = [_encode_or_blank(windows) for windows in tqdm(hashtag_words)]

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [18]:
"""Building bigrams' hashtags' neighborhoods"""
# For every observation, turn each hashtag word window into its list of
# consecutive word pairs joined by '_'.
# The loop runs over `hashtag_words` itself (it used to be bounded by the
# length of `actual_words_5`, an unrelated list).
bigram_hash = []
for hashtag_windows in tqdm(hashtag_words):
    bigram_hash.append(
        [['_'.join(V[i:i + 2]) for i in range(len(V) - 1)] for V in hashtag_windows]
    )

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [19]:
"""Building one-hot encodings for bigram hashtags"""
# Feature vocabulary: the 100 best-scoring bigrams seen at least 10 times.
bigram_feat = list(proba_df[proba_df['count'] >= 10].sort_values('score_1', ascending=False).index[:100])

# One indicator row per hashtag window. The empty-observation fallback is
# applied while building the list, so it always covers exactly the entries of
# `ohe_bn_hash` (the previous fallback loop was bounded by len(ohe_bn)).
ohe_bn_hash = []
for windows in tqdm(bigram_hash):
    encoded = np.array([[int(w in V) for w in bigram_feat] for V in windows])
    if encoded.shape[0] == 0:
        encoded = np.zeros((1, len(bigram_feat)))
    ohe_bn_hash.append(encoded)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [20]:
"""Building full representation using both neighborhoods and bigram neighborhoods"""
# Per observation: word and bigram encodings side by side, with the plain
# neighborhood rows stacked on top of the hashtag neighborhood rows.
X_full = []
for k in tqdm(range(data.shape[0])):
    word_rows = np.hstack([ohe_norm[k], ohe_bn[k]])
    hashtag_rows = np.hstack([ohe_hash[k], ohe_bn_hash[k]])
    X_full.append(np.vstack([word_rows, hashtag_rows]))

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [21]:
"""Elementary check on dimensions"""
# Every observation must have the same number of columns as the first one.
# (The previous form asserted a generator object, which is always truthy, so
# the check could never fail.)
n_features = X_full[0].shape[1]
assert all(x.shape[1] == n_features for x in X_full), 'Inconsistent feature dimension in X_full'
print(n_features)

276


In [22]:
"""Building indexes"""

n = len(ys)
d = X_full[0].shape[1]

# Positions of the negative (0) and positive (1) observations
neg_index = [i for i, label in enumerate(ys) if label == 0]
pos_index = [i for i, label in enumerate(ys) if label == 1]

# Shuffle both index lists in place, negatives first, from a fixed seed
np.random.seed(seed=0)
np.random.shuffle(neg_index)
np.random.shuffle(pos_index)

# Stratified folds: cv-1 chunks of equal size, the last one takes the remainder
cv = 10

s = len(neg_index) // cv
neg_index_list = []
for fold in range(cv - 1):
    neg_index_list.append(neg_index[fold * s:(fold + 1) * s])
neg_index_list.append(neg_index[(cv - 1) * s:])
# Every negative index must land in exactly one fold
assert np.sum([len(chunk) for chunk in neg_index_list]) == len(neg_index)

s = len(pos_index) // cv
pos_index_list = []
for fold in range(cv - 1):
    pos_index_list.append(pos_index[fold * s:(fold + 1) * s])
pos_index_list.append(pos_index[(cv - 1) * s:])
# Every positive index must land in exactly one fold
assert np.sum([len(chunk) for chunk in pos_index_list]) == len(pos_index)

In [23]:
"""Cross-validation on 10% features, with hashtags"""
# `logit` comes from the notebook's top import cell (scipy.special).

# Optimisation settings shared by every fold
alpha = 10          # learning rate
TOLERANCE = 1e-4    # stop once the gradient norm is at or below this value
MAX_ITER = 2000     # maximum number of gradient steps per fold

# Outputs, predictions and probabilities collected fold by fold.
# NOTE: Y keeps the dataset order while Y_Pred / Y_Proba are in fold order;
# use Y_True (same fold order) when computing global metrics.
Y = ys
Y_True = []
Y_Pred = []
Y_Proba = []

# Per-fold metrics
precision_list_full = []
recall_list_full = []
roc_auc_list_full = []
f1_list_full = []

# Coefficients and biases of each fold, kept for stability evaluation
biases_full = []
weights_full = []

# Cross-validation of the model using the features of interest.
# Caveat: the feature selection was performed on the whole dataset, which may
# be a little biased towards choosing the right features. This effect is
# neglected during this test.
for fold in range(cv):
    # Splitting the data into train and test
    print('{}-th fold'.format(fold+1))
    print('Splitting the data')
    neg_test = [X_full[i] for i in neg_index_list[fold]]
    pos_test = [X_full[i] for i in pos_index_list[fold]]
    neg_train = [X_full[i] for i in set(neg_index).difference(neg_index_list[fold])]
    pos_train = [X_full[i] for i in set(pos_index).difference(pos_index_list[fold])]

    X_train = pos_train + neg_train
    X_test = pos_test + neg_test

    y_train = [1] * len(pos_train) + [0] * len(neg_train)
    y_test = [1] * len(pos_test) + [0] * len(neg_test)

    # Fitting the model: zero weights, bias at the log-odds of the training
    # positive rate. MaxLog.fit runs the same forward/gradient-step loop that
    # used to be spelled out here, with a single learning rate and tolerance,
    # and no longer reuses the fold variable as inner loop index.
    print('Fitting the model')
    d = X_train[0].shape[1]
    model = MaxLog(d=d, fit_intercept=True, alpha=float(alpha), epsilon=TOLERANCE,
                   init_b=logit(np.mean(y_train)), init_w=np.zeros(d))
    model.fit(X_train, y_train, n_iter=MAX_ITER)

    print('Evaluation and storage of model parameters\n')
    # Probability scores and thresholded predictions on the held-out fold
    y_score = model.predict_proba(X_test)
    y_pred = model.predict(X_test)

    # Computing the various metrics
    f1_list_full.append(f1_score(y_test, y_pred))
    precision_list_full.append(precision_score(y_test, y_pred))
    recall_list_full.append(recall_score(y_test, y_pred))
    roc_auc_list_full.append(roc_auc_score(y_test, y_score))

    # Keep labels, predictions and scores to compute global cross-validated
    # performance after the end of the process
    Y_True.append(list(y_test))
    Y_Pred.append(list(y_pred))
    Y_Proba.append(list(y_score))

    # Storing the model's coefficients
    biases_full.append(model.b)
    weights_full.append(model.w)

print('Average precision : {}'.format(np.mean(precision_list_full)))
print('Average recall : {}'.format(np.mean(recall_list_full)))
print('Average F1 score : {}'.format(np.mean(f1_list_full)))
print('Average ROC score : {}'.format(np.mean(roc_auc_list_full)))

1-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

2-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

3-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

4-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

5-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

6-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

7-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

8-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

9-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

10-th fold
Splitting the data
Fitting the model


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))


Evaluation and storage of model parameters

Average precision : 0.878211600819666
Average recall : 0.7841934173100366
Average F1 score : 0.8278022027707685
Average ROC score : 0.9697050539953643
