In [8]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import pickle
import copy
import re

The processed data contains the pre-processed text in which we search for patterns. `actual_words_5` contains the 5-word neighborhoods; it is redundant as far as the text itself is concerned, but it also contains the hashtag decompositions, which is why we look into it as well.

In [2]:
"""Loading the tagged data and the 5-words neighborhoods """
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load these files if they come from a trusted source.
# The files are opened with context managers so the handles are closed as soon
# as the objects have been read.
with open('./processed_BA.b', 'rb') as processed_file:
    data = pickle.load(processed_file)

# The neighborhoods pickle unpacks into three objects; only the first one
# (the 5-word neighborhoods) is used in this notebook.
with open('./neighborhoods_ba.b', 'rb') as neighborhoods_file:
    actual_words_5, _, _ = pickle.load(neighborhoods_file)

In [9]:
"""Preparing the data"""
# Lower-case every tweet's text, then split it into word tokens; the token
# lists are stored in a new 'tokens' column next to the original text.
data['tokens'] = data['text'].apply(lambda tweet: word_tokenize(tweet.lower()))

In [15]:
"""Pattern flagging"""

"""Structuring the words into stand alone identifiers, negative paired identifiers, negative indicators, BA indicators, and other fluff"""
# Words that flag a text on their own (several spellings of "boycott" plus "avoid").
stand_alone = [
    'boycott',
    'bycott',
    'boycot',
    'bycot',
    'avoid',
]

# Verbs that only count as a flag when a negation word accompanies them.
neg_paired = [
    'fly',
    'use',
    'travel',
    'choose',
    'book',
    'recommend',
]

# Negation words (written without apostrophes) that pair with the verbs above.
neg_ind = [
    'never',
    'not',
    'dont',
    'shouldnt',
    'no',
    'wont',
    'wouldnt',
]
# Single shared lemmatizer instance; the labeling loop calls
# lmtzr.lemmatize(token, 'v') so inflected verb forms can be compared against
# the keyword lists.
lmtzr = WordNetLemmatizer()

"""Function to pick pattern"""
def check_pattern(pat):
    """Report which keyword patterns occur in one group of words.

    Args:
        pat: a collection of words (for example one word neighborhood).

    Returns:
        A list ``[sa_ind, np_ind, never_again]`` of 0/1 integers:
        ``sa_ind`` is 1 when any word is in ``stand_alone``;
        ``np_ind`` is 1 when some word is in ``neg_paired`` and some word is
        in ``neg_ind``;
        ``never_again`` is 1 when both 'never' and 'again' are in ``pat``.
    """
    words = set(pat)
    has_stand_alone = not words.isdisjoint(stand_alone)
    has_paired_verb = not words.isdisjoint(neg_paired)
    has_negation = not words.isdisjoint(neg_ind)
    has_never_again = 'never' in pat and 'again' in pat
    return [
        int(has_stand_alone),
        int(has_paired_verb and has_negation),
        int(has_never_again),
    ]

# Ground-truth labels taken from the `pi` column, in row order.
ys = list(data.pi)

# Rule-based prediction for every row: 1 when a rule fires, 0 otherwise.
pred_2 = []
for k in tqdm(range(data.shape[0]), desc='Going through the neighborhoods for 2 conditions'):
    label = False

    # Pass 1: scan the 5-word neighborhoods of row k. A neighborhood flags the
    # row when it contains a stand-alone word, or a paired verb together with
    # a negation word. The 'never ... again' indicator is not used here.
    for pat in actual_words_5[k]:
        sa_ind, np_ind, _never_again = check_pattern(pat)
        if sa_ind == 1 or np_ind == 1:
            label = True
            break

    # Pass 2: only when no neighborhood matched, fall back to the row's tokens.
    if not label:
        # Fetch the token list once per row instead of on every inner iteration.
        tokens = data.tokens.iloc[k]
        for i, token in enumerate(tokens):
            # A token whose verb lemma is a stand-alone word flags the row.
            if lmtzr.lemmatize(token, 'v') in stand_alone:
                label = True
                break
            # A negation word flags the row when a paired verb occurs within
            # the 4-token window that starts at the negation word itself.
            if token in neg_ind and any(
                lmtzr.lemmatize(nearby, 'v') in neg_paired
                for nearby in tokens[i:i + 4]
            ):
                label = True
                break

    # Exactly one prediction per row: 1 if any rule fired, otherwise 0.
    pred_2.append(int(label))

# Sanity check: the metrics below require one prediction per label.
assert len(pred_2) == len(ys)

print('Using 2 conditions : negative PI verb, negation word + positive PI verb with hashtags')
print('F1 score obtained through the fully supervised method : {}'.format(f1_score(y_true=ys, y_pred=pred_2)))
print('Precision obtained through the fully supervised method : {}'.format(precision_score(y_true=ys, y_pred=pred_2)))
print('Recall score obtained through the fully supervised method : {}'.format(recall_score(y_true=ys, y_pred=pred_2)))

HBox(children=(IntProgress(value=0, description='Going through the neighborhoods for 2 conditions', max=11684)…


Using 2 conditions : negative PI verb, negation word + positive PI verb with hashtags
F1 score obtained through the fully supervised method : 0.6442775077330977
Precision obtained through the fully supervised method : 0.6246786632390745
Recall score obtained through the fully supervised method : 0.6651459854014599
