In [1]:
import copy
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
from ark_tweet_nlp import CMUTweetTagger
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from tqdm import tqdm_notebook as tqdm

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
"""Loading the tagged data and the 5-words neighborhoods """
# Open the pickle inside a context manager so the file handle is closed as
# soon as loading finishes (the bare open() call left it to the garbage
# collector).
# NOTE: pickle.load can execute arbitrary code while deserialising; only use
# it on files produced by a trusted step of this pipeline.
with open('./label_tag_data.p', 'rb') as data_file:
    data = pickle.load(data_file)

In [3]:
"""Feature extraction"""
ys = list(data.pi)
lmtzr = WordNetLemmatizer()

# Per-word sum of the labels of the tweets containing the word, per role.
verb_cond = {}
adverb_cond = {}
# Per-word number of tweets containing the word.
# NOTE(review): this single dict is shared by verbs and adverbs. A word that
# was first counted in one role gets its count reset to 1 the first time it
# is met in the other role, and afterwards both roles increment the same
# entry. Separate counters per role look like the intent -- confirm and
# update the probability cell accordingly.
counts = {}


def _is_plain(word):
    """Return True when the token has no '#' and no digit characters."""
    return '#' not in word and not any(ch.isdigit() for ch in word)


def _normalise(word):
    """Lower-case the token and strip hyphens and apostrophes."""
    return word.lower().replace('-', '').replace("'", '')


def _tally(words, label, label_sums):
    """Add one tweet's label to `label_sums` and bump `counts` for each word.

    A word that is new to `label_sums` starts at count 1 and sum `label`,
    even if `counts` already holds an entry for it from the other role.
    """
    for word in set(words):
        if word in label_sums:
            counts[word] += 1
            label_sums[word] += label
        else:
            counts[word] = 1
            label_sums[word] = label


# Loop over the tweets
for k, y in enumerate(tqdm(ys)):
    tagged = data.tag_df.iloc[k]
    plain = tagged.word.apply(_is_plain)

    # Finding and lemmatizing verbs (tag 'V')
    verb_rows = tagged[(tagged.tag == 'V') & plain]
    local_verbs = list(
        verb_rows.word
        .apply(lambda w: lmtzr.lemmatize(_normalise(w), 'v'))
        .drop_duplicates()
    )

    # Finding adverbs (tag 'R'); these are normalised but not lemmatized
    adverb_rows = tagged[(tagged.tag == 'R') & plain]
    local_adverbs = list(adverb_rows.word.apply(_normalise).drop_duplicates())

    # Counting and label counting: verbs first, then adverbs
    _tally(local_verbs, y, verb_cond)
    _tally(local_adverbs, y, adverb_cond)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))




In [4]:
"""Computing the probabilities"""
counts = pd.Series(counts)


def _probability_table(label_sums):
    """Build a table of conditional probability and count, one row per word.

    `label_sums` maps each word to the summed labels of its tweets. The sum
    is divided by the word's entry in `counts`, the result is sorted in
    descending order, and the matching counts are attached as a second column.
    """
    sums = pd.Series(label_sums)
    proba = (sums / counts.loc[sums.index]).sort_values(ascending=False)
    return pd.DataFrame(
        [proba, counts.loc[proba.index]],
        index=['cond_proba', 'count'],
    ).T


verb_cond = _probability_table(verb_cond)
adverb_cond = _probability_table(adverb_cond)

In [5]:
"""Computing feature selection score for positive class"""
# Both scores are a proportion minus 1.96 * sqrt(p * (1 - p) / count),
# floored at 0. The margin is the same for the two classes, so it is
# computed once per table.
for _table in (verb_cond, adverb_cond):
    _proba = _table['cond_proba']
    _margin = 1.96*np.sqrt(_proba*(1 - _proba)/_table['count'])

    # Feature selection score for the positive class
    _table['lower_bound'] = (_proba - _margin).clip(lower=0)

    # Feature selection score for the negative class
    _table['lower_bound_no_pi'] = (1 - _proba - _margin).clip(lower=0)

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [6]:
"""Storing various top-p% feature verbs and adverbs"""
# Keep only words seen in at least 5 tweets and rank them once by the
# positive-class score ('lower_bound', highest first). Every top-p% list is
# then a prefix of that single ranking, instead of re-filtering and
# re-sorting the table for each percentage.
verbs_by_score = list(
    verb_cond[verb_cond['count'] >= 5]
    .sort_values('lower_bound', ascending=False)
    .index
)
adverbs_by_score = list(
    adverb_cond[adverb_cond['count'] >= 5]
    .sort_values('lower_bound', ascending=False)
    .index
)

# int() truncates, so a fraction that yields less than one word gives [].
top_verbs_5 = verbs_by_score[:int(0.05*len(verbs_by_score))]
top_verbs_10 = verbs_by_score[:int(0.1*len(verbs_by_score))]
top_verbs_20 = verbs_by_score[:int(0.2*len(verbs_by_score))]
top_verbs_30 = verbs_by_score[:int(0.3*len(verbs_by_score))]

top_adverbs_5 = adverbs_by_score[:int(0.05*len(adverbs_by_score))]
top_adverbs_10 = adverbs_by_score[:int(0.1*len(adverbs_by_score))]
top_adverbs_20 = adverbs_by_score[:int(0.2*len(adverbs_by_score))]
top_adverbs_30 = adverbs_by_score[:int(0.3*len(adverbs_by_score))]

In [7]:
"""Storing various bottom-p% feature verbs and adverbs"""
# Keep only words seen in at least 5 tweets and rank them once by the
# negative-class score ('lower_bound_no_pi', highest first). Every
# bottom-p% list is then a prefix of that single ranking, instead of
# re-filtering and re-sorting the table for each percentage.
verbs_by_score_no_pi = list(
    verb_cond[verb_cond['count'] >= 5]
    .sort_values('lower_bound_no_pi', ascending=False)
    .index
)
adverbs_by_score_no_pi = list(
    adverb_cond[adverb_cond['count'] >= 5]
    .sort_values('lower_bound_no_pi', ascending=False)
    .index
)

# int() truncates, so a fraction that yields less than one word gives [].
bot_verbs_5 = verbs_by_score_no_pi[:int(0.05*len(verbs_by_score_no_pi))]
bot_verbs_10 = verbs_by_score_no_pi[:int(0.1*len(verbs_by_score_no_pi))]
bot_verbs_20 = verbs_by_score_no_pi[:int(0.2*len(verbs_by_score_no_pi))]
bot_verbs_30 = verbs_by_score_no_pi[:int(0.3*len(verbs_by_score_no_pi))]

bot_adverbs_5 = adverbs_by_score_no_pi[:int(0.05*len(adverbs_by_score_no_pi))]
bot_adverbs_10 = adverbs_by_score_no_pi[:int(0.1*len(adverbs_by_score_no_pi))]
bot_adverbs_20 = adverbs_by_score_no_pi[:int(0.2*len(adverbs_by_score_no_pi))]
bot_adverbs_30 = adverbs_by_score_no_pi[:int(0.3*len(adverbs_by_score_no_pi))]

In [11]:
"""Creating directories for feature storing"""
# Create one output folder per percentage in the current working directory.
# exist_ok=True makes the cell safe to re-run: the shell `mkdir` it replaces
# reports an error when the folder is already there, and is not portable
# across platforms.
for percent in (5, 10, 20, 30):
    Path('features_{}'.format(percent)).mkdir(exist_ok=True)

In [8]:
"""Storing the features selected"""
# Selected word lists, grouped by the percentage used to cut the ranking.
features_by_percent = {
    5: {
        'top_verbs': top_verbs_5,
        'bot_verbs': bot_verbs_5,
        'top_adverbs': top_adverbs_5,
        'bot_adverbs': bot_adverbs_5,
    },
    10: {
        'top_verbs': top_verbs_10,
        'bot_verbs': bot_verbs_10,
        'top_adverbs': top_adverbs_10,
        'bot_adverbs': bot_adverbs_10,
    },
    20: {
        'top_verbs': top_verbs_20,
        'bot_verbs': bot_verbs_20,
        'top_adverbs': top_adverbs_20,
        'bot_adverbs': bot_adverbs_20,
    },
    30: {
        'top_verbs': top_verbs_30,
        'bot_verbs': bot_verbs_30,
        'top_adverbs': top_adverbs_30,
        'bot_adverbs': bot_adverbs_30,
    },
}

# Write each list to ./features_<p>/<group>_<p>.p. The `with` block closes
# (and therefore flushes) every file right after it is written; the bare
# open() calls it replaces left all sixteen handles open.
for percent, groups in features_by_percent.items():
    for group, words in groups.items():
        target = './features_{0}/{1}_{0}.p'.format(percent, group)
        with open(target, 'wb') as out_file:
            pickle.dump(words, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
"""Top 20 verbs for conditional probability"""
# Verbs seen in at least 5 tweets, ranked by raw conditional probability;
# the first 20 rows are displayed as the cell's output.
(
    verb_cond
    .loc[verb_cond['count'] >= 5]
    .sort_values(by='cond_proba', ascending=False)
    .head(20)
)

Unnamed: 0,cond_proba,count,lower_bound,lower_bound_no_pi
boycott,0.809524,21.0,0.641573,0.022526
terrorise,0.8,5.0,0.449385,0.0
vow,0.8,5.0,0.449385,0.0
stress,0.8,5.0,0.449385,0.0
witness,0.6,5.0,0.170586,0.0
behave,0.583333,12.0,0.304389,0.137722
yell,0.571429,7.0,0.204823,0.061965
round,0.555556,9.0,0.230912,0.119801
swear,0.555556,9.0,0.230912,0.119801
threaten,0.529412,17.0,0.292138,0.233315


In [11]:
"""Top 20 verbs for our score"""
# Verbs seen in at least 5 tweets, ranked by the 'lower_bound' score;
# the first 20 rows are displayed as the cell's output.
(
    verb_cond
    .loc[verb_cond['count'] >= 5]
    .sort_values(by='lower_bound', ascending=False)
    .head(20)
)

Unnamed: 0,cond_proba,count,lower_bound,lower_bound_no_pi
boycott,0.809524,21.0,0.641573,0.022526
vow,0.8,5.0,0.449385,0.0
stress,0.8,5.0,0.449385,0.0
terrorise,0.8,5.0,0.449385,0.0
fly,0.406061,1320.0,0.379567,0.567446
tweet,0.435484,62.0,0.312064,0.441096
ruin,0.428571,63.0,0.306369,0.449227
behave,0.583333,12.0,0.304389,0.137722
file,0.444444,45.0,0.299259,0.41037
insult,0.529412,17.0,0.292138,0.233315


In [12]:
"""Top 10 adverbs for conditional probability"""
# Adverbs seen in at least 5 tweets, ranked by raw conditional probability;
# the first 10 rows are displayed as the cell's output.
(
    adverb_cond
    .loc[adverb_cond['count'] >= 5]
    .sort_values(by='cond_proba', ascending=False)
    .head(10)
)

Unnamed: 0,cond_proba,count,lower_bound,lower_bound_no_pi
further,0.764706,17.0,0.563062,0.033651
again,0.645161,837.0,0.612747,0.322424
anytime,0.6,5.0,0.170586,0.0
racially,0.6,5.0,0.170586,0.0
after,0.571429,7.0,0.204823,0.061965
ever,0.463668,289.0,0.406173,0.478838
down,0.428571,7.0,0.061965,0.204823
consistently,0.4,5.0,0.0,0.170586
appallingly,0.4,5.0,0.0,0.170586
plus,0.333333,6.0,0.0,0.289464


In [13]:
"""Top 10 adverbs for our score"""
# Adverbs seen in at least 5 tweets, ranked by the 'lower_bound' score;
# the first 10 rows are displayed as the cell's output.
(
    adverb_cond
    .loc[adverb_cond['count'] >= 5]
    .sort_values(by='lower_bound', ascending=False)
    .head(10)
)

Unnamed: 0,cond_proba,count,lower_bound,lower_bound_no_pi
again,0.645161,837.0,0.612747,0.322424
further,0.764706,17.0,0.563062,0.033651
ever,0.463668,289.0,0.406173,0.478838
never,0.264232,2547.0,0.247108,0.718644
after,0.571429,7.0,0.204823,0.061965
anytime,0.6,5.0,0.170586,0.0
racially,0.6,5.0,0.170586,0.0
personally,0.32,25.0,0.137141,0.497141
highly,0.333333,21.0,0.13171,0.465044
else,0.215909,88.0,0.129942,0.698124
