In [2]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import pickle
import copy
import re

In [3]:
"""Loading the tagged data"""
data = pickle.load(open('./label_tag_data.p', 'rb'))
! mkdir hashtags

In [4]:
"""Harvesting the hashtags"""
hashtag_set = set()

for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    for i in range(local.shape[0]):
        if local.tag.iloc[i]=='#' or '#' in local.word.iloc[i]:
            hashtag_set.add(local.word.iloc[i])
            
print('A total of {} hastags'.format(len(hashtag_set)))

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))


A total of 2546 hastags


In [5]:
"""1. Building corpus"""
corpus = set()

for k in tqdm(range(data.shape[0]), desc='Building corpus'):
    local = data.tag_df.iloc[k]
    for i in range(local.shape[0]):
        if any(str(k) in local.word.iloc[i] for k in range(10)):
            continue
        if '#' in local.word.iloc[i]:
            continue
        if local.tag.iloc[i] in ['#','U','&',',','O','$','D','!','^','@']:
            continue
        if "'" in local.word.iloc[i]:
            continue
        if '_' in local.word.iloc[i]:
            continue
        w = re.split('\W+', local.word.iloc[i].replace("n't"," not"))
        corpus.update(set(w))
        
corpus.remove('')
corpus = set([e for e in corpus])

list_corpus = list(corpus)
to_remove = []
keepers = ['an','of','air','ways','lines','ing','lon','un','don'] # Some problematic tokens in the segmentation phase

for i in tqdm(range(len(list_corpus)), desc='Removing concatenations of basics'):
    w_i = list_corpus[i]
    if w_i in keepers or len(w_i)<3:
        continue
    for j in range(i+1, len(list_corpus)):
        w_j = list_corpus[j]
        if w_j in keepers:
            continue
        if w_i+w_j in corpus:
            to_remove.append(w_i+w_j)
        if w_j+w_i in corpus:
            to_remove.append(w_j+w_i)
            
corpus = set([e.replace("'",'') for e in corpus if len(e)>1])
corpus.remove('baagain') # Removing a problematic token

HBox(children=(IntProgress(value=0, description='Building corpus', max=11540), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Removing concatenations of basics', max=11040), HTML(value=''…




In [6]:
"""2. Function to find the splits with only known words"""
def valid_split(ht):
    
    if ht in corpus:
        return [[ht]]
    
    components = [e for e in corpus if e in ht]
    queue = [[ht]]
    candidates = []
    
    while len(queue)>0:
        # Dequeuing
        next_c = []
        base = queue[-1][:-1]
        suite = queue[-1][-1]
        del queue[-1]
        
        # Looking for news candidates
        for c in components:
            if c==suite[:len(c)] and len(c)<len(suite):
                next_c.append(base + [suite[:len(c)], suite[len(c):]])
                candidates.append(next_c[-1])
            elif c==suite[:len(c)] and len(c)==len(suite):
                # If no more characters in string add words
                candidates.append(base + [suite[:len(c)]])
            else:
                continue
                
        # Growing the queue
        check = [e for e in next_c if not any(f not in corpus for f in e[:-1])]
        queue += check
        
    return [e for e in candidates if e[-1] in corpus]

hashtag_list = list(hashtag_set)
candidate_lists = [valid_split(e.replace('#','')) for e in tqdm(hashtag_list, desc='Extracting valid splits')]

"""Dropping duplicates in candidates"""
candidate_lists = [[list(a) for a in list(set([tuple(e) for e in f]))] for f in candidate_lists]

HBox(children=(IntProgress(value=0, description='Extracting valid splits', max=2546), HTML(value='')))




In [7]:
"""3. Computing the cooccurrence matrix"""

coocc = {}

for k in tqdm(range(data.shape[0]), desc='Building the co-occurrence matrix'):
    local = data.tag_df.iloc[k]
    local = local[local.tag.isin(['#','U','&',',','O','$','D','!','^','@']).apply(lambda x: not x)]
    local = local[local.word.apply(lambda x: '#' not in x)]
    if local.shape[0]>1:
        local = list(local.word)
        for i in range(1, len(local)):
            w1 = local[i-1]
            w2 = local[i]
            try:
                coocc[w1][w2] += 1
            except:
                try:
                    coocc[w1][w2] = 1
                except:
                    coocc[w1] = {w2: 1}
                    
coocc = pd.DataFrame(coocc).fillna(0.)
relevant = list(corpus.intersection(list(coocc.index)))
coocc = coocc.loc[relevant, relevant].fillna(0.)

HBox(children=(IntProgress(value=0, description='Building the co-occurrence matrix', max=11540), HTML(value=''…




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [8]:
"""4. Computing External Value for the various configurations as the product of 
left-side value and right-side value"""

dict_hdf_ev = {}
for k in tqdm(range(len(hashtag_list)), desc='Computing EV for candidates'):
    dict_hdf_ev[hashtag_list[k]] = {}
    for e in candidate_lists[k]:
        ev = []
        for w in e:
            try:
                ev.append(coocc[w].sum() * coocc.loc[w].sum())
            except:
                ev.append(0.)
        dict_hdf_ev[hashtag_list[k]][tuple(e)] = ev

HBox(children=(IntProgress(value=0, description='Computing EV for candidates', max=2546), HTML(value='')))




In [9]:
"""5. Computing Internal value for the various splits"""

"""First, computing the Mutual Information scores of the splits"""

mi_matrix = {}
counts = {}

for k in tqdm(range(data.shape[0]), desc='Computing the Mutual Information between characters'):
    local = data.tag_df.iloc[k]
    local = local[local.tag.isin(['#','U','&',',','O','$','D','!','^','@']).apply(lambda x: not x)]
    local = local[local.word.apply(lambda x: '#' not in x)]
    if local.shape[0]>1:
        local = list(local.word)
        for i in range(1, len(local)):
            c1 = local[i-1][-1]
            c2 = local[i][0]
            # Updating counts
            try:
                counts[c1] += 1
            except:
                counts[c1] = 1
            # Updating follow ups
            try:
                mi_matrix[c1][c2] += 1
            except:
                try:
                    mi_matrix[c1][c2] = 1
                except:
                    mi_matrix[c1] = {c2: 1}
                            
mi_matrix = pd.DataFrame(mi_matrix)
mi_matrix = mi_matrix.loc[list(mi_matrix.index), list(mi_matrix.index)]
mi_df = (mi_matrix + mi_matrix.T)
mi_df.fillna(0., inplace=True)        

"""Computing the probabilities"""

base_proba = pd.Series(counts)
pair_proba = mi_df/np.sum(base_proba)
base_proba /= np.sum(base_proba)

"""Computing Mutual Information"""
mi_matrix = pair_proba * pd.DataFrame(1/np.array(base_proba).reshape((-1,1)).dot(np.array(base_proba).reshape((1,-1))), index=list(base_proba.index), columns=list(base_proba.index))
mi_matrix.fillna(0., inplace=True)

HBox(children=(IntProgress(value=0, description='Computing the Mutual Information between characters', max=115…




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [10]:
"""Second for every candidate split, we compute the Boundary scores"""
dict_hdf_iv = {}
for k in tqdm(range(len(hashtag_list)), desc='Computing IV for candidates'):
    dict_hdf_iv[hashtag_list[k]] = {}
    for e in candidate_lists[k]:
        if len(e)<=1:
            dict_hdf_iv[hashtag_list[k]][tuple(e)] = []
        else:
            iv = []
            for i in range(1, len(e)):
                iv.append(mi_matrix.loc[e[i-1][-1], e[i][0]])
            dict_hdf_iv[hashtag_list[k]][tuple(e)] = iv

HBox(children=(IntProgress(value=0, description='Computing IV for candidates', max=2546), HTML(value='')))




In [11]:
"""Computing the Word Rank score as the product of 
- the geometric mean of EV (intuition : high EV reflects high word importance in the 
cooccurrence graph)
- a decreasing function of maximum IV (intuition : IV is large when split is very likely, 
large min IV indicates high probability of split validity)"""

dict_EV = {}
for ht in dict_hdf_ev.keys():
    dict_EV[ht] = {}
    if len(dict_hdf_ev[ht])>=1:
        for c in dict_hdf_ev[ht]:
            dict_EV[ht][c] = np.prod(dict_hdf_ev[ht][c])**(1./len(dict_hdf_ev[ht][c]))        

dict_IV = {}
for ht in dict_hdf_iv.keys():
    dict_IV[ht] = {}
    if len(dict_hdf_iv[ht])>=1:
        for c in dict_hdf_iv[ht]:
            if len(c)==1:
                dict_IV[ht][c] = 100.
            else:
                dict_IV[ht][c] = np.min(dict_hdf_iv[ht][c])

In [12]:
"""6. Given the results, and the large presence of small letters words due to the errors
in tweets, we'll two criteria to select the best split :
1. Largest IV
2. In case of ties, choose the split with smallest number of tokens"""

split_hashtags = {}

for k in tqdm(range(len(hashtag_list))):
    ht = hashtag_list[k]
    if len(list(dict_IV[ht].keys()))==0:
        split_hashtags[ht] = ht.replace('#','')
    elif len(list(dict_IV[ht].keys()))==1:
        split_hashtags[ht] = ', '.join(list(dict_IV[ht].keys())[0])
    else:
        loc_dict = dict_IV[ht]
        loc_iv = {', '.join(c): loc_dict[c] for c in loc_dict.keys()}
        loc_iv_df = pd.DataFrame([pd.Series(loc_iv),
                                  pd.Series([len(c.split(', ')) for c in pd.Series(loc_iv).index], index=pd.Series(loc_iv).index)],
                                 index=['iv','len'])
        loc_iv_df.sort_values(['iv','len'], ascending=[False,True], axis=1, inplace=True)
        split_hashtags[ht] = loc_iv_df.columns[0]

HBox(children=(IntProgress(value=0, max=2546), HTML(value='')))




In [13]:
"""Tagging all the split hashtags"""
hashtag_list = []
tokenized_hashtags = []

for ht in split_hashtags.keys():
    hashtag_list.append(ht)
    tokenized_hashtags.append(split_hashtags[ht].replace(',',''))

def tagg(x):
    return(CMUTweetTagger.runtagger_parse([x])[0])

tagged_hashtags = [tagg(tht) for tht in tqdm(tokenized_hashtags)]

tagged_hashtags = {hashtag_list[k]: tagged_hashtags[k] for k in range(len(hashtag_list))}

HBox(children=(IntProgress(value=0, max=2546), HTML(value='')))




In [15]:
pickle.dump(split_hashtags, open('./hashtags/split_hashtags.p','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(tagged_hashtags, open('./hashtags/tagged_hashtags_last.p','wb'), protocol=pickle.HIGHEST_PROTOCOL)