## Preprocessing steps
This notebook presents the basic cross-validation for the model:
1. Basic cleaning
2. Hashtag decomposition
3. Handling competitors' names
4. Handling questions
5. Adding bigrams
6. Adding indicator on whether BA emitted the tweet or not
7. Neighborhood building

In [1]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
import re
import pickle
import nltk.data
from multiprocessing import Pool
import enchant
from spellchecker import SpellChecker
from copy import deepcopy



## Loading data and preprocessing

In [2]:
"""Loading and formatting the data"""

data = pd.read_csv('./labeled_ba.csv', sep='\t', index_col=0)

In [3]:
"""Collapsing repetitions"""

def collapse_repeat(s):
    if len(s)<=2:
        return s
    else:
        ind_trm = []
        k = 0
        while k <= len(s) - 2:
            if s[k]==s[k+1]:
                i = k + 2
                while i<len(s) and s[i]==s[k]:
                    ind_trm.append(i)
                    i += 1
                k = i
            else:
                k += 1
        if len(ind_trm)==0:
            return s
        else:
            return ''.join([s[i] for i in range(len(s)) if i not in ind_trm])
        
"""Handling special negative expressions"""
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

neg_pats_1 = {r'havent\s': ' have not ', r'wouldnt\s': ' would not ', r'couldnt\s': ' could not ', r'cant\s': ' can not ', r'wont\s': ' will not ',
              r'didnt\s': ' did not ', r'dont\s': ' do not ', r'shouldnt\s': ' should not '}

def handle_neg(text):
    first = text.lower().replace("'ll",' will').replace("'re",' are').replace("'ve", ' have').replace("'m",' am').replace("'d", ' would')
    second = first.replace("haven't",'havent').replace("wouldn't",'wouldnt').replace("couldn't",'couldnt').replace("can't","cant").replace("won't",'wont').replace("didn't",'didnt').replace("shouldn't",'shouldnt').replace("don't",'dont')
    for key, value in neg_pats_1.items():
        second = re.sub(key, value, second)
    second = second.replace('wont',' will not').replace('dont',' do not').replace('cant',' can not').replace('havent',' have not').replace('wouldnt',' would not')
    third = ' '.join([sent.capitalize() for sent in sent_tokenizer.tokenize(second)])
    return third

"""Building company designations"""
companies = [['british','airways'],['iceland','air'],['vueling'],['easy','jet'],['air','canada'],['american','airways'],['united','airlines'],['qatar','airways'],
             ['virgin','atlantic'],['qatar','airways'],['norwegian','air'],['virgin'],['qatar']]

def generate_versions(name):
    if len(list(name))>=2:
        return [''.join(list(name)), '_'.join(list(name)), '@'+'_'.join(list(name)), '@'+''.join(list(name)), '#'+'_'.join(list(name)), '#'+''.join(list(name)),
               ' '.join(list(name))]
    else:
        return ['@'+name[0],name[0],'#'+name[0]]

ba = generate_versions(companies[0])

comp_des = []
for c in companies[1:]:
    comp_des += generate_versions(c)
    
"""Function to replace British Airways indicators as well as its competitors' indicators (as far as we know from reading the tweets)"""
pat_ba = re.compile('\s(' + '|'.join(ba) + ')\s')
pat_comp = re.compile(r'\s(' + '|'.join(comp_des) + ')\s')

def replace_ba(string):
    ret_str = sent_tokenizer.tokenize(' '.join(re.sub(pat_ba, ' ba ', string.lower()+' ').replace('@ba','ba').replace('#ba','ba').split()))
    ret_str = [sent.capitalize() for sent in ret_str]
    return ' '.join(ret_str)
            

def replace_comp(string):
    ret_str = sent_tokenizer.tokenize(' '.join(re.sub(pat_comp, ' no_ba ', string.lower()+' ').replace('@no_ba','no_ba').replace('#no_ba','no_ba').split()))
    ret_str = [sent.capitalize() for sent in ret_str]
    return ' '.join(ret_str)

"""Preprocessing the data"""
data['text'] = data.text.apply(collapse_repeat)
data['text'] = data.text.apply(lambda x: x.replace('"',' '))
data['text'] = data.text.apply(replace_ba).apply(replace_comp)
data['text'] = data.text.apply(handle_neg)
data['text'] = data.text.apply(lambda x: x.replace(' i ', ' I '))

In [4]:
"""Provenance from BA variable"""
import re

ba_pat = re.compile("\^\w+")
def find_ba_reply(text):
    s = [m.start() for m in re.finditer(ba_pat, text)]
    return(len(s)>0)

data['from_ba'] = data.text.apply(find_ba_reply).apply(int)

In [5]:
"""POS-tagging"""
def tagg(x):
    return(pd.DataFrame(CMUTweetTagger.runtagger_parse([x])[0], columns=['word','tag','score']))

tag_text = []
N = data.shape[0]
p = Pool(30)

for k in tqdm(range(N//500+1)):
    tag_text += p.map(tagg, list(data.text.iloc[500*k:min(N,500*(k+1))]))
    
data['tag_df'] = pd.Series(tag_text)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




## Hashtag decomposition

In [6]:
"""1. Building corpus"""
corpus = set()

for k in tqdm(range(data.shape[0]), desc='Building corpus'):
    local = data.tag_df.iloc[k]
    for i in range(local.shape[0]):
        if any(str(k) in local.word.iloc[i] for k in range(10)):
            continue
        if '#' in local.word.iloc[i]:
            continue
        if local.tag.iloc[i] in ['#','U','&',',','O','$','D','!','^','@']:
            continue
        if "'" in local.word.iloc[i]:
            continue
        if '_' in local.word.iloc[i]:
            continue
        w = re.split('\W+', local.word.iloc[i].replace("n't"," not"))
        corpus.update(set(w))

corpus.remove('')
corpus = set([e for e in corpus])

list_corpus = list(corpus)
to_remove = []
keepers = ['an','of','air','ways','lines','ing','lon','un','don',]

for i in tqdm(range(len(list_corpus)), desc='Removing concatenations of basics'):
    w_i = list_corpus[i]
    if w_i in keepers or len(w_i)<3:
        continue
    for j in range(i+1, len(list_corpus)):
        w_j = list_corpus[j]
        if w_j in keepers:
            continue
        if w_i+w_j in corpus:
            to_remove.append(w_i+w_j)
        if w_j+w_i in corpus:
            to_remove.append(w_j+w_i)
            
corpus = set([e.replace("'",'') for e in corpus if len(e)>1])
corpus.remove('baagain')

"""Clean the corpus"""
eng_dict = enchant.Dict('en')
remove_from_corpus = set([w for w in corpus if not eng_dict.check(w)])

spell = SpellChecker(distance=2)
spell.word_frequency.load_words(list(corpus.difference(remove_from_corpus)))
misspelled = spell.unknown(list(remove_from_corpus))

corrected = {w: w for w in tqdm(corpus.difference(remove_from_corpus))}
corrected.update({w: spell.correction(w) for w in tqdm(remove_from_corpus)})

corrected['flyng'] = 'flying'
corrected['bycott'] = 'boycott'
corrected['flyng'] = 'flying'
corrected['bycott'] = 'boycott'

for k in tqdm(range(data.shape[0])):
    local = data.tag_df.at[k]
    local['word'] = local.word.apply(lambda x: corrected[x] if x in corrected.keys() else x)
    data['tag_df'].at[k] = deepcopy(local)
    
"""Harvesting the hashtags"""
hashtag_set = set()

for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    for i in range(local.shape[0]):
        if local.tag.iloc[i]=='#' or '#' in local.word.iloc[i]:
            hashtag_set.add(local.word.iloc[i])
            
print('A total of {} hastags'.format(len(hashtag_set)))

HBox(children=(IntProgress(value=0, description='Building corpus', max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Removing concatenations of basics', max=12235), HTML(value=''…




HBox(children=(IntProgress(value=0, max=10187), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2011), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))


A total of 2424 hastags


In [7]:
"""2. Function to find the splits with only known words"""
proper = ['british','airways','iceland','air','qatar','vueling','norwegian','air','american','canada','united','airlines','ryanair','easy','jet','virgin','atlantic']
new_corpus = set(list(corrected.keys())).union(proper)
for w in proper:
    corrected[w] = w
corrected['flyijng'] = 'flying'
corrected['safair'] = 'safair'


def valid_split(ht):
    
    if ht in new_corpus:
        return [[ht]]
    
    components = [e for e in new_corpus if e in ht]
    queue = [[ht]]
    candidates = []
    
    while len(queue)>0:
        # Dequeuing
        next_c = []
        base = queue[-1][:-1]
        suite = queue[-1][-1]
        del queue[-1]
        
        # Looking for news candidates
        for c in components:
            if c==suite[:len(c)] and len(c)<len(suite):
                next_c.append(base + [suite[:len(c)], suite[len(c):]])
                candidates.append(next_c[-1])
            elif c==suite[:len(c)] and len(c)==len(suite):
                # If no more characters in string add words
                candidates.append(base + [suite[:len(c)]])
            else:
                continue
                
        # Growing the queue
        check = [e for e in next_c if not any(f not in new_corpus for f in e[:-1])]
        queue += check
        
    return [[corrected[f] for f in e] for e in candidates if e[-1] in new_corpus]

hashtag_list = list(hashtag_set)
candidate_lists = [valid_split(e.replace('#','')) for e in tqdm(hashtag_list, desc='Extracting valid splits')]

"""Dropping duplicates in candidates"""
candidate_lists = [[list(a) for a in list(set([tuple(e) for e in f]))] for f in candidate_lists]

"""Checking how many hashtags have more than one valid split"""
counts = {}
for k in tqdm(range(len(candidate_lists)), desc='Histogram of number of splits'):
    try:
        counts[len(candidate_lists[k])] += 1
    except:
        counts[len(candidate_lists[k])] = 1
        
print('Counts histogram :\n{}'.format(pd.Series(counts)))

HBox(children=(IntProgress(value=0, description='Extracting valid splits', max=2424), HTML(value='')))




HBox(children=(IntProgress(value=0, description='Histogram of number of splits', max=2424), HTML(value='')))


Counts histogram :
1      940
0      821
2      327
8       12
10      23
4       83
5       18
6       43
9        5
15       1
30       6
20      13
3      102
40       4
12       6
7        6
14       3
24       1
16       1
130      1
18       3
50       2
60       2
11       1
dtype: int64


In [8]:
"""3. Computing the cooccurrence matrix"""
coocc = {}

for k in tqdm(range(data.shape[0]), desc='Building the co-occurrence matrix'):
    local = data.tag_df.iloc[k]
    local = local[local.tag.isin(['#','U','&',',','O','$','D','!','^','@']).apply(lambda x: not x)]
    local = local[local.word.apply(lambda x: '#' not in x)]
    if local.shape[0]>1:
        local = list(local.word)
        for i in range(1, len(local)):
            w1 = local[i-1]
            w2 = local[i]
            try:
                coocc[w1][w2] += 1
            except:
                try:
                    coocc[w1][w2] = 1
                except:
                    coocc[w1] = {w2: 1}
                    
coocc = pd.DataFrame(coocc).fillna(0.)
relevant = list(corpus.intersection(list(coocc.index)))
coocc = coocc.loc[relevant, relevant].fillna(0.)

HBox(children=(IntProgress(value=0, description='Building the co-occurrence matrix', max=11684), HTML(value=''…




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


In [9]:
"""4. Computing External Value for the various configurations as the product of 
left-side value and right-side value"""

dict_hdf_ev = {}
for k in tqdm(range(len(hashtag_list)), desc='Computing EV for candidates'):
    dict_hdf_ev[hashtag_list[k]] = {}
    for e in candidate_lists[k]:
        ev = []
        for w in e:
            try:
                ev.append(coocc[w].sum() * coocc.loc[w].sum())
            except:
                ev.append(0.)
        dict_hdf_ev[hashtag_list[k]][tuple(e)] = ev

HBox(children=(IntProgress(value=0, description='Computing EV for candidates', max=2424), HTML(value='')))




In [10]:
"""5. Computing Internal value for the various splits"""

"""First, computing the Mutual Information scores of the splits"""

mi_matrix = {}
counts = {}

for k in tqdm(range(data.shape[0]), desc='Computing the Mutual Information between characters'):
    local = data.tag_df.iloc[k]
    local = local[local.tag.isin(['#','U',',','$','!']).apply(lambda x: not x)]
    local = local[local.word.apply(lambda x: '#' not in x)]
    if local.shape[0]>1:
        local = [w for w in list(local.word) if w!=''] 
        for i in range(1, len(local)):
            c1 = local[i-1][-1]
            c2 = local[i][0]
            # Updating counts
            try:
                counts[c1] += 1
            except:
                counts[c1] = 1
            # Updating follow ups
            try:
                mi_matrix[c1][c2] += 1
            except:
                try:
                    mi_matrix[c1][c2] = 1
                except:
                    mi_matrix[c1] = {c2: 1}
                            
mi_matrix = pd.DataFrame(mi_matrix)
mi_matrix = mi_matrix.loc[list(mi_matrix.index), list(mi_matrix.index)]
mi_df = (mi_matrix + mi_matrix.T)
mi_df.fillna(0., inplace=True)        

"""Computing the probabilities"""
base_proba = pd.Series(counts)
pair_proba = mi_df/np.sum(base_proba)
base_proba /= np.sum(base_proba)

"""Computing Mutual Information"""
mi_matrix = pair_proba * pd.DataFrame(1/np.array(base_proba).reshape((-1,1)).dot(np.array(base_proba).reshape((1,-1))), index=list(base_proba.index), columns=list(base_proba.index))
mi_matrix.fillna(0., inplace=True)


"""Second for every candidate split, we compute the Boundary scores"""
dict_hdf_iv = {}
for k in tqdm(range(len(hashtag_list)), desc='Computing IV for candidates'):
    dict_hdf_iv[hashtag_list[k]] = {}
    for e in candidate_lists[k]:
        if len(e)<=1:
            dict_hdf_iv[hashtag_list[k]][tuple(e)] = []
        else:
            iv = []
            for i in range(1, len(e)):
                iv.append(mi_matrix.loc[e[i-1][-1], e[i][0]])
            dict_hdf_iv[hashtag_list[k]][tuple(e)] = iv
            

"""Computing the Word Rank score as the product of 
- the geometric mean of EV (intuition : high EV reflects high word importance in the 
cooccurrence graph)
- a decreasing function of maximum IV (intuition : IV is large when split is very likely, 
large min IV indicates high probability of split validity)"""

dict_EV = {}
for ht in dict_hdf_ev.keys():
    dict_EV[ht] = {}
    if len(dict_hdf_ev[ht])>=1:
        for c in dict_hdf_ev[ht]:
            dict_EV[ht][c] = np.prod(dict_hdf_ev[ht][c])**(1./len(dict_hdf_ev[ht][c]))        

dict_IV = {}
for ht in dict_hdf_iv.keys():
    dict_IV[ht] = {}
    if len(dict_hdf_iv[ht])>=1:
        for c in dict_hdf_iv[ht]:
            if len(c)==1:
                dict_IV[ht][c] = 100.
            else:
                dict_IV[ht][c] = np.min(dict_hdf_iv[ht][c])

HBox(children=(IntProgress(value=0, description='Computing the Mutual Information between characters', max=116…




Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


HBox(children=(IntProgress(value=0, description='Computing IV for candidates', max=2424), HTML(value='')))




In [11]:
"""6. Given the results, and the large presence of small letters words due to the errors
in tweets, we'll two criteria to select the best split :
1. Largest IV
2. In case of ties, choose the split with smallest number of tokens"""
split_hashtags = {}

for k in tqdm(range(len(hashtag_list))):
    ht = hashtag_list[k]
    if len(list(dict_IV[ht].keys()))==0:
        split_hashtags[ht] = ht.replace('#','')
    elif len(list(dict_IV[ht].keys()))==1:
        split_hashtags[ht] = ', '.join(list(dict_IV[ht].keys())[0])
    else:
        loc_dict = dict_IV[ht]
        loc_iv = {', '.join(c): loc_dict[c] for c in loc_dict.keys()}
        loc_iv_df = pd.DataFrame([pd.Series(loc_iv),
                                  pd.Series([len(c.split(', ')) for c in pd.Series(loc_iv).index], index=pd.Series(loc_iv).index)],
                                 index=['iv','len'])
        loc_iv_df.sort_values(['iv','len'], ascending=[False,True], axis=1, inplace=True)
        split_hashtags[ht] = loc_iv_df.columns[0]
        

"""Normalizing the hashtags and correcting the relevant ones"""
#"""Correcting relevant hashtags"""
split_hashtags['#ineverflywithbaagain'] = 'i, never, fly, with, ba, again'
split_hashtags['#boycottryanair'] = 'boycott, no_ba'
split_hashtags['#neverflybritish'] = 'never, fly, ba'
split_hashtags['#boycottbritish_airways'] = 'boycott, ba'

for key, value in split_hashtags.items():
    """Normalizing airlines"""
    split_hashtags[key] = value.replace('british, airways','ba').replace('ryanair','no_ba').replace('americanairlines','no_ba').replace('vueling','no_ba').replace('iberia,','no_ba,').replace(' no,',' not,').replace('norwegian','no_ba').replace('virgin, atlantic', 'no_ba').replace('virgin,','no_ba,')
    """Normalizing verbs with negatives"""
    split_hashtags[key] = split_hashtags[key].replace(' wont,',' will, not,').replace(' cant,',' can, not,').replace(' dont,',' do, not,').replace(' wouldnt,',' would, not,').replace(' shouldnt,',' should, not,')

HBox(children=(IntProgress(value=0, max=2424), HTML(value='')))




In [12]:
"""Tagging all the split hashtags"""
hashtag_list = []
tokenized_hashtags = []

for ht in split_hashtags.keys():
    hashtag_list.append(ht)
    tokenized_hashtags.append(split_hashtags[ht].replace(',',''))
    
H = len(tokenized_hashtags)
tagged_hashtags = []
p = Pool(40)

for k in tqdm(range(H//500+1)):
    tagged_hashtags += p.map(tagg, tokenized_hashtags[500*k:min(H,500*(k+1))])

tagged_hashtags = {hashtag_list[k]: tagged_hashtags[k] for k in range(len(hashtag_list))}

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [13]:
"""Additional cleaning to remove some special characters"""
for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    local['word'] = local['word'].apply(lambda x: x.replace('-','').replace('/','').replace('\\','').replace("'",'').replace(' ','').lower())
    data['tag_df'].at[k] = deepcopy(local)

HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




In [14]:
"""Collecting verbs, adverbs, and adjectives"""
from nltk import WordNetLemmatizer

lmtzr = WordNetLemmatizer()
unique_verbs = set()
unique_adverbs = set()
unique_adjectives = set()

for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    local = local[local.word.apply(lambda x: x.count('-')<=1 and x.count('/')<=1 and not(any(str(k) in x for k in range(10))) and not('#' in x))]
    unique_verbs.update(list(local[local.tag=='V'].word.apply(lambda x: lmtzr.lemmatize(x.replace('-','').replace('/','').replace('\\',''),'v'))))
    unique_adverbs.update(list(local[local.tag=='R'].word.apply(lambda x: x.replace('-','').replace('/','').replace('\\',''))))
    unique_adjectives.update(list(local[local.tag=='A'].word.apply(lambda x: x.replace('-','').replace('/','').replace('\\',''))))
    
"""Lemmatizing all verbs"""
lem_verbs = {verb: lmtzr.lemmatize(verb, 'v') for verb in unique_verbs}
lem_adj = {adj: lmtzr.lemmatize(adj, 'a') for adj in unique_adjectives}
lem_adv = {adv: adv for adv in unique_adverbs}
lem_words = dict()
lem_words.update(lem_adv)
lem_words.update(lem_adj)
lem_words.update(lem_verbs)

HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




In [15]:
"""Building the neighborhoods"""
unique_corpus = unique_verbs.union(unique_adverbs).union(unique_adjectives)

def build_neighborhood(local):
    #Instantiating lemmatizer
    lmtzr = WordNetLemmatizer()
    
    try:
        #Removing non interesting tags, tweet specific tags (emojis, existentials, numbers, URLs, the &, punctuation, unknown, @ mentions, determinants)
        local = local[local.word.apply(lambda x: x.count('-')<=1 and x.count('/')<=1 and not(any(str(k) in x for k in range(10))) and not('#' in x))]
        local = local[local.tag.isin(['U','&',',','$','!','^','#']).apply(lambda x: not x)]
        if local.shape[0]==0:
            return([],[],[])
        else: 
            #Lemmatizing the words to remove the verb and adverb tokens to be considered
            local['word'] = local.word.apply(lambda x: x.replace('-',' ').replace('/',' ').replace('\\','').replace("'",''))
            #Extracting the verb and adverb patterns
            local_words_5 = []
            local_words_7 = []
            local_words_9 = []
            for i in range(local.shape[0]):
                w = local.word.iloc[i]
                if w in unique_corpus or local.tag.iloc[i] in ['V','R']:#,'N']:
                    neighborhood_5 = local.iloc[max(0,i-2):min(local.shape[0],i+3)]
                    neighborhood_7 = local.iloc[max(0,i-3):min(local.shape[0],i+4)]
                    neighborhood_9 = local.iloc[max(0,i-4):min(local.shape[0],i+5)]
                    local_words_5.append(list(neighborhood_5.word))
                    local_words_7.append(list(neighborhood_7.word))
                    local_words_9.append(list(neighborhood_9.word))
                #If the word is a verb, add it to the bank of verbs
                if (local.tag.iloc[i]=='V') and ('#' not in w) and ('&' not in w) and not(any(str(k) in w for k in range(10))) and ('-' not in w) and ('/' not in w):
                    if type(w)==list:
                        unique_verbs.update(set(w))
                    else:
                        unique_verbs.add(w)
            return(local_words_5, local_words_7, local_words_9)
    except:
        return ([],[],[])

neighborhoods = []
p = Pool(40)
for k in tqdm(range(N//500+1)):
    neighborhoods += p.map(build_neighborhood, list(data.tag_df.iloc[500*k:min(500*(k+1),N)]))

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




In [16]:
actual_words_5 = [[[lem_words[w.lower()] if w.lower() in lem_words.keys() else w.lower() for w in v] for v in neighborhoods[k][0]] for k in tqdm(range(len(neighborhoods)))] 
actual_words_7 = [[[lem_words[w.lower()] if w.lower() in lem_words.keys() else w.lower() for w in v] for v in neighborhoods[k][1]] for k in tqdm(range(len(neighborhoods)))]
actual_words_9 = [[[lem_words[w.lower()] if w.lower() in lem_words.keys() else w.lower() for w in v] for v in neighborhoods[k][2]] for k in tqdm(range(len(neighborhoods)))]

HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




In [17]:
"""Building hashtag neighbordhoods, and correcting some of them"""
tagged_hashtags['#neveragainonba'] = pd.DataFrame([['never','again','on','ba'],['R','R','P','N'],[1.,1.,1.,1.]], index=['word','tag','score']).T
tagged_hashtags['#boycottryanair'] = pd.DataFrame([['boycott','no_ba'],['R','N'],[1.,1.]], index=['word','tag','score']).T
tagged_hashtags['#neverflybritish'] = pd.DataFrame([['never','fly','ba'],['R','V','N'],[1.,1.,1.]], index=['word','tag','score']).T
tagged_hashtags['#ineverflywithbaagain'] = pd.DataFrame([['i','never','fly','with','ba','again'],['P','R','V','P','N','R'],[1.,1.,1.,1.,1.,1.]], index=['word','tag','score']).T

hashtag_neighborhoods = []
for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    local = local[(local.tag=='#') | (local.word.apply(lambda x: '#' in x))]
    if local.shape[0]==0:
        hashtag_neighborhoods.append([])
    else:
        local_list = []
        for w in list(local.word):
            try:
                words = list(tagged_hashtags[w].word.apply(lambda x: x.replace("'",'')))
                words = [lmtzr.lemmatize(words[i], 'v') if tagged_hashtags[w].tag.iloc[i]=='V' else words[i] for i in range(len(words)) ]
                local_list.append((', '.join(words).replace('dont,', 'do, not,').replace('wont,','will, not,')).split(','))
            except:
                pass
        hashtag_neighborhoods.append(local_list)
        
hashtag_sents = []
for k in tqdm(range(len(hashtag_neighborhoods))):
    local = []
    for e in hashtag_neighborhoods[k]:
        local.append([w.replace(' ','') for w in e])
    hashtag_sents.append(local)

HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




In [18]:
"""Building full representation"""
full_representation = []
for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    if local.shape[0]>0:
        local['word'] = local.word.apply(lambda x: x.replace('-',' ').replace('/',' ').replace('\\','').replace("'",''))
        local['word'] = local.T.apply(lambda x: (lmtzr.lemmatize(x['word'].replace("'",''), x['tag'].lower()) if x['tag'] in ['V','R','A','N'] else x['word'].lower()) if type(x['word'])==str else '')
        full_representation.append(list(local.word) + ','.join([','.join(v) for v in hashtag_neighborhoods[k] if type(v)==str]).split(','))
    else:
        full_representation.append([])

"""Building vector representations based on full representation"""
vec_reps = []
for k in tqdm(range(len(full_representation))):
    loc = full_representation[k]
    v = []
    for w in loc:
        t = w.replace('-',' ').replace('/',' ').replace('\\','')
        try:
            t = lem_words[t]
            v.append(t)
        except:
            pass
    vec_reps.append(deepcopy(v))
    
hashtag_sents = [[[lem_words[w.lower()] if w.lower() in lem_words.keys() else w.lower() for w in v] for v in hashtag_sents[k]] for k in tqdm(range(len(hashtag_sents)))]

HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




HBox(children=(IntProgress(value=0, max=11684), HTML(value='')))




In [19]:
"""Adding the hashtag neighborhoods to the first neighborhoods"""
actual_words_5 = [actual_words_5[k] + hashtag_sents[k] for k in range(len(actual_words_5))]
actual_words_7 = [actual_words_7[k] + hashtag_sents[k] for k in range(len(actual_words_7))]
actual_words_9 = [actual_words_9[k] + hashtag_sents[k] for k in range(len(actual_words_9))]

In [24]:
"""Saving the neighborhood representation as well as the full representation for further use"""
pickle.dump((split_hashtags, tagged_hashtags), open('./hashtags_BA.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data, open('./processed_BA.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump((actual_words_5, actual_words_7, actual_words_9), open('./neighborhoods_ba.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(full_representation, open('./full_ba.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump(data['from_ba'], open('./from_ba.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)
pickle.dump((lem_verbs, lem_adj, lem_adv, lem_words, unique_verbs, unique_adverbs, unique_adjectives), open('./vocab_BA.b','wb'), protocol=pickle.HIGHEST_PROTOCOL)