## Preprocessing steps
This notebook presents the basic preprocessing pipeline for the model:
1. Basic cleaning
2. Hashtag decomposition
3. Handling competitors' names
4. Handling questions
5. Adding bigrams
6. Adding indicator on whether BA emitted the tweet or not
7. Neighborhood building

In [1]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
import re
import pickle
import nltk.data
from multiprocessing import Pool
import enchant
from spellchecker import SpellChecker
from copy import deepcopy

## Loading data and preprocessing

In [12]:
"""Loading the MacDonald's scandal data"""
# Tab-separated input; the first column of the file becomes the DataFrame index.
data = pd.read_csv(
    './labeled_mcd.csv',
    index_col=0,
    sep='\t',
)

In [14]:
"""Collapsing repetitions"""

def collapse_repeat(s):
    """Shorten every run of three or more identical characters to two.

    Runs of one or two characters are left untouched, e.g.
    ``'sooooo goood'`` becomes ``'soo good'``.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        ``s`` with each over-long run cut down to two characters.
    """
    # `(.)` captures one character (DOTALL so that a newline counts as well) and
    # `\1{2,}` requires at least two more copies of it. A single linear pass
    # replaces the former per-character lookup in a list of indices, which was
    # quadratic on long runs.
    return re.sub(r'(.)\1{2,}', r'\1\1', s, flags=re.DOTALL)
        
"""Handling special negative expressions"""
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Regex -> expansion for the negative contractions, written with or without the
# apostrophe. The \b anchors stop the patterns from firing inside longer words
# (e.g. "significant", "orthodontist") and let them match at the very end of a
# text, where no whitespace follows.
neg_pats_1 = {r"\bhaven'?t\b": 'have not', r"\bwouldn'?t\b": 'would not', r"\bcouldn'?t\b": 'could not', r"\bcan'?t\b": 'can not', r"\bwon'?t\b": 'will not',
              r"\bdidn'?t\b": 'did not', r"\bdon'?t\b": 'do not', r"\bshouldn'?t\b": 'should not'}

def handle_neg(text):
    """Expand contractions in ``text`` and re-capitalise its sentences.

    Steps, in order:
    1. lower-case the text and expand the suffixes 'll, 're, 've, 'm and 'd
       ('d is always expanded to "would");
    2. expand the negative contractions listed in ``neg_pats_1`` as whole words;
    3. split the result into sentences with ``sent_tokenizer``, capitalise each
       sentence and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The expanded, sentence-capitalised text.
    """
    first = text.lower().replace("'ll",' will').replace("'re",' are').replace("'ve", ' have').replace("'m",' am').replace("'d", ' would')
    second = first
    # Every contraction is handled by the same whole-word rule, so none of them
    # depends on being followed by whitespace.
    for key, value in neg_pats_1.items():
        second = re.sub(key, value, second)
    third = ' '.join([sent.capitalize() for sent in sent_tokenizer.tokenize(second)])
    return third

"""Preprocessing the data"""
def _preprocess_text(raw):
    """Apply the four text clean-up steps to a single text, in order."""
    shortened = collapse_repeat(raw)
    # Double quotes are turned into spaces.
    unquoted = shortened.replace('"',' ')
    expanded = handle_neg(unquoted)
    # handle_neg lower-cases the text, so the stand-alone pronoun is restored here.
    return expanded.replace(' i ', ' I ')

data['text'] = data.text.apply(_preprocess_text)

In [15]:
"""POS-tagging"""
def tagg(x):
    """Tag a single text and return the result as a DataFrame.

    ``x`` is wrapped in a one-element list for ``CMUTweetTagger.runtagger_parse``
    and the first entry of the result is loaded into a frame with the columns
    ``word``, ``tag`` and ``score``.
    """
    parsed = CMUTweetTagger.runtagger_parse([x])
    tokens = parsed[0]
    return pd.DataFrame(tokens, columns=['word','tag','score'])

tag_text = []
N = data.shape[0]

# The pool is used as a context manager so that its 40 worker processes are
# torn down when tagging finishes or is interrupted, instead of lingering.
with Pool(40) as p:
    # Texts are sent in slices of 500 so that the progress bar advances per slice.
    # Stepping by 500 avoids mapping an empty slice when N is a multiple of 500.
    for start in tqdm(range(0, N, 500)):
        tag_text += p.map(tagg, list(data.text.iloc[start:start+500]))

# One tag frame per row, aligned on the index of `data`.
data['tag_df'] = pd.Series(tag_text, index=data.index)

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))




Process ForkPoolWorker-27:
Process ForkPoolWorker-28:
Process ForkPoolWorker-20:
Process ForkPoolWorker-6:
Process ForkPoolWorker-30:
Process ForkPoolWorker-40:
Process ForkPoolWorker-9:
Process ForkPoolWorker-19:
Process ForkPoolWorker-11:
Process ForkPoolWorker-34:
Process ForkPoolWorker-21:
Process ForkPoolWorker-12:
Process ForkPoolWorker-13:
Process ForkPoolWorker-15:
Process ForkPoolWorker-31:
Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Process ForkPoolWorker-33:
Process ForkPoolWorker-7:
Process ForkPoolWorker-32:
Process ForkPoolWorker-24:
Process ForkPoolWorker-22:
Process ForkPoolWorker-10:
Process ForkPoolWorker-25:
Process ForkPoolWorker-39:
Process ForkPoolWorker-14:
Process ForkPoolWorker-38:
Process ForkPoolWorker-16:
Process ForkPoolWorker-18:
Process ForkPoolWorker-23:
Process ForkPoolWorker-1:
Process ForkPoolWorker-17:
Process ForkPoolWorker-4:
Process ForkPoolWorker-37:
Process ForkPoolWorker-8:
Process ForkPoolWorker-36:
Process ForkPoolWorker-5:
Traceback 

  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/a

  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 3

  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/qrg-researchlab/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    

In [28]:
"""Additional cleaning to remove some special characters"""
def _strip_special_chars(word):
    """Drop '-', '/', backslash, apostrophe and space characters, then lower-case."""
    return word.replace('-','').replace('/','').replace('\\','').replace("'",'').replace(' ','').lower()

# The loop counter is a position, so the cleaned frames are collected in order
# and re-attached with the index of `data`. Writing through the label-based
# `.at[k]` only reaches the intended row when the index happens to be 0..N-1.
cleaned_frames = []
for k in tqdm(range(data.shape[0])):
    # Work on a copy so the frame currently stored in `data` is replaced, not edited.
    local = data.tag_df.iloc[k].copy()
    local['word'] = local['word'].apply(_strip_special_chars)
    cleaned_frames.append(local)

data['tag_df'] = pd.Series(cleaned_frames, index=data.index)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

In [29]:
"""Collecting verbs, adverbs, and adjectives"""
# WordNetLemmatizer is already imported in the notebook's import cell.
lmtzr = WordNetLemmatizer()
unique_verbs = set()
unique_adverbs = set()
unique_adjectives = set()
# Surface forms of the verbs as they appear in the tweets (before lemmatisation).
verb_forms = set()


def _strip_separators(word):
    """Remove '-', '/' and backslash characters from a token."""
    return word.replace('-','').replace('/','').replace('\\','')


def _is_plain_token(word):
    """True when the token has at most one '-', at most one '/', no digit and no '#'."""
    return (word.count('-')<=1 and word.count('/')<=1
            and not any(d in word for d in '0123456789') and '#' not in word)


for k in tqdm(range(data.shape[0])):
    local = data.tag_df.iloc[k]
    local = local[local.word.apply(_is_plain_token).astype(bool)]
    verb_forms.update(_strip_separators(w) for w in local[local.tag=='V'].word)
    unique_adverbs.update(_strip_separators(w) for w in local[local.tag=='R'].word)
    unique_adjectives.update(_strip_separators(w) for w in local[local.tag=='A'].word)

# unique_verbs keeps its meaning: the set of verb lemmas.
unique_verbs.update(lmtzr.lemmatize(form, 'v') for form in verb_forms)

"""Lemmatizing all verbs"""
# Keys cover the surface forms as well as the lemmas. Building the mapping from
# unique_verbs alone only produced lemma -> lemma entries, so an inflected verb
# was never found when lem_words is used as a lookup table.
lem_verbs = {verb: lmtzr.lemmatize(verb, 'v') for verb in verb_forms | unique_verbs}
lem_adj = {adj: lmtzr.lemmatize(adj, 'a') for adj in unique_adjectives}
lem_adv = {adv: adv for adv in unique_adverbs}
# Later updates win: adjectives override adverbs, verbs override both.
lem_words = dict()
lem_words.update(lem_adv)
lem_words.update(lem_adj)
lem_words.update(lem_verbs)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

In [55]:
"""Building the neighborhoods"""
# All collected verbs, adverbs and adjectives in a single set.
unique_corpus = unique_verbs | unique_adverbs | unique_adjectives

def build_neighborhood(local):
    """Collect the context windows around the verbs, adverbs and adjectives of one tweet.

    Rows are dropped when the word contains more than one '-', more than one '/',
    a digit or a '#', or when the tag is one of 'U', '&', ',', '$', '!', '^', '#'
    (presumably URL, conjunction, punctuation, numeral, interjection, proper noun
    and hashtag in the tagger's tagset -- confirm against its documentation).
    In the remaining words '-' and '/' become spaces and backslashes and
    apostrophes are removed.

    A word is a window centre when it belongs to the module-level ``unique_corpus``
    or is tagged 'V', 'R' or 'A'. For every centre, the words up to 2, 3 and 4
    positions on either side (clipped at the tweet boundaries) are recorded.

    Side effect: words tagged 'V' are added to the module-level ``unique_verbs``.
    When the function runs in a ``multiprocessing`` worker this only changes the
    worker's copy of the set.

    Parameters
    ----------
    local : pandas.DataFrame
        Tagged tokens of one tweet with at least the columns ``word`` and ``tag``.
        The frame passed in is not modified.

    Returns
    -------
    tuple of three lists
        ``(windows_5, windows_7, windows_9)``, each a list of word lists with one
        entry per centre. ``([], [], [])`` is returned when no row survives the
        filters or when processing fails.
    """
    excluded_tags = ['U','&',',','$','!','^','#']

    def _keep(word):
        return (word.count('-')<=1 and word.count('/')<=1
                and not any(d in word for d in '0123456789') and '#' not in word)

    try:
        local = local[local.word.apply(_keep).astype(bool)]
        local = local[~local.tag.isin(excluded_tags)]
        if local.shape[0]==0:
            return([],[],[])

        # Explicit copy: the column is rewritten below and must not be a view of a slice.
        local = local.copy()
        local['word'] = local.word.apply(lambda x: x.replace('-',' ').replace('/',' ').replace('\\','').replace("'",''))

        words = list(local.word)
        tags = list(local.tag)
        n_words = len(words)
        local_words_5 = []
        local_words_7 = []
        local_words_9 = []
        for i, (w, tag) in enumerate(zip(words, tags)):
            if w in unique_corpus or tag in ['V','R','A']:
                for radius, bucket in ((2, local_words_5), (3, local_words_7), (4, local_words_9)):
                    bucket.append(words[max(0,i-radius):min(n_words,i+radius+1)])
            # If the word is a verb, add it to the bank of verbs
            if tag=='V' and not any(c in w for c in '#&-/0123456789'):
                unique_verbs.add(w)
        return(local_words_5, local_words_7, local_words_9)
    except Exception:
        # Best effort: a malformed tag frame yields no neighborhoods rather than
        # aborting the whole run; KeyboardInterrupt/SystemExit still propagate.
        return ([],[],[])

neighborhoods = []
# The pool is used as a context manager so that its 40 worker processes are
# released when the loop finishes or is interrupted.
with Pool(40) as p:
    # Tag frames are sent in slices of 500 so that the progress bar advances per slice.
    # Stepping by 500 avoids mapping an empty slice when N is a multiple of 500.
    for start in tqdm(range(0, N, 500)):
        neighborhoods += p.map(build_neighborhood, list(data.tag_df.iloc[start:start+500]))

HBox(children=(IntProgress(value=0, max=47), HTML(value='')))

In [59]:
def _lemmatized_windows(slot):
    """Lower-case every word of the windows stored at position ``slot`` of each
    ``neighborhoods`` entry and replace it by its ``lem_words`` value when it has one."""
    converted = []
    for k in tqdm(range(len(neighborhoods))):
        windows = neighborhoods[k][slot]
        converted.append([[lem_words.get(w.lower(), w.lower()) for w in v] for v in windows])
    return converted

# Positions 0, 1 and 2 hold the windows produced by build_neighborhood, in that order.
actual_words_5 = _lemmatized_windows(0)
actual_words_7 = _lemmatized_windows(1)
actual_words_9 = _lemmatized_windows(2)

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

In [60]:
"""Building full representation"""
def _normalise_token(word, tag):
    """Lemmatise words tagged 'V', 'R', 'A' or 'N' (the lower-cased tag is passed
    to the lemmatizer as part of speech); lower-case any other word; map
    non-string words to ''."""
    if type(word)!=str:
        return ''
    if tag in ['V','R','A','N']:
        return lmtzr.lemmatize(word.replace("'",''), tag.lower())
    return word.lower()

full_representation = []
for k in tqdm(range(data.shape[0])):
    # NOTE: `local` is the frame held in data.tag_df, so the column assignments
    # below also rewrite the stored frame.
    local = data.tag_df.iloc[k]
    if local.shape[0]==0:
        full_representation.append([])
        continue
    local['word'] = local.word.apply(lambda x: x.replace('-',' ').replace('/',' ').replace('\\','').replace("'",''))
    local['word'] = [_normalise_token(w, t) for w, t in zip(local.word, local.tag)]
    full_representation.append(list(local.word))

HBox(children=(IntProgress(value=0, max=23102), HTML(value='')))

In [37]:
# Debugging aid: display the tag frame left in `local` by the most recently run loop cell.
local

Unnamed: 0,word,tag,score
0,you,O,0.9839
1,good,A,0.5916
2,believe,V,0.8031
3,i,O,0.9992
4,get,V,1.0
5,myself,O,0.9777
6,fucking,R,0.3936
7,mcdonald,^,0.8266
8,tonight,R,0.5139
9,to,P,0.9965


In [58]:
"""Saving the neighborhood representation as well as the full representation for further use"""
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` with the highest protocol.

    The file is opened in a ``with`` block so the handle is flushed and closed
    even if pickling fails.
    """
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

_dump_pickle(data, './processed_MCD.b')
_dump_pickle((actual_words_5, actual_words_7, actual_words_9), './neighborhoods_mcd.b')
_dump_pickle(full_representation, './full_mcd.b')
_dump_pickle((lem_verbs, lem_adj, lem_adv, lem_words, unique_verbs, unique_adverbs, unique_adjectives), './vocab_MCD.b')