In [9]:
import pandas as pd
import numpy as np
from ark_tweet_nlp import CMUTweetTagger
from tqdm import tqdm_notebook as tqdm
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
import pickle
import copy
import re

In [3]:
"""Loading the data"""
# The input file is tab-separated; its first column is used as the row index.
data = pd.read_csv(
    'basic_data.csv',
    index_col=0,
    sep='\t',
)

In [5]:
"""Function to collapse repetitions"""
def collapse_repeat(s):
    """Shorten every run of three or more identical characters to two.

    Runs of length one or two are left untouched, so ``"soooo"`` becomes
    ``"soo"`` while ``"soo"`` and ``"so"`` are returned unchanged.

    The text is scanned once, left to right, so the cost is linear in
    ``len(s)`` even when the input contains very long runs.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        ``s`` itself when nothing has to be trimmed, otherwise a new string
        with the surplus repeated characters removed.
    """
    if len(s) <= 2:
        # Too short to contain a run of three.
        return s

    kept = []
    run_char = None
    run_length = 0
    for position, char in enumerate(s):
        if position > 0 and char == run_char:
            run_length += 1
        else:
            # A different character starts a new run.
            run_char = char
            run_length = 1
        # Only the first two characters of any run survive.
        if run_length <= 2:
            kept.append(char)

    if len(kept) == len(s):
        # Nothing was dropped: hand back the original object.
        return s
    return ''.join(kept)

In [7]:
"""Funciton to normalize verbs"""
def normalize(text):
    """Lower-case ``text`` and rewrite a fixed set of English contractions.

    The suffixes ``'ll``, ``'re``, ``'ve`` and ``'m`` are expanded to
    `` will``, `` are``, `` have`` and `` am``. The listed negative
    contractions simply lose their apostrophe (``don't`` -> ``dont``).
    Every rewrite is a plain substring substitution, applied to the whole
    text in the order given below.
    """
    rewrites = (
        # Expanded auxiliary suffixes.
        ("'ll", ' will'),
        ("'re", ' are'),
        ("'ve", ' have'),
        ("'m", ' am'),
        # Negations: the apostrophe is dropped, the word is kept whole.
        ("don't", 'dont'),
        ("haven't", 'havent'),
        ("wouldn't", 'wouldnt'),
        ("couldn't", 'couldnt'),
        ("can't", "cant"),
        ("won't", 'wont'),
        ("didn't", 'didnt'),
        ("shouldn't", 'shouldnt'),
    )
    result = text.lower()
    for pattern, substitute in rewrites:
        result = result.replace(pattern, substitute)
    return result

In [8]:
"""Building company designations"""
# Each company is listed as the words that make up its name; the first entry
# (British Airways) is handled separately when the designations are built.
# NOTE(review): 'american airways' may have been meant as American Airlines - confirm.
companies = [
    ['british', 'airways'],
    ['iceland', 'air'],
    ['vueling'],
    ['easy', 'jet'],
    ['air', 'canada'],
    ['american', 'airways'],
    ['united', 'airlines'],
    ['qatar', 'airways'],
]

def generate_versions(name):
    """Return the spellings under which a company name is looked up.

    ``name`` is the sequence of words forming the company name.

    * With two or more words the result holds, in this order: the words
      concatenated, the words joined by ``_``, both of those prefixed with
      ``@`` (underscore form first), both prefixed with ``#`` (underscore
      form first), and finally the words joined by a space.
    * With a single word the result is ``['@word', 'word', '#word']``.
    """
    parts = list(name)
    if len(parts) < 2:
        handle = name[0]
        return ['@' + handle, handle, '#' + handle]

    glued = ''.join(parts)
    snake = '_'.join(parts)
    spaced = ' '.join(parts)
    return [
        glued,
        snake,
        '@' + snake,
        '@' + glued,
        '#' + snake,
        '#' + glued,
        spaced,
    ]

# The first company (British Airways) gets the short designation 'ba' for
# every one of its spellings.
ba = generate_versions(companies[0])
designate = dict.fromkeys(ba, 'ba')

# Every other company is designated by its words concatenated together.
for c in companies[1:]:
    des = generate_versions(c)
    canonical = ''.join(c)
    designate.update((word, canonical) for word in des)
designate

{'britishairways': 'ba',
 'british_airways': 'ba',
 '@british_airways': 'ba',
 '@britishairways': 'ba',
 '#british_airways': 'ba',
 '#britishairways': 'ba',
 'british airways': 'ba',
 'icelandair': 'icelandair',
 'iceland_air': 'icelandair',
 '@iceland_air': 'icelandair',
 '@icelandair': 'icelandair',
 '#iceland_air': 'icelandair',
 '#icelandair': 'icelandair',
 'iceland air': 'icelandair',
 '@vueling': 'vueling',
 'vueling': 'vueling',
 '#vueling': 'vueling',
 'easyjet': 'easyjet',
 'easy_jet': 'easyjet',
 '@easy_jet': 'easyjet',
 '@easyjet': 'easyjet',
 '#easy_jet': 'easyjet',
 '#easyjet': 'easyjet',
 'easy jet': 'easyjet',
 'aircanada': 'aircanada',
 'air_canada': 'aircanada',
 '@air_canada': 'aircanada',
 '@aircanada': 'aircanada',
 '#air_canada': 'aircanada',
 '#aircanada': 'aircanada',
 'air canada': 'aircanada',
 'americanairways': 'americanairways',
 'american_airways': 'americanairways',
 '@american_airways': 'americanairways',
 '@americanairways': 'americanairways',
 '#americ

In [10]:
"""Function to replace the designations"""
def replace_designate(text, mapping=None):
    """Replace company mentions in ``text`` with their designation.

    Each key of the mapping is an alias (e.g. ``'@britishairways'``) and each
    value is the designation that should stand in for it (e.g. ``'ba'``).
    An alias is only replaced where it is immediately followed by a space,
    so a mention at the very end of the text or directly before punctuation
    is left as it is.

    Parameters
    ----------
    text : str
        Text to process.
    mapping : dict, optional
        Alias -> designation table. Defaults to the module-level
        ``designate`` dictionary.

    Returns
    -------
    str
        The text with every matched alias replaced.
    """
    if mapping is None:
        mapping = designate

    result = text
    # Apply the longest aliases first. Several aliases are substrings of
    # others ('britishairways' inside '@britishairways'); replacing the short
    # one first would turn '@britishairways ' into '@ba ' and the prefixed
    # alias would never get a chance to match.
    for alias in sorted(mapping, key=len, reverse=True):
        result = result.replace(alias + ' ', mapping[alias] + ' ')
    return result

In [12]:
"""Preprocessing the data"""
# Clean the text column in four passes, in this order: collapse repeated
# characters, strip double quotes, normalize contractions, then replace
# company mentions with their designations.
data['text'] = (
    data.text
    .apply(collapse_repeat)
    .apply(lambda x: x.replace('"', ''))
    .apply(normalize)
    .apply(replace_designate)
)

def tagg(x):
    """Run the CMU tweet tagger on one text and return the result as a table.

    ``x`` is passed to ``CMUTweetTagger.runtagger_parse`` as a one-element
    list; the first (only) entry of what it returns is wrapped in a
    DataFrame with the columns ``word``, ``tag`` and ``score``.
    """
    parsed = CMUTweetTagger.runtagger_parse([x])
    first_result = parsed[0]
    return pd.DataFrame(first_result, columns=['word', 'tag', 'score'])

# Tag every preprocessed text, one tagger call per row, in row order.
tag_text = [tagg(text) for text in tqdm(data.text)]

# Build the Series on data's own index. A bare pd.Series(tag_text) would get a
# fresh 0..n-1 index and be aligned by label on assignment, which misplaces
# rows (or fills NaN) whenever the index read from the CSV is not exactly
# 0..n-1.
data['tag_df'] = pd.Series(tag_text, index=data.index)

# Use a context manager so the output file is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('./label_tag_data.p', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

HBox(children=(IntProgress(value=0, max=11540), HTML(value='')))


