In [55]:
from nltk.tokenize import word_tokenize
import os

In [56]:
import context2vec.eval as c2v
import os
# Location of the pretrained context2vec parameters. The directory can be
# overridden with the C2V_MODEL_DIR environment variable so the notebook is not
# tied to one machine; the original absolute path is kept as the default.
MODEL_DIR = os.environ.get('C2V_MODEL_DIR', '/home/michael/Documents/MVA/NLP/MVA_2018_SL/TD_#3/model/')
MODEL_NAME = 'context2vec.ukwac.model.params'
PATH = os.path.join(MODEL_DIR,MODEL_NAME)
print(PATH)

/home/michael/Documents/MVA/NLP/MVA_2018_SL/TD_#3/model/context2vec.ukwac.model.params


In [57]:
#!/usr/bin/env python
import numpy
import six
import sys
import traceback
import re

from chainer import cuda
from context2vec.common.context_models import Toks
from context2vec.common.model_reader import ModelReader

class ParseException(Exception):
    """Error raised when an input line cannot be turned into a context2vec query."""

    def __init__(self, str):
        # The parameter keeps its historical name (it shadows the builtin) so
        # keyword callers keep working.
        Exception.__init__(self, str)

def parse_input(line):
    """Split a query line into tokens and locate the bracketed target.

    The line is stripped and split on whitespace. A token that starts with
    ``[`` and contains a later ``]`` marks the target: ``[]`` becomes ``None``
    (empty slot), anything else has its first and last characters removed.
    If several tokens match, the position of the last one is returned.

    Returns
    -------
    (sent, target_pos) : (list, int or None)
        The token list with the target unwrapped, and the index of the target
        (``None`` when no bracketed token exists).
    """
    # Raw string: same pattern as before, without invalid-escape warnings.
    target_exp = re.compile(r'\[.*\]')
    sent = line.strip().split()
    target_pos = None
    for i, word in enumerate(sent):
        if target_exp.match(word) is not None:
            target_pos = i
            sent[i] = None if word == '[]' else word[1:-1]
    return sent, target_pos

def mult_sim(w, target_v, context_v):
    """Element-wise product of the clamped target and context similarities.

    ``w.dot(target_v)`` and ``w.dot(context_v)`` are computed, negative entries
    of each result are set to 0.0, and the two results are multiplied
    element-wise.
    """
    def _clamped_projection(vector):
        scores = w.dot(vector)
        scores[scores<0] = 0.0
        return scores

    target_similarity = _clamped_projection(target_v)
    context_similarity = _clamped_projection(context_v)
    return target_similarity * context_similarity

def load_c2v_model(model_param_file):
    """Load a context2vec model through ``ModelReader``.

    Returns the list ``[model, w, word2index, index2word]`` built from the
    reader's attributes of the same names; ``evalc2v`` indexes into this list.
    """
    reader = ModelReader(model_param_file)
    w, word2index, index2word, model = (
        reader.w, reader.word2index, reader.index2word, reader.model)
    return [model, w, word2index, index2word]


def evalc2v(input_line, c2v_model,verbose=False,n_result=10):
    """Rank vocabulary words for the bracketed slot of ``input_line``.

    Parameters
    ----------
    input_line : str
        Whitespace-separated sentence holding a bracketed token, ``[]`` for an
        empty slot or ``[word]`` for a target word (see ``parse_input``).
    c2v_model : sequence
        ``[model, w, word2index, index2word]`` as built by ``load_c2v_model``.
    verbose : bool
        When true, print each ``word: score`` pair as it is collected.
    n_result : int
        Maximum number of results to collect.

    Returns
    -------
    list of (word, score)
        Results ordered by decreasing similarity; NaN scores are skipped.
        Errors are printed rather than raised, and whatever was collected so
        far (normally an empty list) is returned.
    """
    eval_list = []
    gpu = -1 # todo: make this work with gpu

    if gpu >= 0:
        cuda.check_cuda_available()
        cuda.get_device(gpu).use()
    xp = cuda.cupy if gpu >= 0 else numpy

    try:
        # The model parts travel in c2v_model; the previous code read `w` and
        # `word2index` as globals, which raised NameError for `[word]` targets.
        model, w, word2index, index2word = c2v_model

        sent, target_pos = parse_input(input_line)
        if target_pos is None:
            raise ParseException("Can't find the target position.")

        if sent[target_pos] is None:
            target_v = None
        elif sent[target_pos] not in word2index:
            raise ParseException("Target word is out of vocabulary.")
        else:
            target_v = w[word2index[sent[target_pos]]]

        if len(sent) > 1:
            context_v = model.context2vec(sent, target_pos)
            # Normalise the context vector to unit length.
            context_v = context_v / xp.sqrt((context_v * context_v).sum())
        else:
            context_v = None

        if target_v is not None and context_v is not None:
            similarity = mult_sim(w, target_v, context_v)
        else:
            if target_v is not None:
                v = target_v
            elif context_v is not None:
                v = context_v
            else:
                raise ParseException("Can't find a target nor context.")
            similarity = (w.dot(v)+1.0)/2 # Cosine similarity can be negative, mapping similarity to [0,1]

        count = 0
        for i in (-similarity).argsort():
            if numpy.isnan(similarity[i]):
                continue
            eval_list.append((index2word[i], similarity[i]))

            if verbose:
                print('{0}: {1}'.format(index2word[i], similarity[i]))
            count += 1
            if count == n_result:
                break

    except ParseException as e:
        print("ParseException: {}".format(e))
    except Exception:
        # Best-effort behaviour kept from the original: report and carry on.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        print("*** print_tb:")
        traceback.print_tb(exc_traceback, limit=1, file=sys.stdout)
        print("*** print_exception:")
        traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout)
    return eval_list

In [58]:
# Load the pretrained model once; the resulting list is passed to every evalc2v call below.
c2v_model = load_c2v_model(PATH)

Reading config file: /home/michael/Documents/MVA/NLP/MVA_2018_SL/TD_#3/model/context2vec.ukwac.model.params
Config:  {'config_path': '/home/michael/Documents/MVA/NLP/MVA_2018_SL/TD_#3/model/', 'model_file': 'context2vec.ukwac.model', 'deep': 'yes', 'drop_ratio': '0.0', 'words_file': 'context2vec.ukwac.words.targets', 'unit': '300'}


In [59]:
# Smoke test: print the 10 best fillers for the empty slot.
eval_vec = evalc2v('This is a [] book', c2v_model,True,10)

hard-back: 0.551928520203
must-read: 0.550950348377
32-page: 0.546000599861
spiral-bound: 0.544216036797
64-page: 0.541346013546
coffee-table: 0.539836406708
new: 0.538940668106
self-published: 0.538060128689
48-page: 0.537546575069
best-selling: 0.536374986172


### Levenshtein distance

In [60]:
import numpy as np
def computeEditDistance(str1, str2,w_del=1,w_ins=1,w_sub=1):
    """Case-insensitive weighted edit distance between ``str1`` and ``str2``.

    Both strings are lower-cased, then a dynamic-programming table is filled:
    equal characters copy the diagonal cell, otherwise the cell takes the
    cheapest of deletion (``w_del``), insertion (``w_ins``) and substitution
    (``w_sub``).

    Note: the first row and column are initialised with the plain counts
    ``0..len`` (not scaled by ``w_del`` / ``w_ins``), so the weights only
    affect interior cells.

    Returns the bottom-right table entry (a numpy float).
    """
    source = str1.lower()
    target = str2.lower()
    n_rows = len(source)+1
    n_cols = len(target)+1

    table = np.zeros((n_rows, n_cols))
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)

    for row in range(1, n_rows):
        for col in range(1, n_cols):
            if source[row-1] == target[col-1]:
                table[row, col] = table[row-1, col-1]
                continue
            deletion = table[row-1, col]+w_del
            insertion = table[row, col-1]+w_ins
            substitution = table[row-1, col-1]+w_sub
            table[row, col] = min(deletion, insertion, substitution)
    return table[n_rows-1, n_cols-1]

When the deletion, insertion and substitution weights differ, the edit distance is no longer symmetric.

### Import the corpus, word dictionary, slang dictionary

#### Word corpus

In [61]:
# Path of the raw tweet corpus, relative to the notebook's working directory.
dataset = 'Corpus/CorpusBataclan_en.1M.raw.txt'
# NOTE(review): this module-level list is not read by any cell visible here.
text = []
def read_dataset(dataset):
    """Read the text file at path ``dataset`` and return its lines as a list.

    Each entry is the part of the line that precedes its first newline
    character, so the trailing newline is dropped.
    """
    with open(dataset, "r") as f:
        raw_lines = f.readlines()
    return [raw_line.split('\n')[0] for raw_line in raw_lines]

def write_output(filename,text_list):
    """Write every entry of ``text_list`` to ``filename``, one per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(filename, "w") as f:
        f.writelines(tweet+'\n' for tweet in text_list)
    

In [62]:
# Load the whole corpus into memory, one tweet per list entry.
text_full = read_dataset(dataset)

In [63]:
# Number of tweets read from the corpus file.
len(text_full)

1000000

#### Slang dictionary

In [64]:
# Slang / abbreviation lookup used by correctSlang.
# Outer key: first letter of the abbreviation; inner dict: abbreviation -> expansion.
# Every letter a-z has an entry (possibly empty), so dict_abbreviation[letter]
# never fails for a lowercase ASCII letter.
dict_abbreviation = {'a':{'atm':'at the moment',
                          'af':'as fuck',
                          'afk':'away from keyboard'},
                     'b':{'b':'be',
                          'bc':'because',
                          'brb':'be right back',
                          'bro':'friend',
                          'bb':'baby',
                          'bm':'bad manners',
                          'bs':'bullshit',
                          'bbq':'barbecue'},
                     'c':{'cya':'see you later',
                          'cu':'see you later',
                          'cus':'because',
                         },
                     'd':{'dis':'this',
                          'dat':'that',
                          'dawg':'friend',
                          'dafuq':'what the fuck',
                          'dm':'direct message',
                          'dang':'wow'},
                     'e':{'ez':'easy'
                         },
                     'f':{'fu':'fuck you',
                          'ffs':'for fuck sake',
                          'fr':'for real',
                          'fml':'fuck my life'},
                     'g':{'gov':'government',
                          'gg':'good game',
                          'ggla':'good game, love all',
                          'gr8':'great',
                          'gtfo':'go away',
                          'gth':'go to hell',
                          'gtg':'have to go'
                         },
                     'h':{},
                     'i':{'ikr':'I know, right?',
                          'idc':'I do not care',
                          'idgaf':'I do not care',
                          'immo':'in my modest opinion',
                          'iirc':'if I remember correctly'
                         },
                     'j':{},
                     'k':{'k':'okay',
                          'kk':'okay'
                         },
                     # 'lmao' previously expanded to '*laugh' (missing closing
                     # asterisk), unlike the other laughter markers.
                     'l':{'lmao':'*laugh*',
                          'lmfao':'*laugh*',
                          'lol':'*laugh*'
                         },
                     'm':{'m8':'mate',
                          'mf':'motherfucker'
                         },
                     'n':{'nsfw':'not safe for work',
                          'nbd':'no big deal'},
                     'o':{'ok':'okay',
                          'omg':'oh my god',
                          'omfg':'oh my fucking god'},
                     'p':{'pm':'private message',
                          'ppl':'people',
                          'plz':'please'
                         },
                     'q':{},
                     'r':{'r':'are',
                          'rofl':'*laugh*',
                          'rn':'right now',
                          'ru':'are you'},
                     's':{'smh':'shaking my head',
                          'sis':'friend',
                          'sry':'sorry',
                          'stfu':'shut the fuck up',
                          'smth':'something'
                         },
                     't':{'ttyl':'talk to you later',
                          'tn':'tonight'},
                     'u':{'u':'you',
                          'ur':'your'},
                     'v':{},
                     'w':{'wtf':'what the fuck',
                          'wth':'what the hell'},
                     'x':{'xoxo':'kissing and hugging',
                          'xo':'kissing',
                          'xx':'kissing'},
                     'y':{'yolo':'you only live once'},
                     'z':{}
                    }

### Import word dictionary

In [65]:
def abcTo123():
    """Return a new dict mapping each lowercase letter 'a'..'z' to 0..25."""
    return {letter: position
            for position, letter in enumerate('abcdefghijklmnopqrstuvwxyz')}

def build_indexed_alphabet(word_dict_txt):
    """Group words by first letter.

    Returns a list of 26 lists; entry ``k`` holds, in input order, the words of
    ``word_dict_txt`` that start with the k-th lowercase letter. Words starting
    with anything else (uppercase, digits, ...) are left out.
    """
    return [[entry for entry in word_dict_txt if entry.startswith(initial)]
            for initial in 'abcdefghijklmnopqrstuvwxyz']

In [66]:
# Word list read from 'large.txt', one entry per line (see read_dataset).
large_dict_txtfile = read_dataset('large.txt')

In [67]:
# Bucket the word list by first letter: large_dict[0] holds the 'a' words, and so on.
large_dict = build_indexed_alphabet(large_dict_txtfile)

### Define all tweet normalization functions

In [68]:
# Removing non ASCII elements

def remove_no_ascii(text):
    """Drop non-ASCII characters and decode a few HTML entities.

    Applied in order: runs of characters outside ``\\x00-\\x7F`` are removed,
    ``&amp;`` becomes ``and``, ``&gt;`` becomes ``>`` and ``&lt;`` becomes ``<``.
    """
    import re
    substitutions = (
        (r'[^\x00-\x7F]+', ''),
        (r'&amp;', r'and'),
        (r'&gt;', r'>'),
        (r'&lt;', r'<'),
        # Replaces an apostrophe with an apostrophe: has no visible effect.
        (r'\'', r"'"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [69]:
# Check: the escaped ampersand is rewritten as "and".
a = 'Don\'t do drugs &amp; alcohol'
remove_no_ascii(a)

"Don't do drugs and alcohol"

In [70]:
# Removing RTs

def clear_RTs(text):
    """Remove retweet markers such as ``RT @user:`` from ``text``.

    Each standalone ``RT`` token is removed together with everything up to and
    including the next colon that is followed by a space. When no such colon
    exists, the text is cut from ``RT`` to the end, as before.

    ``RT`` must be a separate token: the previous substring search also fired
    inside words such as "ALERT" or "SMART" and deleted the rest of the tweet.
    """
    marker = re.compile(r'\bRT\b')
    clean_text = text
    match = marker.search(clean_text)
    while match is not None:
        idx = match.start()
        # find() returns the colon's index; +1 keeps the space that follows it.
        idx_end = clean_text.find(': ',idx)+1
        if idx_end == 0:
            idx_end = len(clean_text)
        clean_text = clean_text[:idx] + clean_text[idx_end:]
        match = marker.search(clean_text)

    return clean_text

In [71]:
def clear_At(text):
    """Strip @-mentions: an ``@`` followed by 1 to 15 letters, digits or underscores."""
    mention = re.compile(r'@[a-zA-Z0-9_]{1,15}')
    return mention.sub('', text)

In [72]:
# Removing HTML

def clear_HTMLs(text):
    """Remove simple HTML tags from ``text`` and keep the text between them.

    Opening, closing and self-closing tags made of a purely alphanumeric name
    are removed (``<b>``, ``</div>``, ``<br/>``). The previous pattern accepted
    one-character names only, so ``<br>`` or ``<div>`` were left in. Tags with
    attributes are not matched.
    """
    return re.sub(r'</?[a-zA-Z0-9]+\s*/?>','',text)

In [73]:
# Check: the bold tags are removed and the enclosed text is kept.
a = '<b>Facebook</b>'
clear_HTMLs(a)

'Facebook'

In [74]:
def parseHashtags(text):
    """Put a space before every hashtag and hyphenate the words inside it.

    For each ``#`` the following run of letters and digits is taken as the
    hashtag body, then rewritten in this order:

    1. a space is inserted before the ``#`` when a letter or digit follows;
    2. a digit between two letters is wrapped in hyphens (``y4P`` -> ``y-4-P``);
    3. a digit followed by a letter gets a hyphen between them;
    4. a lowercase letter followed by an uppercase letter or digit gets a hyphen;
    5. two capitals followed by a lowercase letter or digit are split after the
       first capital (``SSt`` -> ``S-St``).
    """
    rewrite_rules = (
        (r'#([a-zA-Z0-9])', r' #\1'),
        (r'([a-zA-Z])([0-9])([a-zA-Z])', r'\1-\2-\3'),
        (r'([0-9])([a-zA-Z])', r'\1-\2'),
        (r'([a-z])([A-Z0-9])', r'\1-\2'),
        (r'([A-Z]{1})([A-Z]{1})([a-z0-9])', r'\1-\2\3'),
    )

    result = text
    start = result.find('#')
    while start != -1:
        # Extend the span over the alphanumeric characters after '#'.
        span = 1
        while start+span < len(result) and (
                result[start+span].isalpha() or result[start+span].isdigit()):
            span += 1

        tag = result[start:start+span]
        for pattern, replacement in rewrite_rules:
            tag = re.sub(pattern, replacement, tag)

        result = result[:start] + tag + result[start+span:]
        # Resume after the rewritten tag so its own '#' is not found again.
        start = result.find('#', start+len(tag))

    return result

In [75]:
# Check: glued hashtags are separated and their inner words hyphenated.
a = 'hey #Pray4Paris#Hello#world#4Paris#ELLISStory#wORLD for#4Paris'
parseHashtags(a)

'hey  #Pray-4-Paris #Hello #world #4-Paris #ELLIS-Story #w-ORLD for #4-Paris'

In [94]:
def correctSlang(text,dictionnary):
    """Expand slang abbreviations found in ``text``.

    Parameters
    ----------
    text : str
        The tweet to normalise.
    dictionnary : dict
        Two-level mapping ``first letter -> {abbreviation -> expansion}`` with
        lowercase keys (see ``dict_abbreviation``).

    The tweet is tokenised on the same punctuation set as before. Each token
    that starts with a letter and is a key of the dictionary is replaced,
    case-insensitively, wherever it stands alone (no letter directly before or
    after it). Tokens are processed in order, each on the already-updated text.

    Fixes over the previous index-based scan:
    * a match at position 0 no longer looks at the last character of the
      tweet (negative-index wrap), so 'u are great' is now expanded;
    * a tweet made of exactly one slang word is expanded;
    * a first letter missing from ``dictionnary`` is skipped instead of
      raising KeyError.
    """
    clean_text = text
    split = re.split(r"-|;|:|\(|\)|\[|\]|\.| |,|!|\~|\?",text)
    parsed_tweet = [s for s in split if len(s)!=0]

    for word in parsed_tweet:
        word = word.lower()
        if not word[0].isalpha():
            continue
        expansion = dictionnary.get(word[0], {}).get(word)
        if expansion is None:
            continue
        # [^\W\d_] matches a letter; the lookarounds demand a non-letter (or a
        # string edge) on both sides, like the former isalpha() checks.
        standalone = re.compile(
            r'(?<![^\W\d_])' + re.escape(word) + r'(?![^\W\d_])',
            re.IGNORECASE)
        # A callable replacement keeps the expansion literal (no escape processing).
        clean_text = standalone.sub(lambda match, repl=expansion: repl, clean_text)

    return clean_text
            

In [95]:
# Check: mixed-case abbreviations are expanded; '"GTfO' is left alone because
# its token starts with a quote character.
a = 'SRY GtG, Brb !  WTF??? "GTfO and stfu ffs!"... R U kidding me ? r u freaking fR? fu m8! LMFAO ROFL LoL FmL, iKr YoLo rn m8,smh'
correctSlang(a,dict_abbreviation)

'sorry have to go, be right back !  what the fuck??? "GTfO and shut the fuck up for fuck sake!"... are you kidding me ? are you freaking for real? fuck you mate! *laugh* *laugh* *laugh* fuck my life, I know, right? you only live once right now mate,shaking my head'

In [96]:
def remove_ends(text):
    """Trim non-letter characters from both ends of ``text``.

    Leading characters are dropped up to the first letter. The tail is then
    shortened until the last two characters are both letters (or at most one
    character is left): two trailing non-letters are removed together,
    otherwise the last character is removed. A single trailing letter that
    follows a non-letter is therefore removed as well.
    """
    first_letter = 0
    while first_letter < len(text) and not text[first_letter].isalpha():
        first_letter += 1
    trimmed = text[first_letter:]

    while len(trimmed) > 1:
        last_is_letter = trimmed[-1].isalpha()
        previous_is_letter = trimmed[-2].isalpha()
        if last_is_letter and previous_is_letter:
            break
        if not last_is_letter and not previous_is_letter:
            trimmed = trimmed[:-2]
        else:
            trimmed = trimmed[:-1]

    return trimmed

In [97]:
# Check: only the central word is left; lengths before and after are shown.
a = ' ?1. hahlaalalojjh j,.:; ;   .  '
print(len(a))
print(remove_ends(a))
len(remove_ends(a))

32
hahlaalalojjh


13

In [98]:
def clear_URLs(text):
    """Remove every URL-like chunk from ``text``.

    A chunk starts at an occurrence of ``http`` and runs to the next space (or
    to the end of the text). All occurrences of that exact chunk are removed
    before the search starts again.
    """
    remaining = text
    while True:
        start = remaining.find('http')
        if start == -1:
            return remaining
        stop = remaining.find(' ',start)
        if stop == -1:
            stop = len(remaining)
        remaining = remaining.replace(remaining[start:stop], '')

In [99]:
# Check: the trailing URL is removed, the space before it stays.
a = 'click on this link: https://www.thelink.com/'
clear_URLs(a)

'click on this link: '

In [100]:
def remove_verb_contractions(text):
    """Expand or isolate English verb contractions in ``text``.

    Rules, applied in this order (matching is case-insensitive; an expansion is
    upper-cased when the matched contraction is entirely upper-case):

    * ``'s`` after "he", "it" or "at" (he's, she's, it's, that's ...) is
      detached as `` 's `` so it can be resolved later;
    * ``'d`` after "he", "it", "at" or "ou" is detached as `` 'd ``;
    * ``'re`` after u/y/e -> `` are``;
    * ``'ve`` after i/u/y/e/t -> `` have``;
    * ``'ll`` after i/u/y/e/t -> `` will``;
    * ``n't`` after "a" -> ``nnot`` (can't -> cannot), after o/s/e/d -> `` not``;
    * standalone ``gonna`` / ``wanna`` / ``gotta`` -> ``going to`` / ``want to``
      / ``got to``;
    * ``i'm`` -> ``i am`` (``I'M`` -> ``I AM``).

    Each rule applies only when the contraction is not followed by a letter.

    Fixes over the previous hand-written index loops:
    * a contraction at index 1 ("I've", "I'll" at the start of a tweet) no
      longer stops the whole rule;
    * matches at index 0 no longer read characters from the end of the string;
    * the character after ``'s`` / ``'d`` is only consumed when it is
      whitespace, so following punctuation is kept;
    * ``I'M`` now becomes ``I AM`` (that substitution used to be unreachable).
    """
    letter = r'[^\W\d_]'
    not_before_letter = r'(?!' + letter + r')'
    not_after_letter = r'(?<!' + letter + r')'

    def _substitute(source, pattern, expansion):
        # Replace group 1 of every match, upper-casing the expansion when the
        # matched text is all upper-case.
        def _replacement(match):
            found = match.group(1)
            return expansion.upper() if found.upper() == found else expansion
        return re.compile(pattern, re.IGNORECASE).sub(_replacement, source)

    def _after(preceding, contraction, trailing=''):
        # Pattern for `contraction` directly preceded by `preceding` and not
        # followed by a letter.
        return (r'(?<=' + preceding + r')(' + contraction + r')'
                + not_before_letter + trailing)

    def _standalone(word):
        return not_after_letter + r'(' + word + r')' + not_before_letter

    clean_text = text

    # Detach 's and 'd; one following whitespace character is absorbed into the
    # trailing space of the replacement.
    clean_text = _substitute(clean_text, _after(r'he|it|at', r"'s", r'\s?'), " 's ")
    clean_text = _substitute(clean_text, _after(r'he|it|at|ou', r"'d", r'\s?'), " 'd ")

    clean_text = _substitute(clean_text, _after(r'[uye]', r"'re"), ' are')
    clean_text = _substitute(clean_text, _after(r'[iuyet]', r"'ve"), ' have')
    clean_text = _substitute(clean_text, _after(r'[iuyet]', r"'ll"), ' will')

    # "can't" keeps its stem and becomes "cannot"; other verbs get " not".
    clean_text = _substitute(clean_text, _after(r'a', r"n't"), 'nnot')
    clean_text = _substitute(clean_text, _after(r'[osed]', r"n't"), ' not')

    clean_text = _substitute(clean_text, _standalone(r'gonna'), 'going to')
    clean_text = _substitute(clean_text, _standalone(r'wanna'), 'want to')
    clean_text = _substitute(clean_text, _standalone(r'gotta'), 'got to')

    def _i_am(match):
        verb = ' AM' if match.group(0) == "I'M" else ' am'
        return match.group(1) + verb

    clean_text = re.sub(r"([iI])'[Mm]", _i_am, clean_text)

    return clean_text

We detach the "'s" in it's, she's and he's from the preceding word so that it becomes a separate token, which is later replaced using context2vec.

In [101]:
# Check: contractions are expanded with matching case; 's is detached, not expanded.
a = "It doesn't DOESN'T can't i'M it's CAN'T they're THEY'RE i've that'll I've I'VE you'll YOU'll YOU'LL doesn't i'll that's"
remove_verb_contractions(a)

"It does not DOES NOT cannot i am it 's CANNOT they are THEY ARE i have that will I have I HAVE you will YOU will YOU WILL does not i will that 's "

## Using context2vec and Levenshtein distance

The following function is the most complicated one, as it corrects misspelled words:
- it first loads a word dictionary, with words indexed by their first letter <b>indexed_word_dict</b> 
- it then parses the tweet into separate words
- for each word in the tweet:
    - check if it exists in <b>indexed_word_dict</b> 
    - if it does, keep it as such
    - if it does not, and is not a proper noun (first letter being a capital letter), correct it using context2vec
        - make a list of candidates using both <b>nb_c2v</b> context2vec outputs and words starting with the same letter in <b>indexed_word_dict</b>
        - choose the candidate with the smallest <b>Levenshtein distance</b> to the original word (the candidates are weighted by the probability given by context2vec)
        - as some tweets are cut in the middle of a sentence, their last words tend to be truncated; therefore we put less weight on letter insertion than on substitution or deletion

In [102]:
def use_dict2vec(tweet, c2v_model,indexed_word_dict,w_del=2,w_ins=0.5,w_sub=1,nb_c2v=100,verbose=False):
    """Correct misspelled words of ``tweet`` with context2vec and edit distance.

    Parameters
    ----------
    tweet : str
        Text to correct.
    c2v_model : sequence
        Model list passed through to ``evalc2v``.
    indexed_word_dict : list of list of str
        Known words bucketed by first letter (index 0 = 'a' ... 25 = 'z').
    w_del, w_ins, w_sub : number
        Edit-distance weights for words that are not at the end of the tweet;
        a word at the very end uses (2, 1, 1) instead.
    nb_c2v : int
        Number of context2vec predictions requested for a misspelled word.
    verbose : bool
        When true, print each masked query sent to context2vec.

    Behaviour
    ---------
    * Tweets with several tokens: every token starting with a lowercase letter
      and absent from its dictionary bucket is replaced by the closest
      candidate among the context2vec predictions and the bucket. Distances of
      context2vec candidates are multiplied by ``0.4 / score``. Tokens ``'s``
      and ``'d`` are replaced by one of 10 context2vec predictions, scored with
      weights (1000, 1, 1000).
    * Single-token tweets: the token is replaced by the closest bucket word
      using weights (2, 1, 2).

    Fixes over the previous version:
    * the single-token branch uses ``indexed_word_dict`` rather than the
      global ``large_dict``;
    * each context2vec score stays paired with its own candidate (scores used
      to be read from the unfiltered prediction list, so they drifted as soon
      as a non-alphabetic prediction was filtered out);
    * a token that cannot be located any more is skipped instead of looping
      forever;
    * empty tweets and empty candidate lists no longer raise IndexError;
    * a first letter outside a-z is skipped instead of raising KeyError.

    Returns the corrected tweet.
    """

    def _locate(text, token):
        # Index of the first occurrence of `token` with no letter directly
        # before or after it, plus a flag telling whether that occurrence ends
        # the text (and does not start it). (-1, False) when there is none.
        search_from = 0
        while True:
            found = text.find(token, search_from)
            if found == -1:
                return -1, False
            stop = found+len(token)
            clear_before = found == 0 or not text[found-1].isalpha()
            clear_after = stop == len(text) or not text[stop].isalpha()
            if clear_before and clear_after:
                return found, (found > 0 and stop == len(text))
            search_from = found+1

    split = re.split(r"-|;|:|\(|\)|\[|\]|\.| |,|!|\~|\?",tweet)
    parsed_tweet = [s for s in split if len(s)!=0]
    alphabet_dict = abcTo123()

    if len(parsed_tweet)>1:
        for word in parsed_tweet:
            is_lowercase_word = word[0].isalpha() and word[0].islower()
            is_clitic = word.lower()=="'s" or word.lower()=="'d"

            if is_lowercase_word:
                letter_idx = alphabet_dict.get(word[0])
                if letter_idx is None:
                    continue
                dict_letter = indexed_word_dict[letter_idx]
                if word.lower() in dict_letter:
                    continue
            elif not is_clitic:
                # Capitalised words, numbers, hashtags ... are left untouched.
                continue

            idx, at_end = _locate(tweet, word)
            if idx == -1:
                continue

            # we put everything on lower case to use it with c2v
            tweet2pred = tweet[:idx].lower()+' [] '+tweet[idx+len(word):].lower()
            eval_vec = evalc2v(tweet2pred,c2v_model,False,nb_c2v if is_lowercase_word else 10)
            if verbose:
                print(tweet2pred)

            # Keep each prediction together with its own score.
            scored = [(e[0], e[1]) for e in eval_vec if e[0].isalpha()]
            dist = 100
            idx_min = 0

            if is_lowercase_word:
                w = [2,1,1] if at_end else [w_del,w_ins,w_sub]
                candidates = [candidate for candidate, _ in scored]+dict_letter
                for i,candidate in enumerate(candidates):
                    if not candidate or not candidate[0].isalpha():
                        continue
                    dist_temp = computeEditDistance(word.lower(), candidate.lower(),w[0],w[1],w[2])
                    if i<len(scored):
                        # This lowers the distance of words predicted with context2vec, with most probable words having closest distance
                        # if the prediction has a probability <0.4, the distance is then amplified by a factor 0.4/probability
                        dist_temp = dist_temp*(1/scored[i][1])*0.4
                    if dist_temp<dist:
                        dist=dist_temp
                        idx_min = i
            else:
                candidates = [candidate for candidate, _ in scored]
                for i,candidate in enumerate(candidates):
                    dist_temp = computeEditDistance(word.lower(),candidate.lower(),1000,1,1000)
                    if dist_temp<dist and dist_temp<5:
                        dist=dist_temp
                        idx_min = i

            if not candidates:
                continue
            # As before, candidates[0] is used when no candidate improved on
            # the initial distance.
            tweet = tweet[:idx]+candidates[idx_min]+tweet[idx+len(word):]

    else:
        w = [2,1,2]
        if tweet and tweet[0].isalpha() and tweet[0].islower():
            letter_idx = alphabet_dict.get(tweet[0])
            dict_letter = indexed_word_dict[letter_idx] if letter_idx is not None else []
            if dict_letter and tweet not in dict_letter:
                dist = 100
                idx_min = 0
                for i, candidate in enumerate(dict_letter):
                    dist_temp = computeEditDistance(tweet.lower(),candidate.lower(),w[0],w[1],w[2])
                    if dist_temp<dist:
                        dist=dist_temp
                        idx_min = i

                tweet = dict_letter[idx_min]

    return tweet

In [104]:
# Demo: spelling correction with unit edit weights and 400 context2vec candidates.
a = "I persolany think it 'd be easier like this, but I ccould be wrng."
b = use_dict2vec(a, c2v_model,large_dict,1,1,1,400)

# print() call form, consistent with the other cells of this notebook.
print(b)

I personaly think it wud be easier like this, but I could be wrong.


In [115]:
def clean_homophones(tweet,c2v_model):
    """Replace words by a close context2vec suggestion, one token at a time.

    The tweet is split on punctuation/space delimiters. Every token whose
    first character is lowercase is masked with ' [] ' and the masked
    sentence is sent to ``evalc2v`` (4 results requested). Among the purely
    alphabetic suggestions, the one with the smallest unit-weight edit
    distance to the lowercased token replaces it, provided that distance is
    below 3 (the first suggestion wins ties). Other tokens are left alone.

    Parameters
    ----------
    tweet : str
        Sentence to correct. Empty or delimiter-only input is returned as is.
    c2v_model
        Passed through unchanged to ``evalc2v``.

    Returns
    -------
    str
        The tweet with the selected tokens replaced in place; delimiters and
        spacing around the tokens are preserved.
    """
    # Raw string: same pattern as before, without invalid escape sequences.
    split = re.split(r"-|;|:|\(|\)|\[|\]|\.| |,|!|\~|\?",tweet)
    parsed_tweet = [s for s in split if len(s)!=0]

    # Offset in the (possibly already edited) tweet from which the next token
    # is searched. It always points just past the previous token, so a
    # replacement of a different length cannot shift later look-ups.
    idx_start = 0

    for word in parsed_tweet:
        word_start = tweet.find(word, idx_start)
        if word_start == -1:
            # Defensive: never slice with -1 (that would corrupt the tweet).
            continue
        word_end = word_start+len(word)

        if word[0].islower():
            tweet2pred = tweet[:word_start]+' [] '+tweet[word_end:]
            eval_vec = evalc2v(tweet2pred,c2v_model,False,4)

            dist = 100
            best = None
            for ev in eval_vec:
                candidate = ev[0]
                if not candidate.isalpha():
                    continue
                dist_temp = computeEditDistance(word.lower(), candidate,1,1,1)
                if dist_temp<3 and dist_temp<dist:
                    dist = dist_temp
                    best = candidate

            if best is not None:
                # Splice the suggestion over the token only; the delimiter
                # that follows is kept, so no extra space is introduced.
                tweet = tweet[:word_start]+best+tweet[word_end:]
                word_end = word_start+len(best)

        idx_start = word_end

    return tweet

In [116]:
# Quick check of clean_homophones: "were" is expected to come back as "where".
tweet = "I do not know were to go"
clean_homophones(tweet,c2v_model)

'I do not know where to go'

## Putting everything together

In [107]:
def normalize_tweet_text(text,c2v_model,wordDict,slangDict,verbose=False):
    """Run the whole normalization pipeline on a collection of tweets.

    Each non-empty tweet goes through, in order: remove_no_ascii, clear_RTs,
    clear_URLs, clear_HTMLs, parseHashtags, clear_At, remove_ends,
    remove_verb_contractions and correctSlang, then (if still non-empty)
    use_dict2vec and clean_homophones, and is finally tokenized with
    word_tokenize. The cleaned tweet and its tokens are printed as they are
    produced.

    Parameters
    ----------
    text : iterable of str
        Raw tweets.
    c2v_model
        Passed through to use_dict2vec and clean_homophones.
    wordDict
        Passed through to use_dict2vec.
    slangDict
        Passed through to correctSlang.
    verbose : bool
        When True, the accumulated per-stage timings are printed every
        100000 tweets. The timers are reset at that point either way.

    Returns
    -------
    list
        One token list per tweet that is still non-empty after cleaning.
    """
    normalized_text = []

    # (timer key, report label) in reporting order.
    stages = [('ASCII', 'Time to clear non ASCII characters: '),
              ('RT', 'Time to clear RTs: '),
              ('URLs', 'Time to clear URLs and HTML '),
              ('Hash', 'Time to clear Hashtags: '),
              ('At', 'Time to clear ATs: '),
              ('Ends', 'Time to remove Ends: '),
              ('Verb', 'Time to clean verbs and negation: '),
              ('Slang', 'Time to clean slang: '),
              ('D2V', 'Time to clean misspelled words: '),
              ('Homo', 'Time to clean common homophonic errors: ')]
    timings = dict((key, 0) for key, _ in stages)

    def _timed(key, func, *args):
        # Call func(*args) and add the elapsed wall-clock time to timings[key].
        start = time.time()
        result = func(*args)
        timings[key] += time.time()-start
        return result

    # Stays None when `text` is empty, so the final print can be skipped
    # instead of failing on an unbound loop variable.
    i = None
    for i,tweet in enumerate(text):
        if len(tweet)!=0:
            tweet = _timed('ASCII', remove_no_ascii, tweet)
            tweet = _timed('RT', clear_RTs, tweet)
            # URL and HTML removal share one timer.
            tweet = _timed('URLs', clear_URLs, tweet)
            tweet = _timed('URLs', clear_HTMLs, tweet)
            tweet = _timed('Hash', parseHashtags, tweet)
            tweet = _timed('At', clear_At, tweet)
            tweet = _timed('Ends', remove_ends, tweet)
            tweet = _timed('Verb', remove_verb_contractions, tweet)
            tweet = _timed('Slang', correctSlang, tweet, slangDict)

            print(tweet)

        # Now using context2vec and Edit distance.
        # The cleaning above may have emptied the tweet, hence the re-check.
        if len(tweet)!=0:
            tweet = _timed('D2V', use_dict2vec, tweet, c2v_model,wordDict,1,1,1,400)
            tweet = _timed('Homo', clean_homophones, tweet, c2v_model)

        if len(tweet)!=0:
            tweet = word_tokenize(tweet)
            print(tweet)
            normalized_text.append(tweet)

        if i>0 and i%100000==0:
            if verbose is True:
                print(i)
                for key, label in stages:
                    print(label+str(timings[key])+'s')

            # Start a fresh measurement window for every timer, including the
            # homophone one.
            for key in timings:
                timings[key] = 0

    if i is not None:
        print(i)
    return normalized_text

In [108]:
# Sanity check of remove_verb_contractions on a sentence mixing many contracted forms.
a = "We haven't gotta stop: 'regex isn't gonna fail'. We've understood that they've figured we can't avoid using it to solve 'verbs'. We're convinced it works. They aren't and wanna tell us to stop, but we're still gonna"
remove_verb_contractions(a)

"We have not got to stop: 'regex is not going to fail'. We have understood that they have figured we cannot avoid using it to solve 'verbs'. We are convinced it works. They are not and want to tell us to stop, but we are still going to"

In [110]:
import time

# Normalize the first 10 entries of text_full and print the elapsed
# wall-clock time in seconds.
t = time.time()
norm_tweets = normalize_tweet_text(text_full[:10],c2v_model,large_dict,dict_abbreviation,False)
print(time.time()-t)

It 's disappointing how people nowadays think terrorism is linked to a religion
['It', 'is', 'disappointing', 'how', 'people', 'nowadays', 'think', 'terrorism', 'is', 'linked', 'to', 'a', 'religion']
Israel killing Muslims everyday and no one bats an eye. Terrorist attack and Muslims got the blame? How shallow can you be
['Israel', 'killing', 'Muslims', 'everyday', 'and', 'no', 'one', 'bats', 'an', 'eye', '.', 'Terrorist', 'attack', 'and', 'Muslims', 'got', 'the', 'blame', '?', 'How', 'shallow', 'can', 'you', 'be']
that bitch stops a show because someone spilled water on stage but puts on a show when terrorists attacki
['that', 'bitch', 'stops', 'a', 'show', 'because', 'someone', 'spilled', 'water', 'on', 'stage', 'but', 'puts', 'on', 'a', 'show', 'when', 'terrorist', 'attacks']
French President Francois Hollande condemns the "terrorist attacks of unprecedented proportions."  #Pray-For-Paris
['French', 'President', 'Francois', 'Hollande', 'condemns', 'the', '``', 'terrorist', 'attacks'

In [121]:
import time

# text_extract = norm_tweets[:10000]
# text_extract = ['The KKK was screaming "kill all n*ggers" 2 DAYS AGO', 'They are not responsib']
# Hand-written test sentences run through the full normalization pipeline.
text_extract = ['This is not a vonpetition !',
                'thsi is amzing! How old is you?',
                "I don't know. wehre to go",
                "I really think do your very stupid",
                "I couldn't remember the write answer",
                "I personnaly think it'd be easier like this",
                "It's gonna take you forever to come bcak home"]
norm_text = normalize_tweet_text(text_extract,c2v_model,large_dict,dict_abbreviation)
    

This is not a vonpetition
['This', 'is', 'not', 'a', 'contradiction']
thsi is amzing! How old is you
['this', 'is', 'amazing', '!', 'How', 'old', 'is', 'you']
I do not know. wehre to go
['I', 'do', 'not', 'know', '.', 'were', 'you', 'go']
I really think do your very stupid
['I', 'really', 'think', 'do', 'your', 'very', 'stupid']
I could not remember the write answer
['I', 'could', 'not', 'remember', 'to', 'write', 'answer']
I personnaly think it 'd be easier like this
['I', 'personaly', 'think', 'it', 'woudl', 'be', 'easier', 'like', 'this']
It 's going to take you forever to come bcak home
['It', 'is', 'going', 'to', 'take', 'you', 'forever', 'to', 'come', 'back', 'home']
6


In [None]:
import time


def _find_standalone(tweet, word):
    """Return the index of the first occurrence of `word` in `tweet` that has
    no letter directly before or after it, or -1 when there is none."""
    search_from = 0
    while True:
        idx = tweet.find(word, search_from)
        if idx == -1:
            # Nothing (left) to find: report it instead of searching forever.
            return -1
        end = idx+len(word)
        left_free = idx == 0 or not tweet[idx-1].isalpha()
        right_free = end == len(tweet) or not tweet[end].isalpha()
        if left_free and right_free:
            return idx
        search_from = idx+1


def _closest_candidate(word, candidates, alpha_only=True):
    """Return the candidate with the smallest weighted edit distance to `word`.

    Weights are (1.5, 0.5, 1) as passed to computeEditDistance. The first
    candidate wins ties and is also the fallback when no distance is below
    100. With `alpha_only`, candidates whose first character is not a letter
    are not scored. Returns None for an empty candidate list.
    """
    if len(candidates) == 0:
        return None
    best = candidates[0]
    dist = 100
    for candidate in candidates:
        if alpha_only and not candidate[:1].isalpha():
            continue
        dist_temp = computeEditDistance(candidate.lower(),word.lower(),1.5,0.5,1)
        if dist_temp<dist:
            dist = dist_temp
            best = candidate
    return best


t = time.time()
# NOTE(review): re.split below needs every entry to be a string, while
# normalize_tweet_text returns token lists - confirm what norm_tweets holds
# before running this cell.
text_extract = norm_tweets[:1000]
# print(text_extract)
tweet_out = []
for i,tweet in enumerate(text_extract):
    print(tweet)
    split = re.split(r"-|;|:|\(|\)|\[|\]|\.| |,|!|\~|\?",tweet)
    parsed_tweet = [s for s in split if len(s)!=0]
    if len(parsed_tweet)>1:
        for word in parsed_tweet:
            if word[0].isalpha() and word[0].islower():
                # Lowercase word: only handled when it is not in the dictionary
                # bucket of its first letter; that bucket also supplies
                # candidates.
                dict_letter = large_dict[alphabet_dict[word[0].lower()]]
                if word.lower() in dict_letter:
                    continue
                extra_candidates = dict_letter
            elif word.lower()=="'s":
                # A detached 's is resolved from the context suggestions only.
                extra_candidates = []
            else:
                continue

            idx = _find_standalone(tweet, word)
            if idx == -1:
                # No standalone occurrence in the current tweet: leave as is.
                continue

            masked_tweet = tweet[:idx]+' [] '+tweet[idx+len(word):]
            eval_vec = evalc2v(masked_tweet,c2v_model,False,100)
            print(masked_tweet)

            candidates = [e[0] for e in eval_vec if e[0].isalpha()]+extra_candidates
            best = _closest_candidate(word, candidates)
            if best is not None:
                tweet = tweet[:idx]+best+tweet[idx+len(word):]

    else:
        # Zero or one token: correct the whole string against the dictionary
        # bucket of its first letter.
        if tweet[:1].isalpha() and tweet[:1].islower():
            dict_letter = large_dict[alphabet_dict[tweet[0].lower()]]
            if tweet not in dict_letter:
                best = _closest_candidate(tweet, dict_letter, alpha_only=False)
                if best is not None:
                    tweet = best

    print(tweet)

print(time.time()-t)
            
            

### TODO: change Levenshtein weights for last words, allow more insertions, allow apostrophes in words, magnitud --> manitus, transform it's into it 's (with remove_contractions), then split

In [1499]:
# Try parseHashtags on a hashtag that contains a hyphen and mixed case.
hashtag = '#BREAKING-NeWS'
parseHashtags(hashtag)

' #BREAKING-NeWS'