In [1]:
# Don't use standard preprocessing steps like stemming or stopword removal when you have pre-trained embeddings.
# Some of you might have used standard preprocessing steps, such as removing stopwords or stemming,
# when doing word-count-based feature extraction (e.g. TF-IDF).
# The reason is simple: you lose valuable information that would help your NN figure things out.

# Get your vocabulary as close to the embeddings as possible.
# This notebook focuses on how to achieve that.
# As an example I use the GoogleNews pretrained embeddings;
# there is no deeper reason for this choice.

In [2]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [3]:
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` (pandas >= 1.3) keeps the previous behaviour: malformed
# rows are skipped and a warning is emitted for each of them.
train = pd.read_csv('train.csv', on_bad_lines='warn')
test  = pd.read_csv('test.csv', on_bad_lines='warn')
print(f'Train Shape: {train.shape}')
print(f'Test Shape: {test.shape}')

b'Skipping line 40589: expected 3 fields, saw 88109\nSkipping line 81028: expected 3 fields, saw 88235\nSkipping line 121424: expected 3 fields, saw 88056\nSkipping line 161735: expected 3 fields, saw 87702\nSkipping line 202162: expected 3 fields, saw 87833\nSkipping line 242604: expected 3 fields, saw 88078\n'
b'Skipping line 278263: expected 3 fields, saw 77398\n'


Train Shape: (1002366, 3)
Test Shape: (375806, 2)


In [4]:
# Preview the first rows of the training frame to check the parsed columns.
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often every word occurs across all sentences.

    :param sentences: list of list of words
    :param verbose: when True a tqdm progress bar is shown while counting
    :return: dictionary of words and their count
    """
    counts = {}
    for tokens in tqdm(sentences, disable = (not verbose)):
        for token in tokens:
            # Unseen words start from zero, so the first occurrence stores 1.
            counts[token] = counts.get(token, 0) + 1
    return counts

In [6]:
# Whitespace-tokenise every question, then count how often each token occurs.
sentences = train["question_text"].progress_apply(lambda text: text.split()).values
vocab = build_vocab(sentences)
# Peek at the first five vocabulary entries (in insertion order).
print(dict(list(vocab.items())[:5]))

100%|██████████| 1002366/1002366 [00:04<00:00, 248688.23it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 278889.76it/s]

{'How': 201006, 'did': 25570, 'Quebec': 76, 'nationalists': 70, 'see': 6900}





In [7]:
from gensim.models import KeyedVectors
# File holding the pretrained GoogleNews word2vec vectors.
news_path = 'GoogleNews-vectors-negative300.bin'
# Load the vectors with gensim; binary=True because the file is read as a binary word2vec file.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [8]:
import operator 

def check_coverage(vocab,embeddings_index):
    """
    Report how much of the vocabulary is covered by the embeddings.

    Prints the share of distinct words and the share of all word occurrences
    for which an embedding exists.

    :param vocab: dictionary of words and their count
    :param embeddings_index: object indexable by word that raises KeyError
        for words it does not contain
    :return: list of (word, count) tuples for the words without an embedding,
        sorted by count in descending order
    """
    oov = {}
    covered_words = 0   # number of distinct words with an embedding
    covered_count = 0   # number of word occurrences with an embedding
    oov_count = 0       # number of word occurrences without an embedding
    for word in tqdm(vocab):
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (or a keyboard interrupt) must not be silently swallowed.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            covered_words += 1
            covered_count += vocab[word]

    # Guard the ratios so an empty vocabulary reports 0% instead of raising.
    total_count = covered_count + oov_count
    vocab_share = covered_words / len(vocab) if vocab else 0.0
    text_share = covered_count / total_count if total_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending stable sort followed by a reversal: most frequent word first.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [9]:
# Out-of-vocabulary (OOV) words that we can use to improve our preprocessing.
oov = check_coverage(vocab,embeddings_index)
# Show the first 20 entries of the returned OOV list.
oov[:20]


100%|██████████| 430209/430209 [00:01<00:00, 269333.27it/s]


Found embeddings for 25.95% of vocab
Found embeddings for  78.75% of all text


[('to', 309623),
 ('a', 309121),
 ('of', 253817),
 ('and', 193805),
 ('India?', 12561),
 ('it?', 9900),
 ('do?', 6835),
 ('life?', 5996),
 ('you?', 4800),
 ('them?', 4782),
 ('me?', 4771),
 ('time?', 4317),
 ('world?', 4193),
 ('people?', 3768),
 ('why?', 3730),
 ('Quora?', 3589),
 ('10', 3502),
 ('like?', 3457),
 ('for?', 3383),
 ('work?', 3234)]

In [10]:
# Step 1: remove punctuation and other symbols
def clean_text(x):
    """
    Strip punctuation from a question.

    '/', '-' and "'" become a space, '&' is padded with a space on each side,
    and all remaining punctuation (including curly quotes) is deleted.

    :param x: value to clean; it is converted with str() first
    :return: the cleaned string
    """
    # Characters that are deleted outright.
    table = {ord(ch): '' for ch in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’'}
    # These separators turn into a space; this rule takes precedence over deletion.
    table.update({ord(ch): ' ' for ch in "/-'"})
    # Keep the ampersand, but as a token of its own.
    table[ord('&')] = ' & '
    return str(x).translate(table)



In [11]:
# Strip punctuation from every question (overwrites the column), then
# re-tokenise and rebuild the vocabulary from the cleaned text.
train["question_text"] = train["question_text"].progress_apply(clean_text)
sentences = train["question_text"].apply(lambda text: text.split())
vocab = build_vocab(sentences)

100%|██████████| 1002366/1002366 [00:08<00:00, 112185.25it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 279073.05it/s]


In [12]:
# Re-run the coverage check on the vocabulary rebuilt after punctuation removal.
oov = check_coverage(vocab,embeddings_index)
oov[:20]

100%|██████████| 218035/218035 [00:00<00:00, 293197.71it/s]


Found embeddings for 60.36% of vocab
Found embeddings for  89.99% of all text


[('to', 312050),
 ('a', 310015),
 ('of', 255475),
 ('and', 195395),
 ('2017', 6756),
 ('2018', 5652),
 ('10', 5072),
 ('12', 2869),
 ('20', 2277),
 ('100', 2201),
 ('15', 2131),
 ('12th', 1954),
 ('11', 1784),
 ('30', 1687),
 ('18', 1636),
 ('50', 1498),
 ('16', 1221),
 ('14', 1187),
 ('17', 1156),
 ('13', 1062)]

In [13]:
# Step 2: mask numbers (runs of two or more digits are replaced by '#' characters)
import re

def clean_numbers(x):
    """
    Mask multi-digit numbers with '#' characters.

    Runs of five or more digits become '#####'; runs of four, three and two
    digits become the same number of '#'. Single digits are left unchanged.

    :param x: string to process
    :return: the string with its numbers masked
    """
    # Order matters: the longest runs have to be masked first.
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in digit_masks:
        x = re.sub(pattern, mask, x)
    return x

In [14]:
# Mask the numbers in every question (overwrites the column), then
# re-tokenise and rebuild the vocabulary.
train["question_text"] = train["question_text"].progress_apply(clean_numbers)
sentences = train["question_text"].progress_apply(lambda text: text.split())
vocab = build_vocab(sentences)

100%|██████████| 1002366/1002366 [00:11<00:00, 88386.30it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 261707.73it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 292019.09it/s]


In [15]:
# Re-run the coverage check on the vocabulary rebuilt after number masking.
oov = check_coverage(vocab,embeddings_index)
oov[:20]

100%|██████████| 209295/209295 [00:00<00:00, 272354.70it/s]


Found embeddings for 63.42% of vocab
Found embeddings for  90.75% of all text


[('to', 312050),
 ('a', 310015),
 ('of', 255475),
 ('and', 195395),
 ('favourite', 959),
 ('colour', 763),
 ('bitcoin', 756),
 ('doesnt', 711),
 ('centre', 684),
 ('Quorans', 654),
 ('cryptocurrency', 637),
 ('Snapchat', 605),
 ('travelling', 522),
 ('counselling', 487),
 ('btech', 482),
 ('didnt', 471),
 ('cryptocurrencies', 388),
 ('Brexit', 381),
 ('behaviour', 357),
 ('blockchain', 356)]

In [16]:
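# Step 3: build a dictionary for simple preprocessing that normalises misspellings and spelling variants; the most frequent words without an embedding (e.g. 'to', 'of') are removed in the following cell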
def _get_mispell(mispell_dict):
    """
    Compile a single regex that matches any key of ``mispell_dict``.

    Keys are escaped so that they always match literally, and longer keys are
    tried first so that a key which is a prefix of another key cannot shadow it.

    :param mispell_dict: dictionary mapping a spelling to its replacement
    :return: tuple of (mispell_dict, compiled regex)
    """
    if not mispell_dict:
        # An empty alternation would match the empty string at every position
        # and break the dictionary lookup; use a pattern that never matches.
        return mispell_dict, re.compile('(?!)')
    keys = sorted(mispell_dict, key=len, reverse=True)
    mispell_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))
    return mispell_dict, mispell_re


# Spelling -> replacement. Lookups are case-sensitive.
mispell_dict = {'bitcoins':'bitcoin',
                'centre':'center',
                'didnt':'did not',
                'doesnt':'does not',
                'isnt':'is not',
                'shouldnt':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war 2',
                # NOTE(review): possibly meant to be 'criticise' -- confirm.
                'citicise':'criticize',
                'instagram': 'socialmedium',
                'whatsapp': 'socialmedium',
                'Snapchat': 'socialmedium',
                'Btech': 'btech',
                }
mispellings, mispellings_re = _get_mispell(mispell_dict)


def replace_typical_misspell(text):
    """
    Replace every occurrence of a ``mispell_dict`` key in ``text``.

    Matching is case-sensitive and not restricted to whole words, so a key is
    also replaced inside a longer word (e.g. 'theatres' becomes 'theaters').

    :param text: string to normalise
    :return: the string with all dictionary keys replaced
    """
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)  #replace all the keys if in dict





In [17]:
# Normalise spellings in every question (overwrites the column), then tokenise.
train["question_text"] = train["question_text"].progress_apply(replace_typical_misspell)
sentences = train["question_text"].progress_apply(lambda text: text.split())
# Words dropped from the token lists before the vocabulary is rebuilt.
to_remove = ['a','an','to','of','and','is','it','that','am','are','were']
sentences = [
    [token for token in tokens if token not in to_remove]
    for tokens in tqdm(sentences)
]
vocab = build_vocab(sentences)

100%|██████████| 1002366/1002366 [00:04<00:00, 209287.47it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 263695.64it/s]
100%|██████████| 1002366/1002366 [00:04<00:00, 216865.27it/s]
100%|██████████| 1002366/1002366 [00:03<00:00, 307162.43it/s]


In [18]:
# Re-run the coverage check on the vocabulary rebuilt after spelling replacement and word removal.
oov = check_coverage(vocab,embeddings_index)
oov[:20]

100%|██████████| 209248/209248 [00:00<00:00, 251310.59it/s]


Found embeddings for 63.43% of vocab
Found embeddings for  98.89% of all text


[('bitcoin', 949),
 ('colour', 763),
 ('btech', 675),
 ('Quorans', 654),
 ('cryptocurrency', 637),
 ('socialmedium', 614),
 ('cryptocurrencies', 388),
 ('Brexit', 381),
 ('behaviour', 357),
 ('blockchain', 356),
 ('upvotes', 325),
 ('programme', 317),
 ('Redmi', 299),
 ('realise', 289),
 ('defence', 276),
 ('KVPY', 273),
 ('Paytm', 256),
 ('grey', 232),
 ('mtech', 208),
 ('upvote', 196)]