Importing all the packages that are used ahead.

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import OrderedDict
import re
from sklearn.model_selection import train_test_split as tts
from keras.models import Sequential
from keras.layers import Dense, CuDNNLSTM, Embedding, Bidirectional

Using TensorFlow backend.


In [2]:
tqdm.pandas()
train_data = pd.read_csv("../input/train.csv")

Coverting pandas dataframe columns to lists for easier and faster modification.

In [3]:
sentence_list_train = train_data['question_text'].tolist()

Handling punctuations. Some scope of improving time complexity here.

In [4]:
def handle_punctuations(sentence):
    sentence = str(sentence)
    for punct in "/-'":
        sentence = sentence.replace(punct, ' ')
    for punct in '&':
        sentence = sentence.replace(punct, ' and ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        sentence = sentence.replace(punct, '')
    return sentence

> Source of this dictionary: [Github gist by Neal Shyam](https://gist.github.com/nealrs/96342d8231b75cf4bb82)

This method expands contractions.

In [5]:
def handle_contractions(sentence):
    contraction_dict = {"ain't": "am not",
                        "aren't": "are not",
                        "can't": "cannot",
                        "can't've": "cannot have",
                        "'cause": "because",
                        "could've": "could have",
                        "couldn't": "could not",
                        "couldn't've": "could not have",
                        "didn't": "did not",
                        "doesn't": "does not",
                        "Don't": "do not",
                        "don't": "do not",
                        "hadn't": "had not",
                        "hadn't've": "had not have",
                        "hasn't": "has not",
                        "haven't": "have not",
                        "he'd": "he would",
                        "he'd've": "he would have",
                        "he'll": "he will",
                        "he'll've": "he will have",
                        "he's": "he is",
                        "how'd": "how did",
                        "how'd'y": "how do you",
                        "how'll": "how will",
                        "how's": "how is",
                        "I'd": "I would",
                        "I'd've": "I would have",
                        "I'll": "I will",
                        "I'll've": "I will have",
                        "i'm": "I am",
                        "I'm": "I am",
                        "I've": "I have",
                        "isn't": "is not",
                        "it'd": "it had",
                        "it'd've": "it would have",
                        "it'll": "it will",
                        "it'll've": "it will have",
                        "it's": "it is",
                        "let's": "let us",
                        "ma'am": "madam",
                        "mayn't": "may not",
                        "might've": "might have",
                        "mightn't": "might not",
                        "mightn't've": "might not have", 
                        "must've": "must have",
                        "mustn't": "must not",
                        "mustn't've": "must not have",
                        "needn't": "need not",
                        "needn't've": "need not have",
                        "o'clock": "of the clock",
                        "oughtn't": "ought not",
                        "oughtn't've": "ought not have",
                        "shan't": "shall not",
                        "sha'n't": "shall not",
                        "shan't've": "shall not have",
                        "she'd": "she would",
                        "she'd've": "she would have",
                        "she'll": "she will",
                        "she'll've": "she will have",
                        "she's": "she is",
                        "should've": "should have",
                        "shouldn't": "should not",
                        "shouldn't've": "should not have",
                        "so've": "so have",
                        "so's": "so is",
                        "that'd": "that would",
                        "that'd've": "that would have",
                        "that's": "that is",
                        "there'd": "there had",
                        "there'd've": "there would have",
                        "there's": "there is",
                        "they'd": "they would",
                        "they'd've": "they would have",
                        "they'll": "they will",
                        "they'll've": "they will have",
                        "they're": "they are",
                        "they've": "they have",
                        "to've": "to have",
                        "wasn't": "was not",
                        "we'd": "we had",
                        "we'd've": "we would have",
                        "we'll": "we will",
                        "we'll've": "we will have",
                        "we're": "we are",
                        "we've": "we have",
                        "weren't": "were not",
                        "what'll": "what will",
                        "what'll've": "what will have",
                        "what're": "what are",
                        "what's": "what is",
                        "what've": "what have",
                        "when's": "when is",
                        "when've": "when have",
                        "where'd": "where did",
                        "where's": "where is",
                        "where've": "where have",
                        "who'll": "who will",
                        "who'll've": "who will have",
                        "who's": "who is",
                        "who've": "who have",
                        "why's": "why is",
                        "why've": "why have",
                        "will've": "will have",
                        "won't": "will not",
                        "won't've": "will not have",
                        "would've": "would have",
                        "wouldn't": "would not",
                        "wouldn't've": "would not have",
                        "y'all": "you all",
                        "y'alls": "you alls",
                        "y'all'd": "you all would",
                        "y'all'd've": "you all would have",
                        "y'all're": "you all are",
                        "y'all've": "you all have",
                        "you'd": "you had",
                        "you'd've": "you would have",
                        "you'll": "you you will",
                        "you'll've": "you you will have",
                        "you're": "you are",
                        "you've": "you have"
                       }
    updated_sentence = ""
    words = sentence.split()
    for word in words:
        try:
            updated_sentence += contraction_dict[word]
        except KeyError:
            updated_sentence += word
        updated_sentence += " "
    return updated_sentence

> Idea of this approach: [An algorithm that learns what is in a name - Research paper(Skip to page 7)](http://people.csail.mit.edu/mcollins/6864/slides/bikel.pdf)


In [6]:
def handle_digits(sentence):
    
    def to_string(digit):
        if x == 1:
            return "one"
        elif x == 2:
            return "two"
        elif x == 3:
            return "three"
        elif x == 4:
            return "four"
        elif x == 5:
            return "five"
        elif x == 6:
            return "six"
        elif x == 7:
            return "seven"
        elif x == 8:
            return "eight"
        elif x == 9:
            return "nine"
        else:
            return "large"
        
    pattern = re.compile('.*[0-9].*')
    words = sentence.split()
    updated_line = ""
    for word in words:
        matched = pattern.match(word)
        if matched:
            if "," in word:
                updated_line += "amount "
            elif "/" in word:
                updated_line += "date "
            elif ":" in word:
                updated_line += "time "
            elif "-" in word:
                updated_line += "date "
            elif "." in word:
                updated_line += "decimal number "
            else:
                x = len(word)
                x = to_string(x)
                x += " digit number "
                updated_line += x
        else:
            word += " "
            updated_line += word
    return updated_line

In [7]:
def handle_spelling_errors(sentence):
    spell_correction_dict = {"qoura": "quora",
                            "qouran": "quoran",
                            "quoracom": "quora website",
                            "wwwyoutubecom": "youtube website",
                            "freelancercom": "freelancer website",
                            "demonitisation": "demonetization",
                            "demonetisation": "demonetization",
                            "bookingcom": "booking website",
                            "upwork": "freelancing platform",
                            "trumpcare": "trump care",
                            "brexit": "britain exit from europe",
                            "iiith": "iiit hyderabad",
                            "cryptocurrencies": "multiple cryptocurrency",
                            "pokémon": "pokemon",
                            "clickbait": "forced click",
                            "naukricom": "indian job portal website",
                            "bhakts": "devotees",
                            "…": "",
                             "etc…": "etc",
                             "π": "pi",
                             "√": "square root",
                             "blockchains": "blockchain",
                             "∞": "infinity"
                            }
    correct_sentence = ""
    words = sentence.split()
    for word in words:
        try:
            x = spell_correction_dict[word.lower()]
            correct_sentence += x
        except KeyError:
            correct_sentence += word
        correct_sentence += " "
    return correct_sentence

In [8]:
def handle_non_English_words(sentence):
    pass

In [9]:
def handle_acronyms_and_proper_nouns(sentence):
    pass

As name suggests, this method is used to clean sentences.

In [10]:
def clean_sentence(sentence):
    sentence = handle_contractions(sentence)
    sentence = handle_digits(sentence)
    sentence = sentence.strip()
    sentence = handle_punctuations(sentence)
    #sentence = handle_digits_and_nouns(sentence)
    #sentence = handle_non_English_words(sentence)
    #sentence = handle_acronyms_and_proper_nouns(sentence)
    sentence = handle_spelling_errors(sentence)
    sentence = sentence.strip()
    return sentence

This method creates vocabulary of ours. If word is already in our dictionary, add 1 to the value. Else add the word with value initialized to 1. Then using OrderedDict, it is sorted in reverse order.

In [11]:
def create_vocabulary(sentence_list):
    new_sentence_list = []
    vocabulary = {}
    for sentence in tqdm(sentence_list):
        sentence = clean_sentence(sentence)
        new_sentence_list.append(sentence)
        words = sentence.split()
        for word in words:
            try:
                vocabulary[word] += 1
            except KeyError:
                vocabulary[word] = 1
    vocabulary = OrderedDict(sorted(vocabulary.items(), key = lambda x:x[1], reverse = True))
    return vocabulary, new_sentence_list

This method finds how much of our vocabulary is actually useful.

In [12]:
def check_coverage(vocabulary, embedding):
    words_vocabulary = set(vocabulary.keys())
    words_embedding = set(embedding.keys())
    intersection = words_vocabulary & words_embedding
    print('Found embeddings for {:.2%} of our vocabulary'.format(len(intersection)/len(words_vocabulary)))

> Source of this function to load embedding: [Theo Viel's Notebook](https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing)

In [None]:
glove_embedding = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"

def loading_glove_embedding(glove_embedding):
    
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    
    model = dict(get_coefs(*o.split(" ")) for o in open(glove_embedding, encoding='latin'))
    return model

glove = loading_glove_embedding(glove_embedding)

In [None]:
vocabulary, sentence_list_train = create_vocabulary(sentence_list_train)

This method brings all elements of our vocabulary to lower case. If lower case element is already present, add both lowercase and uppercase values, else only the one.  

In [None]:
def to_lower_case(vocabulary):
    updated_vocabulary = {}
    for word in tqdm(vocabulary):
        lower_word = word.lower()
        try:
            if word != lower_word:
                updated_vocabulary[lower_word] = vocabulary[word] + vocabulary[lower_word]
            else:
                updated_vocabulary[lower_word] = vocabulary[word]
        except KeyError:
            updated_vocabulary[lower_word] = vocabulary[word]
    return updated_vocabulary

Updating the embedding. Some words are known as proper nouns in Glove while others keep it small. Therefore, add lower case words to the embedding with their values initialized to the uppercase words.

In [None]:
def update_glove(vocabulary, glove):
    print(len(glove))
    for word in tqdm(vocabulary):
        lower_word = word.lower()
        if word in glove and lower_word not in glove:
            glove[lower_word] = glove[word]
    print(len(glove))
    return glove

In [None]:
glove = update_glove(vocabulary, glove)

In [None]:
vocabulary = to_lower_case(vocabulary)

In [None]:
check_coverage(vocabulary, glove)

In [None]:
def create_oov_dictionary(vocabulary, glove):
    oov_dictionary = {}
    for key in tqdm(vocabulary):
        try:
            x = glove[key]
        except KeyError:
            oov_dictionary[key] = vocabulary[key]
    oov_dictionary = OrderedDict(sorted(oov_dictionary.items(), key = lambda x:x[1], reverse = True))
    return oov_dictionary

In [None]:
oov_vocabulary = create_oov_dictionary(vocabulary, glove)
print({k:oov_vocabulary[k] for k in list(oov_vocabulary)[:100]})

In [None]:
def lower_case_sentence(sentence_list):
    sentence_list_new = []
    for sentence in tqdm(sentence_list):
        sentence = sentence.lower()
        sentence_list_new.append(sentence)
    return sentence_list_new

In [None]:
sentence_list_train = lower_case_sentence(sentence_list_train)

In [None]:
col = train_data.columns[1]
train_data[col] = sentence_list_train

In [None]:
features = np.asarray(train_data['question_text'].tolist())
target = np.asarray(train_data['target'].tolist())

In [None]:
x_train, x_validate, y_train, y_validate = tts(features, target, test_size = 0.2, random_state = 42)

In [None]:
del train_data
del oov_vocabulary
del sentence_list_train

In [None]:
def pretrainedEmbedding(vocab, embed):
    
    def wordToIndex(embed):
        tokens = sorted(embed.keys())
        wordIndex = {}
        for idx, tok in enumerate(tokens):
            kerasIdx = idx + 1
            wordIndex[tok] = kerasIdx
        return wordIndex
    
    wordIndex = wordToIndex(embed)
    
    vocabLength = len(wordIndex) + 1
    embDim = next(iter(embed.values())).shape[0]
    
    embeddingMatrix = np.zeros((vocabLength, embDim))
    for word, index in tqdm(wordIndex.items()):
        embeddingMatrix[index, : ] = embed[word]
    
    embeddingLayer = Embedding(vocabLength, embDim, weights = [embeddingMatrix], trainable = False)
    return embeddingLayer

In [None]:
model = Sequential()

In [40]:
model = Sequential()
model.add(pretrainedEmbedding(vocabulary, glove))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences=True),
                        input_shape=(30, 300)))
model.add(Bidirectional(CuDNNLSTM(64)))
model.add(Dense(1, activation="sigmoid"))

MemoryError: 

In [None]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
model.fit_generator(x_train, y_train, epochs = 20, steps_per_epoch = 1000 , validation_set = [x_validate, y_validate], verbose = True)

In [None]:
del vocabulary
del glove
test_data = pd.read_csv('../input/test.csv')

In [None]:
test_sentence_list = test_data['question_text'].tolist()

In [27]:
def update_test_sentences(sentences):
    new_sentence_list = []
    for sentence in tqdm(sentences):
        sentence = clean_sentence(sentence)
        new_sentence_list.append(sentence)
    return new_sentence_list

In [None]:
test_sentence_list = updated_test_sentences(test_sentence_list)
test_sentence_list = lower_case_sentence(test_sentence_list)

In [None]:
test_data['question_text'] = test_sentence_list

In [None]:
ques_id = test_data['qid'].tolist() # for safety reasons
questions = np.asarray(test_data['question_text'].tolist())

In [None]:
questions = text_to_number(questions)

In [None]:
y_predicted = []
for question in tqdm(questions):
    y_predicted.extend(model.predict(question).flatten())

In [None]:
submission_df = pd.DataFrame({"qid": ques_id, "prediction": y_predicted})
submission_df.to_csv("submission.csv", index = False)