# Highest scoring kernel on Kaggle

This notebook reproduces the results of our highest-scoring kernel on the public leaderboard of the Quora competition.
The most important elements of this submission are:
- Only a sequence model is used
- The average of the GloVe and Paragram embeddings is used
- An Attention layer and a Capsule layer are used in the model

In [None]:
import pandas as pd
import numpy as np
import gc
import sys
sys.path.append("../..") # Append source directory to our Python path

from common.nlp.sequence_preprocessing import preprocess_text_for_dl, fit_tokenizer, tokenize_and_pad
from common.nlp.load_embeddings import load_word_embedding, load_word2vec, create_embedding_matrix
from quora.sequence_models import cross_validate_and_predict

## 1. Load datasets and initialize parameters

In [None]:
# Read the train and test CSVs with explicit column types (ids and text as str, label as int)
dtypes = {
    "qid": str,
    "question_text": str,
    "target": int,
}
training, testing = (
    pd.read_csv(csv_path, dtype=dtypes)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [None]:
max_words = 75 # maximum number of words in a sentence/document

# WORD_MAP maps a token (contraction, British spelling, misspelling, slang term,
# brand or named entity) to its replacement text. It is passed to
# preprocess_text_for_dl as the `word_map` argument.
# NOTE(review): several keys occur more than once in this literal, e.g.
# 'Quorans' ('people', later 'Quoran') and 'cryptocurrencies'
# ('digital currencies', later 'cryptocurrency'). In a Python dict literal the
# last occurrence wins, so the earlier values are silently discarded -- confirm
# that the later values are the intended ones.
# NOTE(review): some keys/values carry a trailing or leading space (e.g.
# 'Naxali ', 'Qoura ', 'youtu ', ' involuntary celibates'); presumably
# deliberate for substring replacement -- verify against preprocess_text_for_dl.
# NOTE(review): 'Xamarin': 'medicin' looks like a questionable mapping -- TODO confirm.
WORD_MAP = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
           "could've": "could have", "couldn't": "could not", "didn't": "did not",
           "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
           "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is",
           "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
           "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
           "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not",
           "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
           "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not",
           "might've": "might have","mightn't": "might not","mightn't've": "might not have",
           "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
           "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
           "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
           "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
           "she's": "she is", "should've": "should have", "shouldn't": "should not",
           "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is",
           "that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
           "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are",
           "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
           "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not",
           "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is",
           "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", 
           "y'all'd": "you all would", "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
           "you're": "you are", "you've": "you have", 'hasnt': 'has not',
            'colour':'color', 'centre':'center', 'didnt':'did not', 'doesnt':'does not',
            'isnt':'is not', 'shouldnt':'should not', 'favourite':'favorite','travelling':'traveling',
            'counselling':'counseling', 'theatre':'theater', 'cancelled':'canceled', 'labour':'labor',
            'organisation':'organization', 'wwii':'world war 2', 'citicise':'criticize', 'instagram': 'social medium',
            'whatsapp': 'social medium', 'snapchat': 'social medium', 'behaviour': 'behavior', 'realise': 'realize',
            'defence': 'defense', 'programme': 'program', 'upvotes': 'votes', 'grey': 'gray', 'btech': 'bachelor of technology',
            'mtech': 'master of technology', 'cryptocurrency': 'digital currency', 'cryptocurrencies': 'digital currencies', 'bitcoin': 'digital currency',
            'bitcoins': 'digital currency', 'Bitcoin': 'digital currency', 'Btech': 'Bachelor of Technology', 'Isnt': 'Is not',
            'Snapchat': 'social medium', 'doesnt': 'does not', 'programmr': 'programmer', 'programr': 'programmer',
            'didnt': 'did not', 'blockchain': 'software technology', 'Shouldnt': 'Should not', 'Doesnt': 'Does not', 'isnt': 'is not',
            'programrs': 'programmers', 'currencys': 'currencies', 'honours': 'honors', 'upvote': 'vote', 'learnt': 'learned', 'licence': 'license',
            'Ethereum': 'digital currency', 'Whatis': 'What is', 'bcom': 'bachelor of commerce', 'aluminium': 'aluminum', 'favour': 'favor',
            'Pinterest': 'social medium', 'cheque': 'check', 'judgement': 'judgment', 'modelling': 'modeling', 'Xiaomi': 'phone', 'Coursera': 'online platform',
            'Quora': 'online platform', 'OnePlus': 'phone', 'wasnt': 'was not', 'recognise': 'recognize', 
            'organisation': 'organization', 'organisations': 'organizations', 'colour': 'color', 'colours': 'colors', 'coloured': 'colored',
            'Fortnite': 'video game', 'centres': 'centers', 
            'Quorans': 'people', "Quoras": "online platform's", 'jewellery': 'jewelry store', 'Lyft': 'ride sharing platform',
            'Didnt': 'Did not', 'practise': 'practice', 'vape': 'smoke', 'WeChat': 'social medium', 'analyse': 'analyze', 'travelled': 'traveled',
            'recognised': 'recognized', 'GDPR': 'privacy bill', 'neighbours': 'neighbors', 'demonetisation': 'demonetization', 'programmes': 'programs',
            'Blockchain': 'software technology', 'Nodejs': 'software technology', 'Coinbase': 'online platform', 'litre': 'liter', 'upvoted': 'voted',
            'sulphuric': 'sulfuric', 'Musks': "Musk's", 'neighbour': 'neighbor', 'selfies': 'photos', 'tyres': 'tires', 'ICOs': 'initial coin offerings',
            'Wasnt': 'Was not', 'realised': 'realized', 'specialisation': 'specialization', 'ethereum': 'digital currency', 'tyre': 'tire',
            'organised': 'organized', 'traveller': 'traveler', 'downvote': 'vote against', 'selfie': 'photo', 'Udacity': 'online platform', 'offence': 'offense',
            'litres': 'liters', 'vapour': 'vapor', 'Qoura': 'online platform', 'fibre': 'fiber', 'aeroplane': 'airplane', 'laymans': 'laymen', 'humour': 'humor',
            'utilise': 'utilize', 'civilisation': 'civilization', 'sulphur': 'sulfur', 'archaeology': 'archeology', 'masterbate': 'masturbate', 'Upwork': 'online platform',
            'neurotypicals': 'non-autistic people', 'criticise': 'criticize', 'organise': 'organize', 'labelled': 'labeled', 'cosx': 'cosine x',
            'judgemental': 'judgmental', 'dreamt': 'dreamed', 'Xamarin': 'medicin', 'MOOCs': 'online classes', 'emojis': 'smileys', 'Unacademy': 'online platform',
            'neighbouring': 'neighboring', 'cancelling': 'canceling', 'numericals': 'numerical', 'honour': 'honor', 'globalisation': 'globalization',
            'practising': 'practicing', 'WooCommerce': 'software technology', 'behavioural': 'behavioral', 'masterbation': 'masturbation', 'AngularJS': 'software technology',
            'wwwyoutubecom': 'online platform', 'Terroristan': 'terrorist Pakistan', 'terroristan': 'terrorist Pakistan', 
            'BIMARU': 'Bihar, Madhya Pradesh, Rajasthan, Uttar Pradesh', 'Hinduphobic': 'Hindu phobic', 'hinduphobic': 'Hindu phobic', 'Hinduphobia': 'Hindu phobic', 
            'hinduphobia': 'Hindu phobic', 'Babchenko': 'Arkady Arkadyevich Babchenko faked death', 'Boshniaks': 'Bosniaks',
            'Dravidanadu': 'Dravida Nadu', 'mysoginists': 'misogynists', 'MGTOWS': 'Men Going Their Own Way', 'unsincere': 'insincere',
            'meninism': 'male feminism', 'jewplicate': 'jewish replicate', 'unoin': 'Union', 'daesh': 'Islamic State of Iraq and the Levant',
            'Kalergi': 'Coudenhove-Kalergi', 'Bhakts': 'Bhakt', 'bhakts': 'Bhakt', 'Tambrahms': 'Tamil Brahmin', 'Pahul': 'Amrit Sanskar',
            'SJW': 'social justice warrior', 'SJWs': 'social justice warrior', 'incel': ' involuntary celibates', 'incels': ' involuntary celibates',
            'emiratis': 'Emiratis', 'weatern': 'western', 'westernise': 'westernize', 'Pizzagate': 'Pizzagate conspiracy theory', 'naïve': 'naive',
            'Skripal': 'Sergei Skripal', 'Remainers': 'British remainer', 'remainers': 'British remainer', 'bremainer': 'British remainer',
            'antibrahmin': 'anti Brahminism', 'HYPSM': 'Harvard, Yale, Princeton, Stanford, MIT', 'HYPS': 'Harvard, Yale, Princeton, Stanford',
            'kompromat': 'compromising material', 'Tharki': 'pervert', 'tharki': 'pervert', 'mastuburate': 'masturbate', 'Zoë': 'Zoe',
            'indans': 'Indian', 'xender': 'gender', 'Naxali ': 'Naxalite ', 'Naxalities': 'Naxalites', 'Bathla': 'Namit Bathla', 
            'Mewani': 'Indian politician Jignesh Mevani', 'clichéd': 'cliche', 'cliché': 'cliche', 'clichés': 'cliche', 'Wjy': 'Why',
            'Fadnavis': 'Indian politician Devendra Fadnavis', 'Awadesh': 'Indian engineer Awdhesh Singh', 'Awdhesh': 'Indian engineer Awdhesh Singh',
            'Khalistanis': 'Sikh separatist movement', 'madheshi': 'Madheshi', 'BNBR': 'Be Nice, Be Respectful', 'Bolsonaro': 'Jair Bolsonaro',
            'XXXTentacion': 'Tentacion', 'Padmavat': 'Indian Movie Padmaavat', 'Žižek': 'Slovenian philosopher Slavoj Žižek', 'Adityanath': 'Indian monk Yogi Adityanath',
            'Brexit': 'British Exit', 'Brexiter': 'British Exit supporter', 'Brexiters': 'British Exit supporters', 'Brexiteer': 'British Exit supporter',
            'Brexiteers': 'British Exit supporters', 'Brexiting': 'British Exit', 'Brexitosis': 'British Exit disorder', 'brexit': 'British Exit',
            'brexiters': 'British Exit supporters', 'jallikattu': 'Jallikattu', 'fortnite': 'Fortnite ', 'Swachh': 'Swachh Bharat mission campaign ',
            'Quorans': 'Quoran', 'Qoura ': 'Quora ', 'quoras': 'Quora', 'Quroa': 'Quora', 'QUORA': 'Quora', 'narcissit': 'narcissist', 'Doklam': 'Tibet',
            'Drumpf': 'Donald Trump fool', 'Drumpfs': 'Donald Trump fools', 'Strzok': 'Hillary Clinton scandal', 'rohingya': 'Rohingya ',
            'wumao': 'cheap Chinese stuff', 'wumaos': 'cheap Chinese stuff', 'Sanghis': 'Sanghi', 'Tamilans': 'Tamils', 'biharis': 'Biharis',
            'Rejuvalex': 'hair growth formula', 'Feku': 'Fake', 'deplorables': 'deplorable', 'muhajirs': 'Muslim immigrant', 'Gujratis': 'Gujarati',
            'Chutiya': 'Fucker', 'Chutiyas': 'Fucker', 'thighing': 'masturbate', '卐': 'Nazi Germany', 'Pribumi': 'Native Indonesian',
            'Gurmehar': 'Gurmehar Kaur Indian student activist', 'Novichok': 'Soviet Union agents', 'Khazari': 'Khazars', 'Demonetization': 'demonetization',
            'demonetisation': 'demonetization', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization',
            'cryptocurrencies': 'cryptocurrency', 'Hindians': 'North Indian who hate British', 'vaxxer': 'vocal nationalist ', 'remoaner': 'remainer ',
            'bremoaner': 'British remainer ', 'Jewism': 'Judaism', 'Eroupian': 'European', 'WMAF': 'White male married Asian female', 'moeslim': 'Muslim',
            'cishet': 'cisgender and heterosexual person', 'Eurocentric': 'Eurocentrism ', 'Jewdar': 'Jew dar', 'Asifa': 'abduction, rape, murder case ',
            'marathis': 'Marathi', 'Trumpanzees': 'Trump chimpanzee fool', 'Crimean': 'Crimea people ', 'atrracted': 'attract', 
            'LGBT': 'lesbian, gay, bisexual, transgender', 'Boshniak': 'Bosniaks ', 'Myeshia': 'widow of Green Beret killed in Niger', 'demcoratic': 'Democratic',
            'raaping': 'rape', 'Dönmeh': 'Islam', 'feminazism': 'feminism nazi', 'langague': 'language', 'Hongkongese': 'HongKong people',
            'hongkongese': 'HongKong people', 'Kashmirians': 'Kashmirian', 'Chodu': 'fucker', 'penish': 'penis', 'micropenis': 'tiny penis', 
            'Madridiots': 'Real Madrid idiot supporters', 'Ambedkarite': 'Dalit Buddhist movement ', 'ReleaseTheMemo': 'cry for the right and Trump supporters',
            'harrase': 'harass', 'Barracoon': 'Black slave', 'Castrater': 'castration', 'castrater': 'castration', 'Rapistan': 'Pakistan rapist', 
            'rapistan': 'Pakistan rapist', 'Turkified': 'Turkification', 'turkified': 'Turkification', 'Dumbassistan': 'dumb ass Pakistan',
            'facetards': 'Facebook retards', 'rapefugees': 'rapist refugee', 'superficious': 'superficial', 'colour': 'color', 'centre': 'center',
            'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 
            'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'sallary': 'salary',
            'Whta': 'What', 'narcisist': 'narcissist', 'narcissit': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
            'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
            'mastrubation': 'masturbation', 'mastrubate': 'masturbate', 'mastrubating': 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum',
            'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess',
            'whst': 'what', 'watsapp': 'whatsapp', 'bodyshame': 'body shaming', 'bodyshoppers': 'body shopping', 'bodycams': 'body cams',
            'Cananybody': 'Can any body', 'deadbody': 'dead body', 'deaddict': 'de addict', 'Northindian': 'North Indian ', 'northindian': 'north Indian ',
            'northkorea': 'North Korea', 'Whykorean': 'Why Korean', 'koreaboo': 'Korea boo ', 'Brexshit': 'British Exit bullshit', 'shithole': 'shithole ',
            'shitpost': 'shit post', 'shitslam': 'shit Islam', 'shitlords': 'shit lords', 'Fck': 'Fuck', 'fck': 'fuck', 'Clickbait': 'click bait ',
            'clickbait': 'click bait ', 'mailbait': 'mail bait', 'healhtcare': 'healthcare', 'trollbots': 'troll bots', 'trollled': 'trolled',
            'trollimg': 'trolling', 'cybertrolling': 'cyber trolling', 'sickular': 'India sick secular ', 'suckimg': 'sucking', 'Idiotism': 'idiotism',
            'Niggerism': 'Nigger', 'Niggeriah': 'Nigger'}

# File paths of the pre-trained embeddings (one word2vec .bin file and two .txt
# files), given relative to the notebook's working directory
EMBEDDING_FILE_GOOGLE = '../data/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
EMBEDDING_FILE_GLOVE = '../data/embeddings/glove.840B.300d/glove.840B.300d.txt'
EMBEDDING_FILE_PARA = '../data/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

## 2. Prepare data for using sequence models

In [None]:
# Pull the labels out as an array, then preprocess copies of both frames
# using the WORD_MAP replacements
train_Y = training["target"].copy().values
train, test = preprocess_text_for_dl(
    training.copy(),
    testing.copy(),
    puncts_ignore='/-',
    puncts_retain='&',
    word_map=WORD_MAP,
)

In [None]:
# The raw frames are no longer needed: drop them and force a collection
# to reduce the risk of memory errors later on
del training
del testing
gc.collect()
print('Preprocessing is done!')

## 3. Tokenize text

In [None]:
# Build the tokenizer from both frames, then tokenize and pad the question text
# to at most `max_words` tokens
tokenizer = fit_tokenizer(train, test, text_col="question_text")
train_X, test_X = tokenize_and_pad(
    tokenizer,
    train,
    test,
    text_col="question_text",
    id_col="qid",
    max_words=max_words,
)

In [None]:
# The preprocessed frames are no longer needed: drop them and force a
# collection to reduce the risk of memory errors later on
del train
del test
gc.collect()
print('Tokenization is done!')

## 4. Load embeddings
Note that you can only use 2 out of 3 embeddings. Otherwise you will get a MemoryError.

In [None]:
# Build one embedding matrix per pre-trained source, strictly one source at a
# time: load the raw vectors, project them onto the tokenizer vocabulary, then
# free the raw dictionary before the next file is read. This keeps at most one
# raw embedding dictionary in memory, instead of all of them simultaneously,
# which is what risks a MemoryError.
#
# Only the Paragram and GloVe matrices feed the averaged matrix below, so the
# Google (word2vec) vectors are not loaded. To include them, repeat the same
# load / create / delete pattern with
# load_word2vec(EMBEDDING_FILE_GOOGLE, return_as_dict=True) and add the
# resulting matrix to the np.mean call.

# Paragram embeddings
embedding_dict_para = load_word_embedding(EMBEDDING_FILE_PARA)
embedding_matrix_para = create_embedding_matrix(tokenizer.word_index, embedding_dict_para)
del embedding_dict_para
gc.collect()
print('Para embeddings are loaded!')

# GloVe embeddings
embedding_dict_glove = load_word_embedding(EMBEDDING_FILE_GLOVE)
embedding_matrix_glove = create_embedding_matrix(tokenizer.word_index, embedding_dict_glove)
del embedding_dict_glove
gc.collect()
print('Glove embeddings are loaded!')

# Average the two matrices (mean over the stacking axis)
embedding_matrix = np.mean((embedding_matrix_para, embedding_matrix_glove), axis=0)

# The per-source matrices are no longer needed once the average exists
del embedding_matrix_glove, embedding_matrix_para
gc.collect()

In [None]:
#embedding_matrix = pd.read_pickle('embedding.pkl').values

## 5. Create model and make a prediction

In [None]:
# Cross-validate with 5 folds and collect the predictions for the test set
predictions = cross_validate_and_predict(
    train_X.values,
    train_Y,
    test_X.values,
    embedding_matrix,
    tokenizer.word_index,
    max_words,
    folds=5,
)

In [None]:
# Pair every test question id with its prediction cast to int, then write
# the result as a CSV without the index column
submission = pd.DataFrame(
    {
        "qid": test_X.reset_index()["qid"].values,
        "prediction": np.array(predictions, dtype=int),
    }
)
submission.to_csv("submission_fullscript.csv", index=False)
print("Submission file save to current working directory.")