In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import spacy
import re

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torch.nn as nn
import torch.optim as optim

import warnings

# Suppress every warning for the rest of the session (this also hides
# deprecation notices from pandas / sklearn; drop this line when debugging).
warnings.filterwarnings('ignore')
# None removes the cap on the number of columns pandas prints for a frame.
pd.set_option('display.max_columns', None)
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Read the three CSV files from the working directory in one pass:
# the labelled training set, the test set and the sample submission.
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Exploration

In [3]:
# Shape and column summary of each frame; a dashed rule separates train from test.
for position, frame in enumerate((train, test)):
    if position > 0:
        print("-----------------")
    print(frame.shape)
    # info() writes its report to stdout and returns None, so "None" is printed too.
    print(frame.info())

(1306122, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1306122 non-null  object
 1   question_text  1306122 non-null  object
 2   target         1306122 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 29.9+ MB
None
-----------------
(375806, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375806 entries, 0 to 375805
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   qid            375806 non-null  object
 1   question_text  375806 non-null  object
dtypes: object(2)
memory usage: 5.7+ MB
None


In [4]:
# Preview the first five training rows (five is the default for head()).
train.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
# Preview the first five test rows (five is the default for head()).
test.head(n=5)

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [6]:
# List the entries found under the local embeddings/ directory.
embedding_entries = os.listdir('embeddings/')
print('Available Embeddings: ', embedding_entries)

Available Embeddings:  ['glove.840B.300d', 'GoogleNews-vectors-negative300', 'paragram_300_sl999', 'wiki-news-300d-1M']


In [7]:
# Number of rows per value of the target column.
target_counts = train['target'].value_counts()
target_counts

0    1225312
1      80810
Name: target, dtype: int64

- The classes are highly imbalanced: only about 6% of the training questions (80,810 of 1,306,122) are labelled insincere (`target = 1`).

In [8]:
# Longest question per split, measured in whitespace-separated tokens.
# default=0 mirrors the original starting value when a split has no rows.
maxlen_train = max(
    (len(question.split()) for question in train['question_text']),
    default=0,
)
maxlen_test = max(
    (len(question.split()) for question in test['question_text']),
    default=0,
)

print("Max len question in train:", maxlen_train)
print("Max len question in test:", maxlen_test)

Max len question in train: 134
Max len question in test: 87


# Data Cleaning

In [15]:
# Missing-value count per column, train first and then test.
for frame in (train, test):
    print(frame.isnull().sum())

qid              0
question_text    0
target           0
dtype: int64
qid              0
question_text    0
dtype: int64


### Punctuation and numbers

In [9]:
# Single-character symbols that clean_text() isolates with surrounding spaces.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/',
          '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', '_',
          '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×',
          '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−',
          '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—',
          '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆',
          'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、',
          '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï',
          'Ø', '¹', '≤', '‡', '√', ]


def clean_text(x):
    """Return ``str(x)`` with every symbol listed in ``puncts`` padded by one space on each side.

    Symbols are handled one after another in list order. Adjacent symbols
    therefore end up separated by two spaces, and the result is not stripped.
    """
    padded = str(x)
    for symbol in puncts:
        # Cutting on the symbol and re-joining with the padded symbol is the
        # same operation as replacing every occurrence of it.
        padded = (' ' + symbol + ' ').join(padded.split(symbol))
    return padded


def clean_numbers(x):
    """Mask runs of digits in ``x`` with ``#`` placeholders.

    A run of five or more digits becomes ``#####``; after that, each remaining
    group of four, three and two digits becomes ``####``, ``###`` and ``##``.
    A lone digit is left as it is.

    Parameters
    ----------
    x : str
        Text to process.

    Returns
    -------
    str
        The text with digit runs masked.
    """
    # The quantifier must be written "{5,}" with no space: Python's re module
    # treats "{5, }" as literal characters, so long runs were never matched.
    x = re.sub(r'[0-9]{5,}', '#####', x)
    x = re.sub(r'[0-9]{4}', '####', x)
    x = re.sub(r'[0-9]{3}', '###', x)
    x = re.sub(r'[0-9]{2}', '##', x)
    return x

### Spelling

In [10]:
# Contraction -> expanded form. Keys are lower-case.
# NOTE: the original literal listed "i'd" twice ("I would", then "I had") and
# "didn't" twice; only the last value of a repeated key survives, so the
# effective values are kept here and the dead duplicates are removed.
mispell_dict = {
    "aren't": "are not", "can't": "cannot",
    "couldn't": "could not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would",
    "he'll": "he will", "he's": "he is",
    "i'd": "I had", "i'll": "I will",
    "i'm": "I am", "isn't": "is not",
    "it's": "it is", "it'll": "it will",
    "i've": "I have", "let's": "let us",
    "mightn't": "might not", "mustn't": "must not",
    "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is",
    "shouldn't": "should not", "that's": "that is",
    "there's": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are",
    "they've": "they have", "we'd": "we would",
    "we're": "we are", "weren't": "were not",
    "we've": "we have", "what'll": "what will",
    "what're": "what are", "what's": "what is",
    "what've": "what have", "where's": "where is",
    "who'd": "who would", "who'll": "who will",
    "who're": "who are", "who's": "who is",
    "who've": "who have", "won't": "will not",
    "wouldn't": "would not", "you'd": "you would",
    "you'll": "you will", "you're": "you are",
    "you've": "you have",
    # Bare suffix; the leading space in the value keeps it apart from the
    # word it was attached to.
    "'re": " are",
    "wasn't": "was not",
    # Was " will", which dropped the subject ("we'll go" -> " will go").
    "we'll": "we will",
    "tryin'": "trying",
}


def get_mispell(mispell_dict):
    """Compile one regex that matches any key of ``mispell_dict``.

    Parameters
    ----------
    mispell_dict : dict
        Maps the text to find (str) to its replacement (str).

    Returns
    -------
    tuple
        ``(mispell_dict, pattern)``: the same dict object that was passed in,
        and a compiled pattern whose ``group(0)`` is always one of its keys.
    """
    # Longest keys first, so a short key can never shadow a longer key that
    # starts with it. sorted() is stable, so equal-length keys keep dict order.
    keys = sorted(mispell_dict, key=len, reverse=True)
    if keys:
        # Escape each key so regex metacharacters in it are matched literally.
        pattern = '(%s)' % '|'.join(re.escape(key) for key in keys)
    else:
        # An empty alternation "()" would match the empty string everywhere;
        # "(?!)" is a pattern that can never match.
        pattern = '(?!)'
    mispell_re = re.compile(pattern)
    return mispell_dict, mispell_re


def replace_typical_misspell(text):
    """Return ``text`` with each match of ``mispellings_re`` swapped for its ``mispellings`` entry.

    Both ``mispellings`` and ``mispellings_re`` are module-level names that are
    looked up when the function is called, so they must be defined first.
    """
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [11]:
# Build the lookup dict and its compiled pattern, then display both.
mispell_lookup = get_mispell(mispell_dict)
mispellings, mispellings_re = mispell_lookup
mispell_lookup

({"aren't": 'are not',
  "can't": 'cannot',
  "couldn't": 'could not',
  "didn't": 'did not',
  "doesn't": 'does not',
  "don't": 'do not',
  "hadn't": 'had not',
  "hasn't": 'has not',
  "haven't": 'have not',
  "he'd": 'he would',
  "he'll": 'he will',
  "he's": 'he is',
  "i'd": 'I had',
  "i'll": 'I will',
  "i'm": 'I am',
  "isn't": 'is not',
  "it's": 'it is',
  "it'll": 'it will',
  "i've": 'I have',
  "let's": 'let us',
  "mightn't": 'might not',
  "mustn't": 'must not',
  "shan't": 'shall not',
  "she'd": 'she would',
  "she'll": 'she will',
  "she's": 'she is',
  "shouldn't": 'should not',
  "that's": 'that is',
  "there's": 'there is',
  "they'd": 'they would',
  "they'll": 'they will',
  "they're": 'they are',
  "they've": 'they have',
  "we'd": 'we would',
  "we're": 'we are',
  "weren't": 'were not',
  "we've": 'we have',
  "what'll": 'what will',
  "what're": 'what are',
  "what's": 'what is',
  "what've": 'what have',
  "where's": 'where is',
  "who'd": 'who would',
  "

In [12]:
# Print the questions labelled 0-9, each followed by a dashed rule.
for row_label in range(10):
    print(train['question_text'][row_label], "---------------", sep="\n")

How did Quebec nationalists see their province as a nation in the 1960s?
---------------
Do you have an adopted dog, how would you encourage people to adopt and not shop?
---------------
Why does velocity affect time? Does velocity affect space geometry?
---------------
How did Otto von Guericke used the Magdeburg hemispheres?
---------------
Can I convert montra helicon D to a mountain bike by just changing the tyres?
---------------
Is Gaza slowly becoming Auschwitz, Dachau or Treblinka for Palestinians?
---------------
Why does Quora automatically ban conservative opinions when reported, but does not do the same for liberal views?
---------------
Is it crazy if I wash or wipe my groceries off? Germs are everywhere.
---------------
Is there such a thing as dressing moderately, and if so, how is that different than dressing modestly?
---------------
Is it just me or have you ever been in this phase wherein you became ignorant to the people you once loved, completely disregarding their

In [13]:
# Clean both frames with the same pipeline.
# Order matters: contractions must be expanded BEFORE clean_text() runs.
# clean_text() pads every apostrophe with spaces ("don't" -> "don ' t"), and
# once that has happened no contraction key can match any more.
for frame in (train, test):
    # Lower-case first: the contraction keys are lower-case.
    questions = frame["question_text"].apply(lambda x: x.lower())
    # Expand contractions / fix common spellings
    questions = questions.apply(replace_typical_misspell)
    # Pad punctuation with spaces
    questions = questions.apply(clean_text)
    # Mask digit runs with '#'
    frame["question_text"] = questions.apply(clean_numbers)

In [14]:
# Print the questions labelled 0-9 as they now stand, each followed by a dashed rule.
rule = "---------------"
for row_label in range(10):
    current_question = train['question_text'][row_label]
    print(current_question)
    print(rule)

how did quebec nationalists see their province as a nation in the ####s ? 
---------------
do you have an adopted dog ,  how would you encourage people to adopt and not shop ? 
---------------
why does velocity affect time ?  does velocity affect space geometry ? 
---------------
how did otto von guericke used the magdeburg hemispheres ? 
---------------
can i convert montra helicon d to a mountain bike by just changing the tyres ? 
---------------
is gaza slowly becoming auschwitz ,  dachau or treblinka for palestinians ? 
---------------
why does quora automatically ban conservative opinions when reported ,  but does not do the same for liberal views ? 
---------------
is it crazy if i wash or wipe my groceries off ?  germs are everywhere . 
---------------
is there such a thing as dressing moderately ,  and if so ,  how is that different than dressing modestly ? 
---------------
is it just me or have you ever been in this phase wherein you became ignorant to the people you once love