## Importing important libraries and reading the training and testing data

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt 
import re
import string
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize,word_tokenize
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the labelled training questions and the unlabelled test questions
# from the two CSV files that sit next to the notebook.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_df.csv", "test_df.csv")
)

## EDA of training data

### Checking for null values, invalid target values and duplicated rows

In [3]:
train_df.describe()

Unnamed: 0,target
count,1000000.0
mean,0.06187
std,0.240919
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [4]:
train_df.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [5]:
train_df["target"].unique()

array([0, 1])

In [6]:
# Index labels of rows that exactly repeat an earlier row. duplicated() is called
# without `subset`, so every column (qid included) takes part in the comparison.
# NOTE(review): the same question text stored under a different qid would not be
# reported here -- consider duplicated(subset="question_text") if that matters.
train_df.index[train_df.duplicated()]

Int64Index([], dtype='int64')

## EDA of testing data

### Checking for null values and duplicated rows

In [7]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


In [8]:
test_df.isna().sum()

qid              0
question_text    0
dtype: int64

In [9]:
# Index labels of rows that exactly repeat an earlier row. duplicated() is called
# without `subset`, so every column (qid included) takes part in the comparison.
# NOTE(review): the same question text stored under a different qid would not be
# reported here -- consider duplicated(subset="question_text") if that matters.
test_df.index[test_df.duplicated()]

Int64Index([], dtype='int64')

## Preprocessing training and testing data

### Cleaning the text
Common data cleaning steps on all text:

1. Make all text lowercase

2. Remove punctuation

3. Remove numerical values

4. Remove common nonsensical text (e.g. `\n` newline characters)

5. Tokenize text

6. Remove stop words

7. Stemming / lemmatization


### Round 1 performs the following steps:
1. Making the text lower case.
2. Removing text in square brackets
3. Removing punctuation marks from the text
4. Removing words containing numbers.

In [10]:
# Patterns are compiled once at definition time instead of being rebuilt for every
# row. Raw strings are used because backslash-bracket, backslash-w and backslash-d
# are invalid escape sequences in ordinary string literals and trigger a
# SyntaxWarning on recent Python versions.
_SQUARE_BRACKET_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw question text.

    Returns
    -------
    str
        The cleaned text. Removed spans are replaced by the empty string, so the
        whitespace that surrounded them is kept (runs of spaces can remain).
    '''
    text = text.lower()
    # Bracketed spans go first, while the '[' and ']' delimiters still exist;
    # the punctuation pass below would otherwise strip them.
    text = _SQUARE_BRACKET_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    # Any token that contains at least one digit is dropped entirely.
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    return text


def round1(x):
    '''Element-wise wrapper around clean_text_round1 for use with Series.apply.'''
    return clean_text_round1(x)

In [11]:
# First cleaning pass over the question column of both frames.
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].apply(round1)

### Round 2 performs the following steps:
1. Getting rid of additional punctuation
2. Removing some non-sensical text
3. Removing urls
4. Removing HTML tags

In [12]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: strip curly quotes/ellipsis, newlines, URLs and HTML tags.

    Each pattern below is deleted (replaced by the empty string), in order:
    typographic quotes and the ellipsis character, newline characters,
    anything starting with "http" up to the next whitespace, and the shortest
    "<...>" spans.

    Note that newlines are removed without inserting a space, so two words
    separated only by a line break end up joined.

    NOTE(review): in this notebook round 1 runs first and already strips every
    character in string.punctuation, which includes '<' and '>', so the tag
    pattern cannot match at that point.
    '''
    for pattern in ('[‘’“”…]', '\n', r"http\S+", '<.*?>'):
        text = re.sub(pattern, '', text)
    return text


def round2(x):
    '''Element-wise wrapper around clean_text_round2 for use with Series.apply.'''
    return clean_text_round2(x)

In [13]:
# Second cleaning pass over both frames, then preview the cleaned test questions.
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].apply(round2)
test_df.question_text

0         my period is due on my wedding day how can i s...
1         how many numbers higher than a million can be ...
2         how come i feel nothing for my family but stil...
3         in case of collapse of the democratic party wi...
4                                     who is émile naoumoff
                                ...                        
306117    did anyone get an update on maruti suzuki all ...
306118    what  people in history do you find the most i...
306119              how can i remove the tan on my forehead
306120    if you are a well known hacker will you be mor...
306121    if your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

### Tokenization
Tokenization is the process of segmenting running text into sentences and words. In essence, it is the task of cutting a text into pieces called tokens.

Here we use a word tokenizer, i.e. each word becomes a token.

In [14]:
def tokenization(text):
    """Split one cleaned question string into word tokens via NLTK's word_tokenize."""
    tokens = word_tokenize(text)
    return tokens


# Replace each question string with its list of tokens, in both frames.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(tokenization)

In [15]:
train_df.question_text

0         [what, are, interesting, facts, about, microso...
1         [what, are, those, things, which, are, not, go...
2         [what, should, i, know, to, avoid, being, upso...
3          [how, i, add, any, account, with, payment, bank]
4         [which, multi, level, marketing, products, are...
                                ...                        
999995                     [how, is, cse, at, vit, chennai]
999996    [how, can, we, prevent, a, holocaust, by, robo...
999997    [how, can, i, help, a, student, remember, key,...
999998    [what, is, the, difference, between, lace, clo...
999999    [what, happens, when, you, look, into, a, brok...
Name: question_text, Length: 1000000, dtype: object

In [16]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,"[my, period, is, due, on, my, wedding, day, ho..."
1,9914c62ed3f69684d549,"[how, many, numbers, higher, than, a, million,..."
2,8138ae48649e37091a91,"[how, come, i, feel, nothing, for, my, family,..."
3,981b4753d17ef14d09f7,"[in, case, of, collapse, of, the, democratic, ..."
4,452e2c705276ba16b7b7,"[who, is, émile, naoumoff]"
...,...,...
306117,a352dff4fcc2571815ce,"[did, anyone, get, an, update, on, maruti, suz..."
306118,ad4a8498d97c536c67b9,"[what, people, in, history, do, you, find, the..."
306119,19784a27b55d4b453fda,"[how, can, i, remove, the, tan, on, my, forehead]"
306120,370191dba26465997879,"[if, you, are, a, well, known, hacker, will, y..."


### Lemmatization
Lemmatization is a technique that uses morphological analysis to reduce a word to its root dictionary form, or “lemma”.

In [17]:
# Lemmatization
# One shared WordNetLemmatizer instance is created here and reused for every row.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """Return a new list holding the lemma of every token in ``text``.

    ``text`` is a list of word tokens (the output of the tokenization step).
    lemmatize() is called without a part-of-speech argument, so the
    lemmatizer's default is used for every token.
    """
    return [wordnet_lemmatizer.lemmatize(token) for token in text]


# Lemmatize the token lists of both frames, then preview the training frame.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(lemmatizer)
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[what, are, interesting, fact, about, microsof...",0
1,dc708b74a108d0fc0ad9,"[what, are, those, thing, which, are, not, gon...",0
2,06a27ec5d82dacd8bfe0,"[what, should, i, know, to, avoid, being, upso...",0
3,00cbb6b17e3ceb7c5358,"[how, i, add, any, account, with, payment, bank]",0
4,7c304888973a701585a0,"[which, multi, level, marketing, product, are,...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[how, is, cse, at, vit, chennai]",0
999996,e80edbfc086f7125940f,"[how, can, we, prevent, a, holocaust, by, robo...",0
999997,1506dfad6bd340782a1f,"[how, can, i, help, a, student, remember, key,...",0
999998,b56c60fd407f2f85553c,"[what, is, the, difference, between, lace, clo...",0


### Stop Words removal
Stop words are commonly occurring words that for some computational processes provide little information or in some cases introduce unnecessary noise and therefore need to be removed.

In [18]:
# Import the corpus reader under an alias so it is never overwritten. Building the
# list from the name `stopwords` itself would replace the module with a list, and
# running this cell a second time would then fail with AttributeError (a list has
# no `.words`). With the alias the cell can be re-run safely.
from nltk.corpus import stopwords as nltk_stopwords

# `stopwords` stays bound to the plain list of English stop words, which is what
# remove_stopwords looks tokens up in.
stopwords = nltk_stopwords.words('english')
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words, in original order.

    Parameters
    ----------
    text : iterable of str
        Tokens of one question.
    stop_words : container of str, optional
        Words to drop. When omitted, the notebook-level ``stopwords`` collection
        is used, which is the behaviour existing callers rely on. Passing a
        ``set`` makes each membership test O(1).

    Returns
    -------
    list of str
        A new list; the input is not modified.
    """
    if stop_words is None:
        # Fall back to the global list built from the NLTK corpus.
        stop_words = stopwords
    return [token for token in text if token not in stop_words]
# Drop stop words from the token lists of both frames.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(remove_stopwords)

In [19]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[interesting, fact, microsoft, history]",0
1,dc708b74a108d0fc0ad9,"[thing, gon, na, happen, ever]",0
2,06a27ec5d82dacd8bfe0,"[know, avoid, upsold, getting, car, brake, cha...",0
3,00cbb6b17e3ceb7c5358,"[add, account, payment, bank]",0
4,7c304888973a701585a0,"[multi, level, marketing, product, actually, w...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[cse, vit, chennai]",0
999996,e80edbfc086f7125940f,"[prevent, holocaust, robot, ai, alien]",0
999997,1506dfad6bd340782a1f,"[help, student, remember, key, step, informati...",0
999998,b56c60fd407f2f85553c,"[difference, lace, closure, lace, frontal]",0


### Creating sentences out of the remaining words 

In [20]:
def makeSentence(text):
    """Join an iterable of string tokens into one space-separated string.

    An empty iterable yields the empty string.
    """
    separator = ' '
    return separator.join(text)

# Collapse every token list back into a single space-separated string.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(makeSentence)

print(train_df.question_text)

0                        interesting fact microsoft history
1                                  thing gon na happen ever
2               know avoid upsold getting car brake changed
3                                  add account payment bank
4         multi level marketing product actually worth p...
                                ...                        
999995                                      cse vit chennai
999996                     prevent holocaust robot ai alien
999997    help student remember key step information wri...
999998                 difference lace closure lace frontal
999999                           happens look broken mirror
Name: question_text, Length: 1000000, dtype: object


In [21]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,period due wedding day stop pill option
1,9914c62ed3f69684d549,many number higher million formed digit
2,8138ae48649e37091a91,come feel nothing family still love pet friend...
3,981b4753d17ef14d09f7,case collapse democratic party republican part...
4,452e2c705276ba16b7b7,émile naoumoff
...,...,...
306117,a352dff4fcc2571815ce,anyone get update maruti suzuki india engineer...
306118,ad4a8498d97c536c67b9,people history find interesting
306119,19784a27b55d4b453fda,remove tan forehead
306120,370191dba26465997879,well known hacker prone hacked


In [22]:
# Write both preprocessed frames to CSV (without the index column) so the
# modelling part of the notebook can start from these files.
for frame, csv_path in (
    (train_df, "preprocessed_train.csv"),
    (test_df, "preprocessed_test.csv"),
):
    frame.to_csv(csv_path, index=False)

In [23]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [24]:
# Reload the preprocessed frames written by the cell that saved them to CSV.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("preprocessed_train.csv", "preprocessed_test.csv")
)

In [25]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,period due wedding day stop pill option
1,9914c62ed3f69684d549,many number higher million formed digit
2,8138ae48649e37091a91,come feel nothing family still love pet friend...
3,981b4753d17ef14d09f7,case collapse democratic party republican part...
4,452e2c705276ba16b7b7,émile naoumoff
...,...,...
306117,a352dff4fcc2571815ce,anyone get update maruti suzuki india engineer...
306118,ad4a8498d97c536c67b9,people history find interesting
306119,19784a27b55d4b453fda,remove tan forehead
306120,370191dba26465997879,well known hacker prone hacked


## Vectorization of the dataset

In [26]:

# NOTE: despite its name, `tfidf` holds a CountVectorizer (raw term counts, no
# TF-IDF weighting). The name is kept so that any later cell referring to it
# keeps working.
tfidf = CountVectorizer()

# Questions that became empty during preprocessing are written to CSV as blank
# fields, and pd.read_csv loads blank fields back as NaN. Without fillna(''),
# astype('U') would turn each NaN into the literal string 'nan' and a bogus
# "nan" token would enter the vocabulary and the count matrix.
# NOTE(review): the vocabulary is learned from all labelled rows, i.e. before the
# train/validation split made further down -- confirm that this is intended.
train = tfidf.fit_transform(train_df.question_text.fillna('').values.astype('U'))

# Encode the unlabelled test questions with the vocabulary learned above.
test_df_matrix = tfidf.transform(test_df.question_text.fillna('').values.astype('U'))
train

<1000000x178219 sparse matrix of type '<class 'numpy.int64'>'
	with 6244781 stored elements in Compressed Sparse Row format>

In [27]:
train

<1000000x178219 sparse matrix of type '<class 'numpy.int64'>'
	with 6244781 stored elements in Compressed Sparse Row format>

## Train/validation split of the labelled data (train_df.csv)

In [28]:

# Hold out 15% of the labelled rows for evaluation. Stratifying on the target
# preserves the class proportions in both parts, and the fixed random_state makes
# the split (and every pickle and metric derived from it) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    train_df.target.values,
    test_size=0.15,
    stratify=train_df.target.values,
    random_state=42,
)

# `train` / `test` are re-pointed at the split matrices, as before.
# NOTE(review): because `train` no longer refers to the full count matrix after
# this line, re-running this cell without re-running the vectorization cell first
# passes 85% of the rows together with the full target array.
train = X_train
test = X_test

X_train.shape

In [29]:
# print(train)

In [30]:
import pickle

def storeData(text,obj):
    """Pickle ``obj`` into the file ``<text>.pickle``, replacing any previous content.

    Parameters
    ----------
    text : str
        Path of the target file without the ``.pickle`` extension.
    obj : object
        Any picklable object.
    """
    # Binary *write* mode, not append: with 'ab' every re-run stacks another pickle
    # behind the old one, and a single pickle.load() then returns the first
    # (stale) object. The with-block closes the file even if dump() raises.
    with open(text + '.pickle', 'wb') as dbfile:
        pickle.dump(obj, dbfile)

In [31]:
# Persist each split and the vectorized test set under "<name>.pickle".
for pickle_name, payload in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
    ('test_df_matrix', test_df_matrix),
):
    storeData(pickle_name, payload)