In [135]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt 
train_df = pd.read_csv("train_df.csv")
test_df = pd.read_csv("test_df.csv")

## Preprocessing training data

In [136]:
train_df = train_df.head(500)
print(train_df)

                      qid                                      question_text  \
0    dda0b0efc8ba86e81ec4  What are interesting facts about Microsoft his...   
1    dc708b74a108d0fc0ad9  What are those things which are not gonna happ...   
2    06a27ec5d82dacd8bfe0  What should I know to avoid being "upsold" whe...   
3    00cbb6b17e3ceb7c5358           How I add any account with payment bank?   
4    7c304888973a701585a0  Which Multi level marketing products are actua...   
..                    ...                                                ...   
495  6ecd29ec852839f36bd2  Can wisdom teeth in the top jaw cause maxilla ...   
496  c4a83bbbf6fdc9041efc  Is it okay to still marry your lover even if y...   
497  895059d9b8a42b88f002  Is hoping for an apocalyptic event, or a termi...   
498  f204815516e3b5e96486            Is there any book about cryptocurrency?   
499  cd4069e36c171a9cd36b  If atheist are so peaceful, then how come athe...   

     target  
0         0  
1         0

In [137]:
train_df.describe()

Unnamed: 0,target
count,500.0
mean,0.074
std,0.262033
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [138]:
train_df.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [139]:
train_df["target"].unique()

array([0, 1])

In [140]:
train_df.index[train_df.duplicated()]

Int64Index([], dtype='int64')

## Preprocessing testing data

In [141]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


In [142]:
test_df.isna().sum()

qid              0
question_text    0
dtype: int64

## Cleaning the text
When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

Common data cleaning steps on all text:

Make text all lower case
Remove punctuation
Remove numerical values
Remove common non-sensical text (/n)
Tokenize text
Remove stop words
More data cleaning steps after tokenization:

Stemming / lemmatization
Parts of speech tagging
Create bi-grams or tri-grams
Deal with typos
And more...

In [143]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [144]:
train_df.question_text = train_df.question_text.apply(round1)
train_df.question_text

0      what are interesting facts about microsoft his...
1      what are those things which are not gonna happ...
2      what should i know to avoid being upsold when ...
3                how i add any account with payment bank
4      which multi level marketing products are actua...
                             ...                        
495    can wisdom teeth in the top jaw cause maxilla ...
496    is it okay to still marry your lover even if y...
497    is hoping for an apocalyptic event or a termin...
498               is there any book about cryptocurrency
499    if atheist are so peaceful then how come athei...
Name: question_text, Length: 500, dtype: object

In [145]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text
round2 = lambda x: clean_text_round2(x)

In [146]:
train_df.question_text= train_df.question_text.apply(round2)
train_df.question_text

0      what are interesting facts about microsoft his...
1      what are those things which are not gonna happ...
2      what should i know to avoid being upsold when ...
3                how i add any account with payment bank
4      which multi level marketing products are actua...
                             ...                        
495    can wisdom teeth in the top jaw cause maxilla ...
496    is it okay to still marry your lover even if y...
497    is hoping for an apocalyptic event or a termin...
498               is there any book about cryptocurrency
499    if atheist are so peaceful then how come athei...
Name: question_text, Length: 500, dtype: object

In [147]:
train_df.question_text

0      what are interesting facts about microsoft his...
1      what are those things which are not gonna happ...
2      what should i know to avoid being upsold when ...
3                how i add any account with payment bank
4      which multi level marketing products are actua...
                             ...                        
495    can wisdom teeth in the top jaw cause maxilla ...
496    is it okay to still marry your lover even if y...
497    is hoping for an apocalyptic event or a termin...
498               is there any book about cryptocurrency
499    if atheist are so peaceful then how come athei...
Name: question_text, Length: 500, dtype: object

In [148]:
# Tokenization
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
train_df['question_text']= train_df['question_text'].apply(lambda x: tokenization(x))
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,[what are interesting facts about microsoft hi...,0
1,dc708b74a108d0fc0ad9,[what are those things which are not gonna hap...,0
2,06a27ec5d82dacd8bfe0,[what should i know to avoid being upsold when...,0
3,00cbb6b17e3ceb7c5358,[how i add any account with payment bank],0
4,7c304888973a701585a0,[which multi level marketing products are actu...,0
...,...,...,...
495,6ecd29ec852839f36bd2,[can wisdom teeth in the top jaw cause maxilla...,0
496,c4a83bbbf6fdc9041efc,[is it okay to still marry your lover even if ...,0
497,895059d9b8a42b88f002,[is hoping for an apocalyptic event or a termi...,0
498,f204815516e3b5e96486,[is there any book about cryptocurrency],0


In [156]:
from nltk.corpus import stopwords
nltk.download('stopwords')
train_df.question_text = [word for word in train_df.question_text if not word in stopwords.words()]
train_df

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,[what are interesting facts about microsoft hi...,0
1,dc708b74a108d0fc0ad9,[what are those things which are not gonna hap...,0
2,06a27ec5d82dacd8bfe0,[what should i know to avoid being upsold when...,0
3,00cbb6b17e3ceb7c5358,[how i add any account with payment bank],0
4,7c304888973a701585a0,[which multi level marketing products are actu...,0
...,...,...,...
495,6ecd29ec852839f36bd2,[can wisdom teeth in the top jaw cause maxilla...,0
496,c4a83bbbf6fdc9041efc,[is it okay to still marry your lover even if ...,0
497,895059d9b8a42b88f002,[is hoping for an apocalyptic event or a termi...,0
498,f204815516e3b5e96486,[is there any book about cryptocurr],0


In [150]:
import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [151]:
train_df['question_text'] = train_df['question_text'].apply(lambda x: [ps.stem(y) for y in x]) # Stem every word.
# train_df = train_df.drop(columns=['question_text']) # Get rid of the unstemmed column.

In [152]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,[what are interesting facts about microsoft hi...,0
1,dc708b74a108d0fc0ad9,[what are those things which are not gonna hap...,0
2,06a27ec5d82dacd8bfe0,[what should i know to avoid being upsold when...,0
3,00cbb6b17e3ceb7c5358,[how i add any account with payment bank],0
4,7c304888973a701585a0,[which multi level marketing products are actu...,0
...,...,...,...
495,6ecd29ec852839f36bd2,[can wisdom teeth in the top jaw cause maxilla...,0
496,c4a83bbbf6fdc9041efc,[is it okay to still marry your lover even if ...,0
497,895059d9b8a42b88f002,[is hoping for an apocalyptic event or a termi...,0
498,f204815516e3b5e96486,[is there any book about cryptocurr],0


In [153]:
# # We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
# from sklearn.feature_extraction.text import CountVectorizer

# cv = CountVectorizer(stop_words='english')
# data_cv = cv.fit_transform(train_df.stemmed)
# data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
# data_dtm.index = train_df.index
# data_dtm