In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt 
train_df = pd.read_csv("../input/datafiles/test_df.csv")
test_df = pd.read_csv("../input/datafiles/test_df.csv")

## Preprocessing training data

In [2]:
train_df = train_df
# print(train_df)

In [3]:
train_df.describe()

Unnamed: 0,qid,question_text
count,306122,306122
unique,306122,306122
top,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
freq,1,1


In [4]:
train_df.isna().sum()

qid              0
question_text    0
dtype: int64

In [5]:
# train_df["target"].unique()

In [6]:
train_df.index[train_df.duplicated()]

Int64Index([], dtype='int64')

## Preprocessing testing data

In [7]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


In [8]:
test_df.isna().sum()

qid              0
question_text    0
dtype: int64

## Cleaning the text
When dealing with numerical data, data cleaning often involves removing null values and duplicate data, dealing with outliers, etc. With text data, there are some common data cleaning techniques, which are also known as text pre-processing techniques.

With text data, this cleaning process can go on forever. There's always an exception to every cleaning step. So, we're going to follow the MVP (minimum viable product) approach - start simple and iterate. Here are a bunch of things you can do to clean your data. We're going to execute just the common cleaning steps here and the rest can be done at a later point to improve our results.

Common data cleaning steps on all text:

Make text all lower case
Remove punctuation
Remove numerical values
Remove common non-sensical text (/n)
Tokenize text
Remove stop words
More data cleaning steps after tokenization:

Stemming / lemmatization
Parts of speech tagging
Create bi-grams or tri-grams
Deal with typos
And more...

In [9]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [10]:
train_df.question_text = train_df.question_text.apply(round1)
train_df.question_text

0         my period is due on my wedding day how can i s...
1         how many numbers higher than a million can be ...
2         how come i feel nothing for my family but stil...
3         in case of collapse of the democratic party wi...
4                                     who is émile naoumoff
                                ...                        
306117    did anyone get an update on maruti suzuki all ...
306118    what  people in history do you find the most i...
306119              how can i remove the tan on my forehead
306120    if you are a well known hacker will you be mor...
306121    if your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

In [11]:
def  clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    return df
data_clean = clean_text(train_df, 'question_text', 'question_text')
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    return text
round2 = lambda x: clean_text_round2(x)

In [12]:
train_df.question_text= train_df.question_text.apply(round2)
train_df.question_text

0         my period is due on my wedding day how can i s...
1         how many numbers higher than a million can be ...
2         how come i feel nothing for my family but stil...
3         in case of collapse of the democratic party wi...
4                                      who is mile naoumoff
                                ...                        
306117    did anyone get an update on maruti suzuki all ...
306118    what  people in history do you find the most i...
306119              how can i remove the tan on my forehead
306120    if you are a well known hacker will you be mor...
306121    if your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

In [13]:
train_df.question_text

0         my period is due on my wedding day how can i s...
1         how many numbers higher than a million can be ...
2         how come i feel nothing for my family but stil...
3         in case of collapse of the democratic party wi...
4                                      who is mile naoumoff
                                ...                        
306117    did anyone get an update on maruti suzuki all ...
306118    what  people in history do you find the most i...
306119              how can i remove the tan on my forehead
306120    if you are a well known hacker will you be mor...
306121    if your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

In [14]:
# Tokenization
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens[0].split(" ")
#applying function to the column
train_df['question_text']= train_df['question_text'].apply(lambda x: tokenization(x))
# train_df.iloc[1].question_text

In [15]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# train_df.question_text = [word for word in train_df.question_text if not word in stopwords.words('english')]
# train_df
# print(stopwords.words('english'))
stopwords=stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
#applying the function
train_df['question_text']= train_df['question_text'].apply(lambda x:remove_stopwords(x))
train_df

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,"[period, due, wedding, day, stop, pill, option]"
1,9914c62ed3f69684d549,"[many, numbers, higher, million, formed, digit..."
2,8138ae48649e37091a91,"[come, feel, nothing, family, still, love, pet..."
3,981b4753d17ef14d09f7,"[case, collapse, democratic, party, republican..."
4,452e2c705276ba16b7b7,"[mile, naoumoff]"
...,...,...
306117,a352dff4fcc2571815ce,"[anyone, get, update, maruti, suzuki, india, e..."
306118,ad4a8498d97c536c67b9,"[, people, history, find, interesting]"
306119,19784a27b55d4b453fda,"[remove, tan, forehead]"
306120,370191dba26465997879,"[well, known, hacker, prone, hacked]"


In [16]:
# import nltk
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()

In [17]:
# train_df['question_text'] = train_df['question_text'].apply(lambda x: [ps.stem(y) for y in x]) # Stem every word.
# # train_df = train_df.drop(columns=['question_text']) # Get rid of the unstemmed column.

In [18]:
train_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,"[period, due, wedding, day, stop, pill, option]"
1,9914c62ed3f69684d549,"[many, numbers, higher, million, formed, digit..."
2,8138ae48649e37091a91,"[come, feel, nothing, family, still, love, pet..."
3,981b4753d17ef14d09f7,"[case, collapse, democratic, party, republican..."
4,452e2c705276ba16b7b7,"[mile, naoumoff]"
...,...,...
306117,a352dff4fcc2571815ce,"[anyone, get, update, maruti, suzuki, india, e..."
306118,ad4a8498d97c536c67b9,"[, people, history, find, interesting]"
306119,19784a27b55d4b453fda,"[remove, tan, forehead]"
306120,370191dba26465997879,"[well, known, hacker, prone, hacked]"


In [19]:
import nltk
nltk.download('omw-1.4')
  

# Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
train_df['question_text']=train_df['question_text'].apply(lambda x:lemmatizer(x))
# train_df

[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# listOfWords = train_df.question_text[:][0]
# listOfWords.extend(train_df.question_text[:][1])
# listOfWords

In [21]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
words=[]
for i in range(train_df.question_text.shape[0]):
    words.extend(word for word in train_df.question_text[:][i])
# print(words)
data_cv = cv.fit_transform(word for word in words)
# data_cv = train_df.question_text 
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names_out())
# data_dtm.index = train_df.index
data_dtm

Unnamed: 0,aa,aaa,aaabbb,aaalike,aaaq,aaarated,aac,aachen,aadar,aadgar,...,zygote,zyklene,zyklon,zymase,zynga,zynq,zyrtec,zyzz,zz,zzzquil
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999492,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999493,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999494,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1999495,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
