## Importing important libraries and reading the training and testing data

In [57]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt 
# Load the labelled training questions and the unlabelled test questions.
# Both CSV files are read from the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train_df.csv", "test_df.csv")
)

## Preprocessing training data

### Checking for null values, invalid target values and duplicated rows

In [58]:
# train_df = train_df.head(9000)

In [59]:
# Summary statistics of the numeric columns. `target` is the only one shown,
# and since it holds 0/1 labels its mean is the share of rows labelled 1.
train_df.describe()

Unnamed: 0,target
count,1000000.0
mean,0.06187
std,0.240919
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [60]:
# Count the missing values in each column of the training frame.
train_df.isna().sum()

qid              0
question_text    0
target           0
dtype: int64

In [61]:
# List the distinct label values to spot any invalid target.
train_df["target"].unique()

array([0, 1])

In [62]:
# Index labels of training rows that duplicate an earlier row across all
# columns; an empty index means there are no duplicated rows.
train_df.index[train_df.duplicated()]

Int64Index([], dtype='int64')

## Preprocessing testing data

### Checking for null values and duplicated rows

In [63]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?
...,...,...
306117,a352dff4fcc2571815ce,Did anyone get an update on Maruti Suzuki All ...
306118,ad4a8498d97c536c67b9,What 5 people in history do you find the most ...
306119,19784a27b55d4b453fda,How can I remove the tan on my forehead?
306120,370191dba26465997879,"If you are a well known hacker, will you be mo..."


In [64]:
# Count the missing values in each column of the test frame.
test_df.isna().sum()

qid              0
question_text    0
dtype: int64

In [65]:
# Index labels of test rows that duplicate an earlier row across all columns;
# an empty index means there are no duplicated rows.
test_df.index[test_df.duplicated()]

Int64Index([], dtype='int64')

## Cleaning the text
Common data cleaning steps on all text:

1. Make text all lower case

2. Remove punctuation

3. Remove numerical values

4. Remove common non-sensical text (e.g. `\n`)

5. Tokenize text

6. Remove stop words

7. Stemming / lemmatization


### Round 1 of cleaning does the following:
1. Making the text lower case.
2. Removing text in square brackets
3. Removing punctuation marks from the text
4. Removing words containing numbers.

In [66]:
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over one question string.

    Steps, in order:
      1. lower-case the text;
      2. drop every "[...]" span (non-greedy, so each bracket pair is
         removed on its own);
      3. delete every ASCII punctuation character in ``string.punctuation``;
      4. delete every run of word characters containing at least one digit.

    Characters are deleted rather than replaced by a space, so a removal can
    leave double spaces or join fragments ("well-known" -> "wellknown").

    Args:
        text: the raw question as a ``str``.

    Returns:
        The cleaned ``str``.
    '''
    text = text.lower()
    # Raw literals: "\[", "\w" and "\d" are regex escapes, not valid string
    # escapes, and plain literals containing them trigger warnings on newer
    # Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


# Alias used by the cells that call ``.apply(round1)``.
round1 = clean_text_round1

In [67]:
# Run the first cleaning pass over the question column of both frames.
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].apply(round1)


### Round 2 of cleaning does the following:
1. Getting rid of additional punctuation
2. Removing some non-sensical text
3. Removing urls
4. Removing HTML tags

In [68]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass over one question string.

    Steps, in order:
      1. delete typographic quotes and the ellipsis character;
      2. replace each newline with a single space, so the words on either
         side of a line break stay separate tokens;
      3. delete every token that starts at "http" and runs to the next
         whitespace character;
      4. delete every "<...>" span (non-greedy).

    Args:
        text: a question string.

    Returns:
        The cleaned ``str``.
    '''
    text = re.sub('[‘’“”…]', '', text)
    # A space, not an empty string: deleting the newline outright would glue
    # "foo\nbar" into the single word "foobar".
    text = text.replace('\n', ' ')
    text = re.sub(r"http\S+", '', text)
    text = re.sub(r'<.*?>', '', text)
    return text


# Alias used by the cells that call ``.apply(round2)``.
round2 = clean_text_round2

In [69]:
# Run the second cleaning pass over the question column of both frames.
for frame in (train_df, test_df):
    frame["question_text"] = frame["question_text"].apply(round2)
# Last expression of the cell: show the cleaned test questions.
test_df["question_text"]

0         my period is due on my wedding day how can i s...
1         how many numbers higher than a million can be ...
2         how come i feel nothing for my family but stil...
3         in case of collapse of the democratic party wi...
4                                     who is émile naoumoff
                                ...                        
306117    did anyone get an update on maruti suzuki all ...
306118    what  people in history do you find the most i...
306119              how can i remove the tan on my forehead
306120    if you are a well known hacker will you be mor...
306121    if your new enemies be bigger and more dangero...
Name: question_text, Length: 306122, dtype: object

In [70]:
# train_df.question_text

### Tokenization
Tokenization is the process of segmenting running text into sentences and words. In essence, it is the task of cutting a text into pieces called tokens.

Here we use a word tokenizer, i.e. each word is one token.

In [71]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize,word_tokenize
def tokenization(text):
    """Split one question string into word tokens.

    Thin wrapper around NLTK's ``word_tokenize``; its result is returned
    unchanged.
    """
    tokens = word_tokenize(text)
    return tokens
# Replace each question string with its list of word tokens, in both frames.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(tokenization)

# train_df.iloc[1].question_text

[nltk_data] Downloading package punkt to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [72]:
train_df.question_text
# import nltk
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()

0         [what, are, interesting, facts, about, microso...
1         [what, are, those, things, which, are, not, go...
2         [what, should, i, know, to, avoid, being, upso...
3          [how, i, add, any, account, with, payment, bank]
4         [which, multi, level, marketing, products, are...
                                ...                        
999995                     [how, is, cse, at, vit, chennai]
999996    [how, can, we, prevent, a, holocaust, by, robo...
999997    [how, can, i, help, a, student, remember, key,...
999998    [what, is, the, difference, between, lace, clo...
999999    [what, happens, when, you, look, into, a, brok...
Name: question_text, Length: 1000000, dtype: object

In [73]:
# train_df['question_text'] = train_df['question_text'].apply(lambda x: [ps.stem(y) for y in x]) # Stem every word.
# # train_df = train_df.drop(columns=['question_text']) # Get rid of the unstemmed column.

In [74]:
test_df

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,"[my, period, is, due, on, my, wedding, day, ho..."
1,9914c62ed3f69684d549,"[how, many, numbers, higher, than, a, million,..."
2,8138ae48649e37091a91,"[how, come, i, feel, nothing, for, my, family,..."
3,981b4753d17ef14d09f7,"[in, case, of, collapse, of, the, democratic, ..."
4,452e2c705276ba16b7b7,"[who, is, émile, naoumoff]"
...,...,...
306117,a352dff4fcc2571815ce,"[did, anyone, get, an, update, on, maruti, suz..."
306118,ad4a8498d97c536c67b9,"[what, people, in, history, do, you, find, the..."
306119,19784a27b55d4b453fda,"[how, can, i, remove, the, tan, on, my, forehead]"
306120,370191dba26465997879,"[if, you, are, a, well, known, hacker, will, y..."


### Lemmatization
Lemmatization uses morphological analysis to reduce each word to its root form, or “lemma”, more accurately than plain stemming.

In [75]:
import nltk
nltk.download('omw-1.4')
# Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
# One shared WordNet lemmatizer instance, reused for every token.
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatizer(text):
    """Lemmatize every token of one tokenized question.

    Args:
        text: iterable of word tokens.

    Returns:
        A new list holding ``wordnet_lemmatizer.lemmatize(token)`` for each
        token, in the original order.
    """
    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # NLTK documents the default as noun -- confirm this is intended.
    lemmas = []
    for token in text:
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return lemmas
# Lemmatize the token lists of both frames.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(lemmatizer)
# Last expression of the cell: show the lemmatized training frame.
train_df


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/karanjitsaha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[what, are, interesting, fact, about, microsof...",0
1,dc708b74a108d0fc0ad9,"[what, are, those, thing, which, are, not, gon...",0
2,06a27ec5d82dacd8bfe0,"[what, should, i, know, to, avoid, being, upso...",0
3,00cbb6b17e3ceb7c5358,"[how, i, add, any, account, with, payment, bank]",0
4,7c304888973a701585a0,"[which, multi, level, marketing, product, are,...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[how, is, cse, at, vit, chennai]",0
999996,e80edbfc086f7125940f,"[how, can, we, prevent, a, holocaust, by, robo...",0
999997,1506dfad6bd340782a1f,"[how, can, i, help, a, student, remember, key,...",0
999998,b56c60fd407f2f85553c,"[what, is, the, difference, between, lace, clo...",0


In [76]:
train_df["question_text"][1]

['what',
 'are',
 'those',
 'thing',
 'which',
 'are',
 'not',
 'gon',
 'na',
 'happen',
 'ever']

### Stop Words removal
Stop words are commonly occurring words that for some computational processes provide little information or in some cases introduce unnecessary noise and therefore need to be removed.

In [77]:
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# # train_df.question_text = [word for word in train_df.question_text if not word in stopwords.words('english')]
# # train_df
# # print(stopwords.words('english'))
# stopwords=stopwords.words('english')
# def remove_stopwords(text):
#     output= [i for i in text if i not in stopwords]
#     return output
# #applying the function
# train_df['question_text']= train_df['question_text'].apply(lambda x:remove_stopwords(x))
# test_df['question_text']= test_df['question_text'].apply(lambda x:remove_stopwords(x))

In [78]:
# remove_stopwords(lemmatizer(["thing","gonna","happen"]))

In [79]:
train_df

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,"[what, are, interesting, fact, about, microsof...",0
1,dc708b74a108d0fc0ad9,"[what, are, those, thing, which, are, not, gon...",0
2,06a27ec5d82dacd8bfe0,"[what, should, i, know, to, avoid, being, upso...",0
3,00cbb6b17e3ceb7c5358,"[how, i, add, any, account, with, payment, bank]",0
4,7c304888973a701585a0,"[which, multi, level, marketing, product, are,...",0
...,...,...,...
999995,4bd96088d0b5f0f2c4f4,"[how, is, cse, at, vit, chennai]",0
999996,e80edbfc086f7125940f,"[how, can, we, prevent, a, holocaust, by, robo...",0
999997,1506dfad6bd340782a1f,"[how, can, i, help, a, student, remember, key,...",0
999998,b56c60fd407f2f85553c,"[what, is, the, difference, between, lace, clo...",0


In [80]:
def makeSentence(text):
    """Join the tokens of one question back into a space-separated string.

    Args:
        text: iterable of string tokens.

    Returns:
        The tokens joined by single spaces; an empty iterable gives "".
    """
    tokens = list(text)
    separator = ' '
    return separator.join(tokens)

# Collapse every token list back into one string, in both frames.
for frame in (train_df, test_df):
    frame['question_text'] = frame['question_text'].apply(makeSentence)

print(train_df['question_text'])
# print(makeSentence(train_df.question_text[0]))
# ' '.join(list(train_df.question_text)[0])

0         what are interesting fact about microsoft history
1         what are those thing which are not gon na happ...
2         what should i know to avoid being upsold when ...
3                   how i add any account with payment bank
4         which multi level marketing product are actual...
                                ...                        
999995                            how is cse at vit chennai
999996    how can we prevent a holocaust by robot ai or ...
999997    how can i help a student remember key step and...
999998    what is the difference between lace closure la...
999999      what happens when you look into a broken mirror
Name: question_text, Length: 1000000, dtype: object


## Doing train test split of the train_df.csv

In [81]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the labelled questions for evaluation; the fixed seed keeps
# the split reproducible.
questions = train_df.question_text
labels = train_df.target.values
X_train, X_test, y_train, y_test = train_test_split(
    questions,
    labels,
    test_size=0.15,
    random_state=0,
)
X_train.shape  # not the last expression of the cell, so nothing is displayed

# Cleaned questions of the unlabelled test set.
test_x = test_df.question_text

## Vectorization of the dataset

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Import modules for evaluation purposes
# Import libraries for predcton
from sklearn import metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,roc_curve,auc,f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# NOTE: despite its name, `tfidf` is a CountVectorizer (raw term counts), not a
# TF-IDF vectorizer. The name is kept because other cells refer to it.
tfidf = CountVectorizer()

# The Series are passed to the vectorizer directly. Every cleaning step above
# returns a str, so no cast is needed, and `.values.astype('U')` would copy
# each corpus into a fixed-width unicode array sized to its longest question.

# Learn the vocabulary on the training split and turn it into a count matrix.
train = tfidf.fit_transform(X_train)
# Encode the held-out split with the vocabulary learned above.
test = tfidf.transform(X_test)

# Encode the unlabelled test questions the same way.
test_df_matrix = tfidf.transform(test_x)
print(test_df_matrix.shape)
print(train.shape)

: 

: 

In [None]:
# tfidf.vocabulary_

{'can': 8630,
 'you': 65741,
 'show': 53526,
 'that': 58833,
 'the': 58847,
 'curve': 13703,
 'ha': 24765,
 'three': 59114,
 'inflection': 28633,
 'point': 45078,
 'on': 41602,
 'they': 58982,
 'all': 1656,
 'lie': 33447,
 'one': 41623,
 'straight': 56381,
 'line': 33615,
 'how': 26808,
 'do': 16611,
 'have': 25452,
 'sex': 52883,
 'orally': 41905,
 'what': 64364,
 'is': 29744,
 'life': 33458,
 'and': 2268,
 'death': 14353,
 'cause': 9253,
 'toilet': 59568,
 'to': 59503,
 'keep': 31392,
 'running': 50879,
 'continuously': 12311,
 'best': 5884,
 'way': 63996,
 'start': 55951,
 'out': 42214,
 'blogging': 6742,
 'make': 34937,
 'money': 37848,
 'why': 64602,
 'so': 54685,
 'many': 35331,
 'people': 43774,
 'ignore': 27658,
 'fact': 20216,
 'think': 59020,
 'men': 36427,
 'woman': 64968,
 'could': 12793,
 'use': 62297,
 'technology': 58367,
 'change': 9727,
 'dating': 14191,
 'for': 21682,
 'better': 5940,
 'should': 53502,
 'younger': 65750,
 'person': 43990,
 'always': 1905,
 'greet': 24

## Applying Multinomial Naive Bayes to our dataset

In [None]:
# model = MultinomialNB()
# model.fit(train, y_train)
# print("train f1 score:", metrics.f1_score(y_train,model.predict(train)))
# print("test f1 score:", metrics.f1_score(y_test,model.predict(test)))


In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sn

# # Create the confussion matrix
# def plot_confussion_matrix(y_test, y_pred):
#     ''' Plot the confussion matrix for the target labels and predictions '''
#     cm = confusion_matrix(y_test, y_pred)

#     # Create a dataframe with the confussion matrix values
#     df_cm = pd.DataFrame(cm, range(cm.shape[0]),
#                   range(cm.shape[1]))
#     #plt.figure(figsize = (10,7))
#     # Plot the confussion matrix
#     sn.set(font_scale=1.4) #for label size
#     sn.heatmap(df_cm, annot=True,fmt='.0f',annot_kws={"size": 10})# font size
#     plt.show()

# # ROC Curve
# # plot no skill
# # Calculate the points in the ROC curve
# def plot_roc_curve(y_test, y_pred):
#     ''' Plot the ROC curve for the target labels and predictions'''
#     fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
#     roc_auc= auc(fpr,tpr)

#     plt.title('Receiver Operating Characteristic')
#     plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
#     plt.legend(loc = 'lower right')
#     plt.plot([0, 1], [0, 1],'r--')
#     plt.xlim([0, 1])
#     plt.ylim([0, 1])
#     plt.ylabel('True Positive Rate')
#     plt.xlabel('False Positive Rate')
#     plt.show()

In [None]:
# # Predicting the Test set results
# y_pred = model.predict(test)

# print(metrics.classification_report(y_test, y_pred,  digits=5))
# # plot_confussion_matrix(y_test, y_pred)
# # plot_roc_curve(y_test, y_pred)

# test_y_pred = model.predict(test_df_matrix)
# test_y_pred

In [None]:
# Test_DF_TARGET = pd.DataFrame(test_y_pred,columns=['target'])
# Test_DF_TARGET

In [None]:
# TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
# TEST_DF_QID

In [None]:
# TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
# # TEST_DF = TEST_DF.drop('', axis=1)
# # TEST_DF.reset_index(drop=True)
# TEST_DF

In [None]:
# TEST_DF.to_csv("sample_submission.csv",index=False)

## Applying Logistic Regression to our dataset

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the count matrix.
model = LogisticRegression(solver="lbfgs", max_iter=50000)
model.fit(train, y_train)

# F1 on the data the model was fitted on, then on the held-out split.
y_pred = model.predict(train)
train_f1 = f1_score(y_train, y_pred)
test_f1 = f1_score(y_test, model.predict(test))
print("train f1 score: ", train_f1)
print("test f1 score: ", test_f1)

# Predicted labels for the unlabelled test questions.
test_y_pred = model.predict(test_df_matrix)

train f1 score:  0.5983304971715036
test f1 score:  0.5374478511356864


In [None]:
# Build the submission: one row per test question, pairing each `qid` with
# its predicted label.
Test_DF_TARGET = pd.DataFrame(test_y_pred, columns=['target'])
TEST_DF_QID = test_df[['qid']]
# Pair ids and predictions by position. An inner join on the index would
# silently drop rows if the two index label sets ever differed; assign()
# raises on a length mismatch instead.
TEST_DF = TEST_DF_QID.assign(target=test_y_pred)
TEST_DF.to_csv("sample_submission.csv", index=False)

## Applying SVM classifier to our dataset

In [None]:
# from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
# from sklearn.svm import SVC

# # Define the parameters to tune
# parameters = { 
#     'C': [1.0, 10],
#     'gamma': [1, 'auto', 'scale']
# }
# # Tune yyperparameters  using Grid Search and a SVM model
# model = GridSearchCV(SVC(kernel='rbf'), parameters, cv=5, n_jobs=-1).fit(train, train_df.target)
# # model = RandomizedSearchCV(SVC(kernel='rbf'), parameters, cv=5, n_jobs=-1).fit(train, train_df.target)

# print("train score:", model.score(train, train_df.target))

In [None]:
# # Predicting the Test set results
# y_pred = model.predict(train)

# print(metrics.classification_report(train_df.target, y_pred,  digits=5))
# plot_confussion_matrix(train_df.target, y_pred)
# plot_roc_curve(train_df.target, y_pred)

## Applying a LightGBM classifier to our dataset

In [None]:
# from lightgbm import LGBMClassifier
# from sklearn.metrics import f1_score

# def f1_metric(ytrue,preds):
#     ''' Return the F1 Score value for the preds and true values, ytrue '''
#     return 'f1_score', f1_score((preds>=0.5).astype('int'), ytrue, average='macro'), True

# params = {
#     'learning_rate': 0.1,
#     'n_estimators': 100,
#     'colsample_bytree': 0.5,
#     'metric': 'f1_score',
#     # 'boosting_type':'goss',
#     # 'baggeng_freq':1,
#     # 'bagging_fraction' : float(0.5),
# }

# full_clf = LGBMClassifier(**params)

# # Fit or train the xgboost model
# full_clf.fit(train.astype(np.float32), y_train, eval_set=[(train.astype(np.float32), y_train), (test.astype(np.float32), y_test)],
#              verbose=400, eval_metric=f1_metric)
# #Show the results
# print("train f1 score:", metrics.f1_score(y_train,full_clf.predict(train)))
# print("test f1 score:", metrics.f1_score( y_test,full_clf.predict(test)))


In [None]:
# # Predicting the Test set results
# Y_pred = full_clf.predict(test.astype(np.float32))

# print(metrics.classification_report(y_test, y_pred,  digits=5))
# # plot_confussion_matrix(y_test, y_pred)
# # plot_roc_curve(y_test, y_pred)

In [None]:
# # Predicting the Test set results
# test_y_pred = full_clf.predict(test_df_matrix.astype(np.float32))

# # print(metrics.classification_report(test_df_matrix, test_y_pred,  digits=5))
# # plot_confussion_matrix(y_test, y_pred)
# # plot_roc_curve(y_test, y_pred)

In [None]:
# Test_DF_TARGET = pd.DataFrame(test_y_pred,columns=['target'])
# Test_DF_TARGET

In [None]:
# TEST_DF_QID = pd.DataFrame(test_df ,columns=['qid'])
# TEST_DF_QID

In [None]:
# TEST_DF = pd.concat([TEST_DF_QID, Test_DF_TARGET], axis=1, join='inner')
# # TEST_DF = TEST_DF.drop('', axis=1)
# # TEST_DF.reset_index(drop=True)
# TEST_DF

In [None]:
# TEST_DF.to_csv("sample_submission.csv",index=False)
# TEST_DF.target.value_counts()