In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Baseline Classifier

This notebook implements the baseline classification model on the pre-processed training data (with and without lemmatization). The text is vectorized with a bag-of-words approach weighted by TF-IDF, and the Multinomial Naive Bayes classifier implementation from scikit-learn is then used to develop the baseline classifier.

We implement the following steps to develop the baseline model:
1. Apply Count Vectorizer
2. TF-IDF Transformation
3. Apply Multinomial Naive Bayes Classifier on data with and without lemmatization
4. Test on test set
5. View Accuracy Metrics
6. Pickle the models

### To do:

- [ ] Provide description for all processes and reason to perform them
- [ ] Describe Directory Structure
- [x] Load Dataset into pandas
- [x] Apply Count Vectorizer
- [x] TF-IDF Transformation
- [x] Apply Multinomial Naive Bayes Classifier on data with and without lemmatization
- [x] Test on test set
- [x] View Accuracy Metrics
- [x] Pickle the models

### Import Statements and File Paths

In [None]:
import pandas as pd
import numpy as np
from numpy import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from IPython.display import display, HTML
import re
import pickle
import scipy

# Root folder of the project data on the mounted Google Drive.
data_dir = "/content/drive/MyDrive/Quora-Data/"

# Pre-processed CSVs produced with lemmatization.
train_csv_with_lemmatization = f"{data_dir}pre-processing/train_EDA1_preprocessing_with_lemma_EDA2.csv"
test_csv_with_lemmatization = f"{data_dir}pre-processing/test_EDA1_preprocessing_with_lemma_EDA2.csv"

# Pre-processed CSVs produced without lemmatization.
train_csv_without_lemmatization = f"{data_dir}pre-processing/train_EDA1_preprocessing_without_lemma_EDA2.csv"
test_csv_without_lemmatization = f"{data_dir}pre-processing/test_EDA1_preprocessing_without_lemma_EDA2.csv"

# Destinations for the pickled Multinomial Naive Bayes models.
model_with_lemmatization_data = f"{data_dir}Models/MultinomialNB_lemmatized_data.sav"
model_without_lemmatization_data = f"{data_dir}Models/MultinomialNB_non_lemmatized_data.sav"

### Load datasets into pandas dataframes

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; data_dir points
# below this mount point, so this must run before any file is read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def data_loader(list_X, file_Y):
    """Read feature CSVs and the label column into a ``(features, labels)`` pair.

    Parameters
    ----------
    list_X : list of str
        Paths of feature CSV files. The first file is the base frame and every
        following file is joined onto it on the row index.
    file_Y : str
        Path of a CSV file containing a ``Y`` column.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is the joined frame without
        the ``Y``, ``qid1``, ``qid``, ``qid2`` and ``id`` columns (remaining
        columns come back in the sorted order produced by
        ``Index.difference``) and ``labels`` is the ``Y`` column of ``file_Y``.
    """
    features = pd.read_csv(list_X[0])
    for extra_path in list_X[1:]:
        features = features.join(pd.read_csv(extra_path))

    # Identifier and label columns must not leak into the feature matrix.
    excluded = ['Y', 'qid1', 'qid', 'qid2', 'id']
    selected = features[features.columns.difference(excluded)]
    labels = pd.read_csv(file_Y)['Y']
    return (selected, labels)

# Load the selected hand-crafted features and the duplicate labels for training.
X_train, Y_train = data_loader(
    list_X=[data_dir + "pre-processing/features1_2_selected_train.csv"],
    file_Y=data_dir + "pre-processing/train.csv",
)

In [None]:
# Read the lemmatized (df1) and non-lemmatized (df2) training sets, then
# render both frames.
df1 = pd.read_csv(train_csv_with_lemmatization)
df2 = pd.read_csv(train_csv_without_lemmatization)
display(df1, df2)

Unnamed: 0,id,qid1,qid2,q1_orig,q2_orig,num_word_q1,num_word_q2,num_char_q1,num_common_words,first_word_same,last_word_same,first_2_same,last_2_same,common_word_ratio,num_sent_diff,num_word_diff_ratio,question1,question2,num_word_q1_proc,num_word_q2_proc,num_char_q1_proc,num_common_words_proc,first_word_same_proc,last_word_same_proc,first_2_same_proc,last_2_same_proc,common_word_ratio_proc,num_sent_diff_proc,num_word_diff_ratio_proc,lcs_ratio_max,lcs_ratio_min,fuzz_rat,fuzz_part_rat,fuzz_rat_proc,fuzz_part_rat_proc,Y
0,394437,434361,527326,how do i install apk files on my windows phone?,"how can i backup a (.xap, / . appx) file insta...",10,15,10,6,1,1,0.5,1.0,0.230769,0.40,0.192308,install apk file window phone ?,"backup ( .xap , / . appx ) file installed wind...",6,13,6,4,0,1,0.0,1.0,0.230769,0.40,0.192308,0.267857,0.468750,0.61,0.74,0.53,0.84,0
1,373988,8023,10567,what were the major effects of the cambodia ea...,what were the major effects of the cambodia ea...,21,21,21,16,1,0,1.0,0.5,0.372093,0.00,0.000000,"major effect cambodia earthquake , effect comp...","major effect cambodia earthquake , effect comp...",11,11,11,7,1,1,1.0,0.5,0.372093,0.00,0.000000,0.657895,0.675676,0.94,0.94,0.90,0.89,1
2,183101,280083,280084,how is stack exchange better than quora?,is stack exchange better than quora? why or wh...,7,10,7,6,0,0,0.5,0.0,0.333333,0.25,0.166667,stack exchange better quora ?,stack exchange better quora ? ?,5,6,5,5,1,1,1.0,0.5,0.333333,0.25,0.166667,0.935484,0.966667,0.78,0.90,0.97,1.00,1
3,43553,78324,78325,how to prevent from pimples to break out insid...,how can i avoid getting pimples inside my nose?,11,9,11,4,1,1,0.5,0.5,0.190476,0.00,0.095238,prevent pimple break inside nose ?,avoid getting pimple inside nose ?,6,6,6,4,0,1,0.0,1.0,0.190476,0.00,0.095238,0.411765,0.400000,0.54,0.49,0.68,0.74,1
4,213919,319381,46044,what are some good books and online courses to...,what is a good online course on probability an...,21,10,21,6,1,0,0.5,0.0,0.187500,0.00,0.343750,good book online course follow grab concept st...,good online course probability statistic ?,11,6,11,6,1,1,0.5,0.5,0.187500,0.00,0.343750,0.197368,0.348837,0.50,0.49,0.58,0.60,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323427,346872,475259,475260,what is the benefit of using const for declari...,what is the benefit of using enum to declare a...,10,11,10,6,1,0,1.0,0.0,0.272727,0.00,0.045455,benefit using const declaring constant ?,benefit using enum declare constant ?,6,6,6,4,1,1,1.0,1.0,0.272727,0.00,0.045455,0.350000,0.368421,0.82,0.84,0.86,0.81,0
323428,143678,14804,40458,how can i get a complete list of all old gmail...,how do i find my own gmail accounts list?,15,9,15,5,1,0,0.5,0.0,0.200000,0.00,0.240000,get complete list old gmail account name ?,find gmail account list ?,8,5,8,4,0,1,0.0,0.5,0.200000,0.00,0.240000,0.380952,0.615385,0.54,0.54,0.57,0.72,1
323429,128137,14317,34001,where can i found modern colours and textures ...,"where can i get wide range of floor tile, wall...",13,16,13,8,1,1,1.0,1.0,0.266667,0.00,0.100000,found modern colour texture floor tile sydney ?,"get wide range floor tile , wall tile porcelai...",8,12,8,4,0,1,0.0,1.0,0.266667,0.00,0.100000,0.229508,0.291667,0.56,0.50,0.48,0.43,1
323430,323891,449906,449907,support@ 1877#778#89.69 acer technical support...,@support@ 1877#778#89.69 compaq technical supp...,7,7,7,5,0,1,0.5,1.0,0.333333,0.00,0.000000,support @ 1877 # 778 # 89.69 acer technical su...,@ support @ 1877 # 778 # 89.69 compaq technica...,13,14,13,10,0,1,1.0,1.0,0.333333,0.00,0.000000,0.471429,0.492537,0.93,0.92,0.93,0.92,0


Unnamed: 0,id,qid1,qid2,q1_orig,q2_orig,num_word_q1,num_word_q2,num_char_q1,num_common_words,first_word_same,last_word_same,first_2_same,last_2_same,common_word_ratio,num_sent_diff,num_word_diff_ratio,question1,question2,num_word_q1_proc,num_word_q2_proc,num_char_q1_proc,num_common_words_proc,first_word_same_proc,last_word_same_proc,first_2_same_proc,last_2_same_proc,common_word_ratio_proc,num_sent_diff_proc,num_word_diff_ratio_proc,lcs_ratio_max,lcs_ratio_min,fuzz_rat,fuzz_part_rat,fuzz_rat_proc,fuzz_part_rat_proc,Y
0,394437,434361,527326,how do i install apk files on my windows phone?,"how can i backup a (.xap, / . appx) file insta...",10,15,10,6,1,1,0.5,1.0,0.230769,0.40,0.192308,install apk file window phone ?,"backup ( .xap , / . appx ) file installed wind...",6,13,6,4,0,1,0.0,1.0,0.230769,0.40,0.192308,0.267857,0.468750,0.61,0.74,0.53,0.84,0
1,373988,8023,10567,what were the major effects of the cambodia ea...,what were the major effects of the cambodia ea...,21,21,21,16,1,0,1.0,0.5,0.372093,0.00,0.000000,"major effect cambodia earthquake , effect comp...","major effect cambodia earthquake , effect comp...",11,11,11,7,1,1,1.0,0.5,0.372093,0.00,0.000000,0.657895,0.675676,0.94,0.94,0.90,0.89,1
2,183101,280083,280084,how is stack exchange better than quora?,is stack exchange better than quora? why or wh...,7,10,7,6,0,0,0.5,0.0,0.333333,0.25,0.166667,stack exchange better quora ?,stack exchange better quora ? ?,5,6,5,5,1,1,1.0,0.5,0.333333,0.25,0.166667,0.935484,0.966667,0.78,0.90,0.97,1.00,1
3,43553,78324,78325,how to prevent from pimples to break out insid...,how can i avoid getting pimples inside my nose?,11,9,11,4,1,1,0.5,0.5,0.190476,0.00,0.095238,prevent pimple break inside nose ?,avoid getting pimple inside nose ?,6,6,6,4,0,1,0.0,1.0,0.190476,0.00,0.095238,0.411765,0.400000,0.54,0.49,0.68,0.74,1
4,213919,319381,46044,what are some good books and online courses to...,what is a good online course on probability an...,21,10,21,6,1,0,0.5,0.0,0.187500,0.00,0.343750,good book online course follow grab concept st...,good online course probability statistic ?,11,6,11,6,1,1,0.5,0.5,0.187500,0.00,0.343750,0.197368,0.348837,0.50,0.49,0.58,0.60,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323427,346872,475259,475260,what is the benefit of using const for declari...,what is the benefit of using enum to declare a...,10,11,10,6,1,0,1.0,0.0,0.272727,0.00,0.045455,benefit using const declaring constant ?,benefit using enum declare constant ?,6,6,6,4,1,1,1.0,1.0,0.272727,0.00,0.045455,0.350000,0.368421,0.82,0.84,0.86,0.81,0
323428,143678,14804,40458,how can i get a complete list of all old gmail...,how do i find my own gmail accounts list?,15,9,15,5,1,0,0.5,0.0,0.200000,0.00,0.240000,get complete list old gmail account name ?,find gmail account list ?,8,5,8,4,0,1,0.0,0.5,0.200000,0.00,0.240000,0.380952,0.615385,0.54,0.54,0.57,0.72,1
323429,128137,14317,34001,where can i found modern colours and textures ...,"where can i get wide range of floor tile, wall...",13,16,13,8,1,1,1.0,1.0,0.266667,0.00,0.100000,found modern colour texture floor tile sydney ?,"get wide range floor tile , wall tile porcelai...",8,12,8,4,0,1,0.0,1.0,0.266667,0.00,0.100000,0.229508,0.291667,0.56,0.50,0.48,0.43,1
323430,323891,449906,449907,support@ 1877#778#89.69 acer technical support...,@support@ 1877#778#89.69 compaq technical supp...,7,7,7,5,0,1,0.5,1.0,0.333333,0.00,0.000000,support @ 1877 # 778 # 89.69 acer technical su...,@ support @ 1877 # 778 # 89.69 compaq technica...,13,14,13,10,0,1,1.0,1.0,0.333333,0.00,0.000000,0.471429,0.492537,0.93,0.92,0.93,0.92,0


### Define Pipeline and Train model

In [None]:
def tf_idf_transformer(X, return_vectorizer=False):
    """Fit a word-level TF-IDF vectorizer on ``X`` and transform ``X`` with it.

    Parameters
    ----------
    X : iterable of str
        Documents to fit the vectorizer on and to transform.
    return_vectorizer : bool, default False
        When True, the fitted ``TfidfVectorizer`` is returned as well, so the
        same vocabulary and IDF weights can later be applied to other data
        (e.g. the test set) through ``vectorizer.transform``. The default
        keeps the original single-value return.

    Returns
    -------
    X_tr
        Result of ``TfidfVectorizer.fit_transform(X)``.
    (X_tr, vectorizer)
        Returned instead when ``return_vectorizer`` is True.
    """
    vectorizer = TfidfVectorizer(analyzer='word')
    X_tr = vectorizer.fit_transform(X)
    if return_vectorizer:
        return X_tr, vectorizer
    return X_tr


In [None]:
# Build one document per question pair. The questions are joined with a space
# so the last token of question1 is not fused with the first token of
# question2, and a missing question is treated as empty text; otherwise
# NaN + str is NaN and astype("U") would turn the whole document into "nan".
combined_features_lemmatized = df1['question1'].fillna('') + ' ' + df1['question2'].fillna('')
df1 = df1.assign(combined_features=combined_features_lemmatized)
combined_features_not_lemmatized = df2['question1'].fillna('') + ' ' + df2['question2'].fillna('')
df2 = df2.assign(combined_features=combined_features_not_lemmatized)

# TF-IDF matrices for the lemmatized (df1) and non-lemmatized (df2) documents.
df1_tfidf = tf_idf_transformer(df1.combined_features.values.astype("U"))
df2_tfidf = tf_idf_transformer(df2.combined_features.values.astype("U"))

In [None]:
# Rescale every feature column to [0, 1]. MultinomialNB rejects negative
# feature values, and zero-mean standardisation always produces them, so a
# min-max rescale is used to keep the features valid for the classifier.
X_train = pd.DataFrame(MinMaxScaler().fit_transform(X_train), columns=X_train.columns, index=X_train.index)

In [None]:
# Display the hand-crafted feature frame after scaling.
X_train

Unnamed: 0,common_word_ratio,common_word_ratio_proc,first_2_same_proc,fuzz_part_rat,fuzz_part_rat_proc,fuzz_rat,fuzz_rat_proc,last_2_same,last_2_same_proc,last_word_same,last_word_same_proc,lcs_ratio_max,lcs_ratio_max_proc,lcs_ratio_min,lcs_ratio_min_proc
0,0.211293,-0.270399,-0.999132,0.538039,0.616293,0.095089,-0.200991,1.792307,1.559240,1.511933,1.358173,-0.040662,-0.468858,0.247894,-0.274461
1,1.400809,0.248841,1.515165,1.608844,1.036266,1.614730,1.281256,0.533814,0.278353,-0.661405,-0.736283,2.221685,1.038586,1.789216,0.655057
2,1.074570,1.739250,1.515165,1.394683,1.456240,0.877934,1.760806,-0.724678,1.559240,-0.661405,1.358173,1.834463,2.326766,2.191386,1.681034
3,-0.127851,0.327513,-0.999132,-0.800468,0.149655,-0.227259,0.278560,0.533814,1.559240,1.511933,1.358173,-0.692668,-0.088571,-0.854987,-0.531155
4,-0.152902,0.654497,0.258017,-0.800468,-0.410309,-0.411458,-0.157395,-0.724678,0.278353,-0.661405,-0.736283,-0.848221,-0.754420,-0.579267,-0.522988
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323427,0.564452,0.327513,1.515165,1.073441,0.522965,1.062133,1.106874,-0.724678,0.278353,-0.661405,1.358173,0.885004,-0.113995,0.562312,-0.438150
323428,-0.047690,0.140666,-0.999132,-0.532767,0.242983,-0.227259,-0.244586,-0.724678,0.278353,-0.661405,-0.736283,-0.310055,0.008043,-0.001567,0.584905
323429,0.513440,-0.544442,-0.999132,-0.746928,-1.250256,-0.135160,-0.549754,1.792307,1.559240,1.511933,1.358173,-0.458960,-0.656386,-0.654807,-0.829579
323430,1.074570,1.217263,1.515165,1.501764,1.036266,1.568681,1.455637,1.792307,1.559240,1.511933,1.358173,0.962609,0.458906,0.632095,0.064708


In [None]:
# Show the repr (shape / stored elements) of the TF-IDF matrix built from df1.
df1_tfidf

<323432x73209 sparse matrix of type '<class 'numpy.float64'>'
	with 2598617 stored elements in Compressed Sparse Row format>

In [None]:
# Wrap the scaled feature values in a COO sparse matrix so they can be stacked
# next to the TF-IDF matrix.
df1_sm = scipy.sparse.coo_matrix(X_train.values)
# Non-lemmatized counterpart (disabled):
#df2_sm = scipy.sparse.coo_matrix((df2.drop(["id", "qid1", "qid2", "question1", "question2", "q1_orig", "q2_orig", "combined_features", "Y"], axis=1).values))

In [None]:
# Column-wise concatenation of the hand-crafted features and the TF-IDF
# features. hstack returns COO by default, which cannot be row-indexed; CSR is
# requested so the train/test split and the classifier can slice rows without
# converting the matrix again.
df1_final = scipy.sparse.hstack((df1_sm, df1_tfidf), format="csr")
# Non-lemmatized counterpart (disabled):
#df2_final = scipy.sparse.hstack((df2_sm, df2_tfidf))

In [None]:
# Attach the TF-IDF output of each dataset to its frame as a `tfidf` column.
# NOTE(review): df1_tfidf / df2_tfidf are the return values of
# tf_idf_transformer (a scipy sparse matrix, per the repr shown for
# df1_tfidf) — confirm that assigning the matrix as a single column yields the
# intended per-row values. The feature matrix used for training is built
# separately with scipy.sparse.hstack (df1_final).
df1_tf = df1.assign(tfidf=df1_tfidf)
df2_tf = df2.assign(tfidf=df2_tfidf)

In [None]:
# Remove the identifier, raw-text and combined-text columns from both frames.
# NOTE(review): the recorded output of this cell is a NameError, and
# df1_tf_drop / df2_tf_drop are not referenced by any other cell in view —
# presumably a leftover experiment; confirm before relying on it.
df1_tf_drop = df1_tf.drop(["id", "qid1", "qid2", "question1", "question2", "q1_orig", "q2_orig", "combined_features"], axis=1)
df2_tf_drop = df2_tf.drop(["id", "qid1", "qid2", "question1", "question2", "q1_orig", "q2_orig", "combined_features"], axis=1)

NameError: ignored

In [None]:
# Hold out 10% of the lemmatized rows for evaluation; the seed is fixed so the
# split is repeatable.
x1_train, x1_test, y1_train, y1_test = train_test_split(
    df1_final, df1["Y"], test_size=0.1, random_state=10
)
# Non-lemmatized split (disabled):
# x2_train, x2_test, y2_train, y2_test = train_test_split(
#     df2_final, df2["Y"], test_size=0.1, random_state=10)

In [None]:
# Class names used in the classification report, in label order.
is_duplicate = ['0', '1']

def naiveBayes(x_train,y_train,x_test,y_test, lemmatized_flag, t=1):
    """Fit, persist and evaluate a Multinomial Naive Bayes model.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``fit``.
    x_test, y_test
        Held-out features and labels used for the printed evaluation.
    lemmatized_flag : int
        ``0`` saves the model to ``model_without_lemmatization_data``; any
        other value saves it to ``model_with_lemmatization_data``.
    t : float, default 1
        Smoothing parameter forwarded to ``MultinomialNB(alpha=t)``.

    Returns
    -------
    tuple
        ``(pipeline, accuracy)``: the fitted single-step pipeline and its
        accuracy on the test data.

    Notes
    -----
    Every call overwrites the pickle file selected by ``lemmatized_flag``.
    Accuracy, the classification report and the confusion matrix are printed.
    """
    nb = Pipeline([('clf', MultinomialNB(alpha=t),)])
    nb.fit(x_train,y_train)

    if(lemmatized_flag == 0):
        model_path = model_without_lemmatization_data
    else:
        model_path = model_with_lemmatization_data
    # Context manager guarantees the file is flushed and closed after dumping.
    with open(model_path, 'wb') as model_file:
        pickle.dump(nb, model_file)

    y_pred = nb.predict(x_test)
    # scikit-learn metrics take (y_true, y_pred); compute the score only once.
    accuracy = accuracy_score(y_test, y_pred)
    print("Naive Bayes: "+str(accuracy))
    print(classification_report(y_test, y_pred,target_names=is_duplicate))
    print(confusion_matrix(y_test, y_pred))
    return nb, accuracy

# Lemmatized data: fit, save and evaluate the baseline with alpha=0.01.
print("Dataset with Lemmatization:")
nb_lem = naiveBayes(
    x_train=x1_train, y_train=y1_train,
    x_test=x1_test, y_test=y1_test,
    lemmatized_flag=1, t=0.01,
)

print("_________________________________________________________________________")
print("Dataset without Lemmatization:")
# The non-lemmatized run is currently disabled:
#nb_wlem = naiveBayes(x2_train,y2_train,x2_test,y2_test, 0)

Dataset with Lemmatization:


ValueError: ignored

In [None]:
# Load previously exported class probabilities (file has no header row).
# The path is built from data_dir, like every other path in this notebook, so
# relocating the data folder only requires changing data_dir.
xgb_proba = pd.read_csv(data_dir + "Models/XGRF_probas_train_0.74.csv", header=None)

FileNotFoundError: ignored

In [None]:
# Display the loaded probabilities; requires the read_csv cell that defines
# xgb_proba to have run successfully.
xgb_proba

NameError: ignored

In [None]:
# Class probabilities of the fitted pipeline (nb_lem is the (model, accuracy)
# tuple returned by naiveBayes) for every row of the full feature matrix.
# The frame reuses df1's index explicitly because it is used below as a
# boolean mask on df1, which aligns on index labels.
nb_proba = pd.DataFrame(nb_lem[0].predict_proba(df1_final), index=df1.index)

In [None]:
# Question pairs labelled as duplicates (Y == 1) whose value in column 1 of
# nb_proba exceeds 0.8. A single combined mask with .loc replaces the chained
# boolean indexing, which made pandas re-index the second mask and emit a
# UserWarning.
df1.loc[(nb_proba[1] > 0.8) & (df1["Y"] == 1), ["q1_orig", "q2_orig"]]

  """Entry point for launching an IPython kernel.


Unnamed: 0,q1_orig,q2_orig
1,what were the major effects of the cambodia ea...,what were the major effects of the cambodia ea...
2,how is stack exchange better than quora?,is stack exchange better than quora? why or wh...
8,what was the significance of the battle of som...,what was the significance of the battle of som...
14,how can you avoid developing limerence?,how do i get over limerence?
15,why is marijuana illegal?,why is marijuana still illegal?
...,...,...
323415,what is the shaken baby syndrome?,what is shaken baby syndrome?
323416,what do you think of quanzhou?,what is your review of quanzhou?
323417,how do i lose weight fast?,what is the easiest way to lose weight faster?
323426,what are the major differences between chinese...,what is the difference between chinese and wes...


In [None]:
# Question pairs labelled as duplicates (Y == 1) whose value in column 1 of
# nb_proba lies strictly between 0.5 and 0.6. A single combined mask with .loc
# replaces the chained boolean indexing, which made pandas re-index the later
# masks and emit UserWarnings.
df1.loc[(nb_proba[1] > 0.5) & (nb_proba[1] < 0.6) & (df1["Y"] == 1), ["q1_orig", "q2_orig"]]

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


Unnamed: 0,q1_orig,q2_orig
80,how can i make sure that no one can see my fri...,how do i hide friends list on facebook?
108,what is actual meaning of life?,what's the meaning of life? (mathematically an...
159,is universal health care good? why or why not?,should the u.s. implement a universal health c...
364,what is the best pen to write the upsc mains e...,which is the best pen to use in upsc exams?
518,what are donald trump's chances against hillar...,who will win the election? donald trump or hil...
...,...,...
323307,why are so many questions posted to quora that...,why do so many people ask questions on quora t...
323325,what languages should be learnt to develop an ...,what computer languages should i learn to be a...
323329,what is the most probable (the most evidence) ...,what is the most probable base structure of th...
323376,what's the best solution to the kashmir issue?,what is the solution of kashmir conflict?


In [None]:
# Question pairs NOT labelled as duplicates (Y != 1) whose value in column 1
# of nb_proba lies strictly between 0.5 and 0.6. A single combined mask with
# .loc replaces the chained boolean indexing, which made pandas re-index the
# later masks and emit UserWarnings.
df1.loc[(nb_proba[1] > 0.5) & (nb_proba[1] < 0.6) & (df1["Y"] != 1), ["q1_orig", "q2_orig"]]

  """Entry point for launching an IPython kernel.
  """Entry point for launching an IPython kernel.


Unnamed: 0,q1_orig,q2_orig
111,what can you say about filipino people?,what do you not know about filipinos?
139,what are the advantages and disadvantages of v...,what are the advantages and disadvantages of t...
193,what if a google search was case-sensitive?,is google search case sensitive?
225,why is there no option to +1 quora posts?,why does quora have an option of answering one...
228,what universities does federal signal recruit ...,what universities does third federal savings r...
...,...,...
323015,what if china attacked the usa?,what will happen if the usa attacks china?
323111,what do people misunderstand about space travel?,what does science say about space travel?
323123,which usa visa should i apply for?,where should i apply for usa visa?
323338,why did war between india and pakistan happene...,what will happen if there is a war between ind...


In [None]:
# Display the label column of the lemmatized training frame.
df1["Y"]

0         0
1         1
2         1
3         1
4         0
         ..
323427    0
323428    1
323429    1
323430    0
323431    0
Name: Y, Length: 323432, dtype: int64

In [None]:
# Sweep the smoothing strength; each call prints its own evaluation report.
alpha_accuracy = {}
for alpha in [0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 1, 10]:
    print(alpha)
    _, alpha_accuracy[alpha] = naiveBayes(x1_train,y1_train,x1_test,y1_test, 1, t=alpha)

# naiveBayes() overwrites the pickled model on every call, so refit with the
# best-scoring alpha last. Otherwise nb_lem and the saved file would hold the
# model for the final grid value (alpha=10) regardless of its accuracy.
best_alpha = max(alpha_accuracy, key=alpha_accuracy.get)
print("Best alpha: " + str(best_alpha))
nb_lem = naiveBayes(x1_train,y1_train,x1_test,y1_test, 1, t=best_alpha)

0.0001
Naive Bayes: 0.7383749690823646
              precision    recall  f1-score   support

           0       0.84      0.73      0.78     20341
           1       0.62      0.76      0.68     12003

    accuracy                           0.74     32344
   macro avg       0.73      0.74      0.73     32344
weighted avg       0.76      0.74      0.74     32344

[[14776  5565]
 [ 2897  9106]]
0.001
Naive Bayes: 0.7366126638634677
              precision    recall  f1-score   support

           0       0.84      0.72      0.77     20341
           1       0.62      0.76      0.68     12003

    accuracy                           0.74     32344
   macro avg       0.73      0.74      0.73     32344
weighted avg       0.76      0.74      0.74     32344

[[14660  5681]
 [ 2838  9165]]
0.005
Naive Bayes: 0.7339846648528321
              precision    recall  f1-score   support

           0       0.84      0.71      0.77     20341
           1       0.61      0.77      0.68     12003

    a