# Base Level EDA

In [1]:
#importing the required libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [2]:
df=pd.read_csv(r"C:\Users\asus\Documents\Projects\Question_Similarity_Pairs\Input\train.csv",).sample(60000)

In [3]:
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
309023,309023,336618,122242,What is the poorest country in Asia? Why is it...,Which is the poorest country in Asia?,1
72095,72095,123944,123945,Is SVR Technologies reliable for online course?,How far is GIAN valuable and reliable or is it...,0
99824,99824,165662,165663,What is the nutritional importance of proteins...,What is nutritional importance of proteins and...,1
233982,233982,344364,344365,Why can't I find any newer pretentious movie r...,Why does Biswa Kalyan Rath look so frustrated ...,0
46573,46573,83291,83292,Is elec elex a core branch?,I got a rank of 13K in UPESEAT. I need a core ...,0


In [4]:
print("The count of class 0 is ",df.is_duplicate.value_counts()[0])
print("The count of class 1 is ",df.is_duplicate.value_counts()[1])
zero_class=df.is_duplicate.value_counts()[0]
one_class=df.is_duplicate.value_counts()[1]
print("The total % records of zero class=",zero_class/df['is_duplicate'].count())
print("The total % records of one class=",one_class/df['is_duplicate'].count())



The count of class 0 is  37994
The count of class 1 is  22006
The total % records of zero class= 0.6332333333333333
The total % records of one class= 0.3667666666666667


We can see that the data we have is imbalanced

In [5]:
print("NAN in is_duplicate: ",df['is_duplicate'].isna().sum())
print("NAN in question1: ",df.question1.isna().sum())
print("NAN in question2 :",df.question2.isna().sum())

NAN in is_duplicate:  0
NAN in question1:  0
NAN in question2 : 1


<b> Now since here we're having very less count for NAN data compared to total data, so we can remove these rows</b>

In [6]:
df.dropna(axis=0,subset=['question1','question2'],how='any',inplace=True)

In [7]:
print("NAN in question1: ",df.question1.isna().sum())
print("NAN in question2 :",df.question2.isna().sum())
print("Total rows: ",df.count())

NAN in question1:  0
NAN in question2 : 0
Total rows:  id              59999
qid1            59999
qid2            59999
question1       59999
question2       59999
is_duplicate    59999
dtype: int64


<b> Vectorization </b>

In [8]:
new_df=df[['question1','question2','is_duplicate']]
new_df.head(3)

Unnamed: 0,question1,question2,is_duplicate
309023,What is the poorest country in Asia? Why is it...,Which is the poorest country in Asia?,1
72095,Is SVR Technologies reliable for online course?,How far is GIAN valuable and reliable or is it...,0
99824,What is the nutritional importance of proteins...,What is nutritional importance of proteins and...,1


In [9]:
cv=CountVectorizer(max_features=4000)
data_question=list(new_df['question1'])+list(new_df['question2'])
q1,q2=np.vsplit(cv.fit_transform(data_question).toarray(),2)
df1=pd.DataFrame(q1,index=new_df.index)
df2=pd.DataFrame(q2,index=new_df.index)
final_df=pd.concat([df1,df2],axis=1)
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
309023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72095,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
233982,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377549,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
259776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
237928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
188538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:

X_train,X_test,y_train,y_test=train_test_split(final_df,new_df['is_duplicate'],test_size=0.2,stratify=new_df['is_duplicate'])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
395412,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
375525,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
356397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
370502,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
224956,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
257079,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
338702,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
202921,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
#Building a base model over unprocessed data
dt_model=LogisticRegression()
dt_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
test_pred=dt_model.predict_proba(X_test)
train_pred=dt_model.predict_proba(X_train)
print("The Train loss is :",log_loss(y_train,train_pred))
print("The Test loss is :",log_loss(y_test,test_pred))


The Train loss is : 0.4444556245868045
The Test loss is : 0.5621842778470422


### Observation:
* There's a lot of overfitting happening
* The Test accuracy is pretty bad on unprocessed data (which is that we haven't done any pre-processing yet and nor any feature engineering)
* But now we atleast know the worst performance that the model can make.