# Base Level EDA

In [1]:
#importing the required libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df=pd.read_csv(r"C:\Users\asus\Documents\Projects\Question_Similarity_Pairs\Input\train.csv",).sample(6000)

In [3]:
df.head(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
234954,234954,316427,345556,When and why do we use friend function for ope...,Can we use friend function in operator overloa...,1
5278,5278,10391,10392,How can I know if he's genuinely depressed?,Now I am 26 years old but I didn't have a spou...,0
281689,281689,401489,401490,Does the autobahn have more accidents than oth...,Does the US have more accounts of DWI than oth...,0
320047,320047,65100,76005,How to get away with someone to whom you have ...,How do you kill someone and get away with it?,1
22525,22525,42257,42258,Does the law of attraction really work?,Is the law of attraction true?,1


In [6]:
print("The count of class 0 is ",df.is_duplicate.value_counts()[0])
print("The count of class 1 is ",df.is_duplicate.value_counts()[1])
zero_class=df.is_duplicate.value_counts()[0]
one_class=df.is_duplicate.value_counts()[1]
print("The total % records of zero class=",zero_class/df['is_duplicate'].count())
print("The total % records of one class=",one_class/df['is_duplicate'].count())



The count of class 0 is  3759
The count of class 1 is  2241
The total % records of zero class= 0.6265
The total % records of one class= 0.3735


We can see that the data we have is imbalanced

In [48]:
print("NAN in is_duplicate: ",df['is_duplicate'].isna().sum())
print("NAN in question1: ",df.question1.isna().sum())
print("NAN in question2 :",df.question2.isna().sum())

NAN in is_duplicate:  0
NAN in question1:  0
NAN in question2 : 0


<b> Now since here we're having very less count for NAN data compared to total data, so we can remove these rows</b>

In [49]:
df.dropna(axis=0,subset=['question1','question2'],how='any',inplace=True)

In [50]:
print("NAN in question1: ",df.question1.isna().sum())
print("NAN in question2 :",df.question2.isna().sum())
print("Total rows: ",df.count())

NAN in question1:  0
NAN in question2 : 0
Total rows:  id              6000
qid1            6000
qid2            6000
question1       6000
question2       6000
is_duplicate    6000
dtype: int64


<b> Vectorization </b>

In [51]:
new_df=df[['question1','question2','is_duplicate']]
new_df.head(3)

Unnamed: 0,question1,question2,is_duplicate
375906,"Is there a source for objective, unbiased, non...",Who is the most unbiased and reliable source o...,0
47896,What is best treatment/exercises for lumber st...,Where can I buy International designs of Sofas...,0
285615,Can tourists get a multiple entry Schengen visa?,If I have my Schengen visa from France embassy...,0


In [62]:
cv=CountVectorizer(max_features=4000)
data_question=list(new_df['question1'])+list(new_df['question2'])
q1,q2=np.vsplit(cv.fit_transform(data_question).toarray(),2)
df1=pd.DataFrame(q1,index=new_df.index)
df2=pd.DataFrame(q2,index=new_df.index)
final_df=pd.concat([df1,df2],axis=1)
final_df

In [65]:

X_train,X_test,y_train,y_test=train_test_split(final_df,new_df['is_duplicate'],test_size=0.2,stratify=new_df['is_duplicate'])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3990,3991,3992,3993,3994,3995,3996,3997,3998,3999
114318,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
109402,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281580,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
189626,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39612,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
173124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
306131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
398928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
#Building a base model over unprocessed data
dt_model=LogisticRegression()
dt_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [73]:
test_pred=dt_model.predict(X_test)
train_pred=dt_model.predict(X_train)
print("The Train Accurary is :",accuracy_score(y_train,train_pred))
print("The Test Accuracy is :",accuracy_score(y_test,test_pred))


The Train Accurary is : 0.9397916666666667
The Test Accuracy is : 0.6666666666666666


### Observation:
* There's a lot of overfitting happening
* The Test accuracy is pretty bad on unprocessed data (which is that we haven't done any pre-processing yet and nor any feature engineering)
* But now we atleast know the worst performance that the model can make.