In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [56]:
df = pd.read_csv('quora question pairs.csv')

In [57]:
df.sample(5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
179812,179812,230474,114077,How can I get better at tennis?,What are some tips for getting better at tennis?,1
303006,303006,106738,426089,Is Jennifer Aniston Indian?,Who is Jennifer Aniston?,0
135545,135545,29973,216425,Who is the best companion in the Fallout 4 game?,How can I fix graphic problem in Fallout 4 whe...,0
317996,317996,70069,16180,If you follow someone on Instagram can they se...,Can someone see if you have viewed public Inst...,1
285551,285551,4951,69550,Why do people ask questions on Quora that can ...,Why do so many people ask soppy questions on Q...,1


In [58]:
new_df = df.sample(30000, random_state=2)

In [59]:
print(new_df['is_duplicate'].value_counts())
print(new_df['is_duplicate'].value_counts()/new_df['is_duplicate'].count() * 100)

0    19013
1    10987
Name: is_duplicate, dtype: int64
0    63.376667
1    36.623333
Name: is_duplicate, dtype: float64


In [60]:
new_df.isnull().sum()

id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64

In [61]:
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0


In [62]:
new_df['len_q1'] = new_df['question1'].str.len()
new_df['len_q2'] = new_df['question2'].str.len()

In [63]:
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1,76,77
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0,49,57
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0,105,120


In [64]:
new_df['words_q1'] = new_df['question1'].apply(lambda row: len(row.split(' ')))
new_df['words_q2'] = new_df['question2'].apply(lambda row: len(row.split(' ')))

In [65]:
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,words_q1,words_q2
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1,76,77,12,12
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0,49,57,12,15
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0,105,120,25,17


In [66]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(' ')))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(' ')))
    
    return len(w1 & w2)

In [67]:
new_df['common_word'] = new_df.apply(common_words, axis=1)
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,words_q1,words_q2,common_word
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1,76,77,12,12,11
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0,49,57,12,15,7
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0,105,120,25,17,2


In [68]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(' ')))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(' ')))
    
    return (len(w1) + len(w2))

In [69]:
new_df['total_word'] = new_df.apply(total_words, axis=1)
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,words_q1,words_q2,common_word,total_word
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1,76,77,12,12,11,24
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0,49,57,12,15,7,23
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0,105,120,25,17,2,34


In [70]:
new_df['word_share'] = round(new_df['common_word']/new_df['total_word'], 2)

In [71]:
new_df.head(3)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,len_q1,len_q2,words_q1,words_q2,common_word,total_word,word_share
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1,76,77,12,12,11,24,0.46
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0,49,57,12,15,7,23,0.3
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0,105,120,25,17,2,34,0.06


In [72]:
ques_df = new_df[['question1', 'question2']]
ques_df.head(3)

Unnamed: 0,question1,question2
398782,What is the best marketing automation tool for...,What is the best marketing automation tool for...
115086,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...
327711,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...


In [73]:
final_df = new_df.drop(columns=['id','qid1','qid2','question1','question2'])

In [74]:
final_df.head(3)

Unnamed: 0,is_duplicate,len_q1,len_q2,words_q1,words_q2,common_word,total_word,word_share
398782,1,76,77,12,12,11,24,0.46
115086,0,49,57,12,15,7,23,0.3
327711,0,105,120,25,17,2,34,0.06


In [75]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000)

questions = list(ques_df['question1']) + list(ques_df['question2'])

cv = CountVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(questions).toarray(),2)

In [76]:
q2_arr.shape

(30000, 3000)

In [77]:
temp_df1 = pd.DataFrame(q1_arr, index= ques_df.index)
temp_df2 = pd.DataFrame(q2_arr, index= ques_df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape

(30000, 6000)

In [78]:
final_df = pd.concat([final_df, temp_df], axis=1)
final_df.head(3)

Unnamed: 0,is_duplicate,len_q1,len_q2,words_q1,words_q2,common_word,total_word,word_share,0,1,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
398782,1,76,77,12,12,11,24,0.46,0,0,...,0,0,0,0,0,0,0,0,0,0
115086,0,49,57,12,15,7,23,0.3,0,0,...,0,0,0,0,0,0,0,0,0,0
327711,0,105,120,25,17,2,34,0.06,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_df.iloc[:,1:], final_df.iloc[:,0], test_size=0.2, random_state=42)

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_pred, y_test)



0.7633333333333333

# Advanced Features

# 1. Token Features

cwc_min: This is the ratio of the number of common words to the length of the smaller question

cwc_max: This is the ratio of the number of common words to the length of the larger question

csc_min: This is the ratio of the number of common stop words to the smaller stop word count among the two questions

csc_max: This is the ratio of the number of common stop words to the larger stop word count among the two questions

ctc_min: This is the ratio of the number of common tokens to the smaller token count among the two questions

ctc_max: This is the ratio of the number of common tokens to the larger token count among the two questions

last_word_eq: 1 if the last word in the two questions is same, 0 otherwise

first_word_eq: 1 if the first word in the two questions is same, 0 otherwise

# 2. Length Based Features

mean_len: Mean of the length of the two questions (number of words)

abs_len_diff: Absolute difference between the length of the two questions (number of words)

longest_substr_ratio: Ratio of the length of the longest substring among the two questions to the length of the smaller question

# 3. Fuzzy Features
fuzz_ratio: fuzz_ratio score from fuzzywuzzy

fuzz_partial_ratio: fuzz_partial_ratio from fuzzywuzzy

token_sort_ratio: token_sort_ratio from fuzzywuzzy

token_set_ratio: token_set_ratio from fuzzywuzz