<a href="https://colab.research.google.com/github/Manisha-Karim/Duplicate-Question-Prediction/blob/main/Quora_Similar_Question.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')

In [None]:
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the question-pairs training CSV into `df`.
# NOTE(review): hard-coded absolute path under /content/drive — presumably needs
# Google Drive mounted in Colab before this cell runs; confirm.
df = pd.read_csv('/content/drive/MyDrive/Datasets/quora-question-pairs/train.csv')

In [None]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
# Drop the identifier columns; only the two question texts and the label are used below.
df = df.drop(columns=['id', 'qid1', 'qid2'])

In [None]:
# Build a class-balanced subset: 20,000 duplicate pairs and 20,000 non-duplicate pairs.
# The sample is seeded so that every downstream table and the final accuracy can be
# reproduced after a kernel restart; an unseeded .sample() draws different rows each run.
SAMPLE_SEED = 1
d1 = df[df['is_duplicate'] == 1].sample(20000, random_state=SAMPLE_SEED)
d2 = df[df['is_duplicate'] == 0].sample(20000, random_state=SAMPLE_SEED)

In [None]:
# Stack the duplicate sample on top of the non-duplicate sample (rows keep their original index).
df = pd.concat((d1, d2), axis='index')

In [None]:
df.shape

(40000, 3)

# Data Cleaning

In [None]:
# Make sure the NLTK stop-word corpus is available locally, then load the
# English stop-word list into `stop` (the name clean_words() filters against).
nltk.download('stopwords')
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Customise the stop-word collection and store it back in `stop`, which is the
# name clean_words() actually filters against. Binding the customised set to
# `stopwords` had two problems: it shadowed the imported nltk.corpus.stopwords
# module (re-running the download cell would then fail on `stopwords.words`),
# and nothing ever read it, so the customisations had no effect.
stop = set(stop)

# Extra tokens to discard.
stop.update({"said", "br", " "})

# Keep the negations "not" and "no" as real tokens.
# difference_update (unlike remove) does not raise if the cell is re-run.
stop.difference_update({"not", "no"})

In [None]:
# Shared stemmer instance; clean_words() calls porter.stem() on every kept token.
porter = PorterStemmer()

In [None]:
# Whole-word contraction expansions (keys are lowercase).
# Built once here instead of on every clean_words() call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Symbols that are spelled out as words before non-letters are stripped.
_SYMBOL_WORDS = (
    ('%', ' percent '),
    ('$', ' dollar '),
    ('₹', ' rupee '),
    ('€', ' euro '),
    ('@', ' at '),
)

# Generic suffix expansions, applied only AFTER the whole-word lookup so that
# irregular forms are not mangled first (e.g. "won't" -> "wo not").
_CONTRACTION_SUFFIXES = (
    ("'ve", " have"),
    ("n't", " not"),
    ("'re", " are"),
    ("'ll", " will"),
)


def clean_words(raw_text):
    """Normalise one question string into space-joined, stemmed tokens.

    Steps, in order: lowercase; spell out currency/percent/at symbols; expand
    contractions (whole-word table first, generic suffixes as a fallback);
    drop the ``[math]`` marker; strip HTML with BeautifulSoup; replace every
    non-letter with a space; remove tokens found in the module-level ``stop``
    collection; stem the rest with the module-level ``porter`` stemmer.

    Previously the tokens that were filtered, stemmed and returned came from a
    snapshot taken *before* any of the cleaning steps ran, so all of those
    steps were silently discarded. The whole pipeline now operates on a single
    ``text`` value.

    Parameters
    ----------
    raw_text : str
        The question text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces (may be the empty string).
    """
    # Lowercase first: the contraction table is keyed by lowercase words.
    # Typographic apostrophes are folded to ASCII so "don’t" matches "don't".
    text = raw_text.lower().replace("’", "'")

    for symbol, spelled_out in _SYMBOL_WORDS:
        text = text.replace(symbol, spelled_out)

    # Whole-word contraction lookup; unknown tokens pass through unchanged.
    text = ' '.join(_CONTRACTIONS.get(token, token) for token in text.split())

    # Fallback for contractions not in the table or glued to punctuation ("don't?").
    for suffix, expansion in _CONTRACTION_SUFFIXES:
        text = text.replace(suffix, expansion)

    # NOTE(review): the second replace also removes "math" inside longer words
    # (e.g. "mathematics"); kept as in the original — confirm this is intended.
    text = text.replace('[math]', '').replace('math', '')

    # Strip any HTML markup.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Keep letters only. The class must be A-Z, not A-z: the latter also
    # admits the characters [ \ ] ^ _ ` that sit between 'Z' and 'a' in ASCII.
    text = re.sub('[^a-zA-Z]', ' ', text)

    kept_tokens = [token for token in text.split() if token not in stop]
    return ' '.join(porter.stem(token) for token in kept_tokens)


In [None]:
# Force both question columns to str so missing values do not break the string cleaning below.
df = df.astype({'question1': str, 'question2': str})

In [None]:
# Run the text-cleaning function over every question in both columns.
df = df.assign(
    question1=df['question1'].map(clean_words),
    question2=df['question2'].map(clean_words),
)

In [None]:
df.head()

Unnamed: 0,question1,question2,is_duplicate
97444,us still land free?,america realli land free?,1
129390,handl love someon feel same?,feel express love someon reciproc love?,1
266734,stage fright? overcom it?,best way overcom stage fright?,1
186530,differ ba (hons) english & ba english?,what' differ english honour ba english?,1
330352,employ skills?,employ skills?,1


# Feature Engineering

#### Length of Questions

In [None]:
# Number of single-space-separated pieces in each cleaned question
# (an empty string still counts as one piece, since "".split(" ") == [""]).
df['q1_len'] = [len(question.split(" ")) for question in df['question1']]
df['q2_len'] = [len(question.split(" ")) for question in df['question2']]

#### Common Words

In [None]:
def common_words(text):
    """Count the distinct words that occur in both questions of a row.

    `text` is a row-like mapping with string values under 'question1' and
    'question2'. Each question is split on single spaces; every piece is
    lowercased and stripped before the two sets are intersected.
    """
    first = {piece.lower().strip() for piece in text['question1'].split(" ")}
    second = {piece.lower().strip() for piece in text['question2'].split(" ")}
    shared = first.intersection(second)
    return len(shared)


In [None]:
# Per-row count of distinct words shared by question1 and question2.
word_common = df.apply(common_words, axis='columns')
word_common

97444     2
129390    3
266734    3
186530    4
330352    2
         ..
374877    0
370614    3
334414    0
238278    7
225511    2
Length: 40000, dtype: int64

In [None]:
def total_words(row):
    """Return the number of distinct words in question1 plus that in question2.

    `row` is a row-like mapping with string values under 'question1' and
    'question2'. Words are obtained by splitting on single spaces, then
    lowercasing and stripping each piece. A word present in both questions is
    counted once per question.
    """
    distinct_counts = [
        len({piece.lower().strip() for piece in row[column].split(" ")})
        for column in ('question1', 'question2')
    ]
    return sum(distinct_counts)

In [None]:
# Per-row sum of the distinct-word counts of the two questions.
word_total = df.apply(total_words, axis='columns')
word_total

97444      8
129390    11
266734     9
186530    12
330352     4
          ..
374877     3
370614     8
334414    22
238278    16
225511     7
Length: 40000, dtype: int64

In [None]:
# Shared distinct words divided by the summed distinct-word counts, to 2 decimals.
# Two identical questions therefore score 0.5, not 1.0.
df['common_word_ratio'] = (word_common / word_total).round(2)
df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,common_word_ratio
97444,us still land free?,america realli land free?,1,4,4,0.25
129390,handl love someon feel same?,feel express love someon reciproc love?,1,5,6,0.27
266734,stage fright? overcom it?,best way overcom stage fright?,1,4,5,0.33
186530,differ ba (hons) english & ba english?,what' differ english honour ba english?,1,7,6,0.33
330352,employ skills?,employ skills?,1,2,2,0.5


#### Ratio of common words to the shorter and the longer question length

In [None]:
# Small constant added to the DENOMINATOR so the division is safe.
# Without parentheses, `a / b + eps` parses as `(a / b) + eps`, which only
# shifts every ratio by eps and offers no protection against a zero length.
SAFE_DIV = 0.0001

shorter_len = df[['q1_len', 'q2_len']].min(axis=1)
longer_len = df[['q1_len', 'q2_len']].max(axis=1)

# Shared distinct words relative to the shorter / longer question, to 2 decimals.
df['min_common_word_ratio'] = (word_common / (shorter_len + SAFE_DIV)).round(2)
df['max_common_word_ratio'] = (word_common / (longer_len + SAFE_DIV)).round(2)
df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,common_word_ratio,min_common_word_ratio,max_common_word_ratio
97444,us still land free?,america realli land free?,1,4,4,0.25,0.5,0.5
129390,handl love someon feel same?,feel express love someon reciproc love?,1,5,6,0.27,0.6,0.5
266734,stage fright? overcom it?,best way overcom stage fright?,1,4,5,0.33,0.75,0.6
186530,differ ba (hons) english & ba english?,what' differ english honour ba english?,1,7,6,0.33,0.67,0.57
330352,employ skills?,employ skills?,1,2,2,0.5,1.0,1.0


#### First/Last word match

In [None]:
def fetch_token_features(row):
    """Compare the last and the first token of a row's two questions.

    `row` is a row-like mapping with string values under 'question1' and
    'question2'. Returns a two-element list
    ``[last_token_equal, first_token_equal]`` holding 1 or 0. If either
    question has no whitespace-separated tokens, ``[0.0, 0.0]`` is returned.
    """
    first_tokens = row['question1'].split()
    second_tokens = row['question2'].split()

    # Nothing to compare when either side is empty.
    if not first_tokens or not second_tokens:
        return [0.0, 0.0]

    same_last = int(first_tokens[-1] == second_tokens[-1])
    same_first = int(first_tokens[0] == second_tokens[0])
    return [same_last, same_first]

In [None]:
# One [last_equal, first_equal] pair per row.
token = df.apply(fetch_token_features, axis=1)

# Unpack each pair into its own column.
df["last_word_similar"] = [pair[0] for pair in token]
df["first_word_similar"] = [pair[1] for pair in token]

In [None]:
df.head()

Unnamed: 0,question1,question2,is_duplicate,q1_len,q2_len,common_word_ratio,min_common_word_ratio,max_common_word_ratio,last_word_similar,first_word_similar
97444,us still land free?,america realli land free?,1,4,4,0.25,0.5,0.5,1.0,0.0
129390,handl love someon feel same?,feel express love someon reciproc love?,1,5,6,0.27,0.6,0.5,0.0,0.0
266734,stage fright? overcom it?,best way overcom stage fright?,1,4,5,0.33,0.75,0.6,0.0,0.0
186530,differ ba (hons) english & ba english?,what' differ english honour ba english?,1,7,6,0.33,0.67,0.57,1.0,0.0
330352,employ skills?,employ skills?,1,2,2,0.5,1.0,1.0,1.0,1.0


#### Absolute Length Difference and Mean Length

In [None]:
def fetch_length_features(row):
    """Compute token-count features for a row's two questions.

    `row` is a row-like mapping with string values under 'question1' and
    'question2'. Returns ``[abs_difference, mean]`` of the two
    whitespace-token counts. If either question has no tokens,
    ``[0.0, 0.0]`` is returned.
    """
    n_first = len(row['question1'].split())
    n_second = len(row['question2'].split())

    # Both features stay at zero when either question is empty.
    if n_first == 0 or n_second == 0:
        return [0.0, 0.0]

    length_gap = abs(n_first - n_second)
    length_mean = (n_first + n_second) / 2
    return [length_gap, length_mean]

In [None]:
# One [abs_difference, mean] pair per row.
length_features = df.apply(fetch_length_features, axis=1)

# Unpack each pair into its own column.
df['abs_len_diff'] = [pair[0] for pair in length_features]
df['mean_len'] = [pair[1] for pair in length_features]

#### Feature Impact

# Count Vectorizer

In [None]:
# Target is the duplicate flag; every other column is a feature.
y = df['is_duplicate']
X = df.drop(columns=['is_duplicate'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Bag-of-words counts for the training questions, capped at 1000 terms each.
# The same vectorizer object is re-fit for each column, so question1 and
# question2 get separate vocabularies and `cv` ends up fitted on question2.
cv = CountVectorizer(max_features=1000)
q1_arr, q2_arr = (
    cv.fit_transform(X_train[column]).toarray()
    for column in ('question1', 'question2')
)

In [None]:
# Wrap both count matrices in frames that share the training index so they align on concat.
temp_df1 = pd.DataFrame(data=q1_arr, index=X_train.index)
temp_df2 = pd.DataFrame(data=q2_arr, index=X_train.index)

In [None]:
# Place the question1 counts and the question2 counts side by side.
temp_df = pd.concat((temp_df1, temp_df2), axis='columns')

In [None]:
X_train.shape

(32000, 11)

In [None]:
# Append the bag-of-words columns to the hand-crafted training features.
X_train = pd.concat((X_train, temp_df), axis='columns')
print(X_train.shape)
X_train.head()

(32000, 2011)


Unnamed: 0,question1,question2,q1_len,q2_len,common_word_ratio,min_common_word_ratio,max_common_word_ratio,last_word_similar,first_word_similar,abs_len_diff,...,990,991,992,993,994,995,996,997,998,999
227182,much time requir learn php?,much time requir learn php know c c++?,5,8,0.31,0.8,0.5,0.0,1.0,3.0,...,0,0,0,0,0,0,0,0,0,0
129055,get girlfriend?,way get girlfriend?,2,3,0.4,1.0,0.67,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
143238,rubi rail compani hire fresher pune?,compani pune hire rubi rail freshers?,6,6,0.33,0.67,0.67,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
182780,best day life? happened?,best day life made amaz stay mind forever?,4,8,0.17,0.5,0.25,0.0,1.0,4.0,...,0,0,0,0,0,0,0,0,0,0
162536,lesser known fact big bang theori (tv series)?,mind-blow fact big bang theori (tv series)?,8,7,0.4,0.86,0.75,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Learn the vocabularies from the TRAINING questions only, then merely
# transform the test questions with them. Re-fitting on the test set builds a
# different vocabulary, so test column i would stand for a different word than
# training column i and the model would be scored on mismatched features.
# Fitting here on the same training text, with the same settings as the
# training cell, reproduces the training column layout for each question column.
cv_q1 = CountVectorizer(max_features=1000).fit(X_train['question1'])
cv_q2 = CountVectorizer(max_features=1000).fit(X_train['question2'])

q1_arr = cv_q1.transform(X_test['question1']).toarray()
q2_arr = cv_q2.transform(X_test['question2']).toarray()

In [None]:
# Wrap both count matrices in frames that share the test index so they align on concat.
temp_df1 = pd.DataFrame(data=q1_arr, index=X_test.index)
temp_df2 = pd.DataFrame(data=q2_arr, index=X_test.index)

In [None]:
# Place the question1 counts and the question2 counts side by side.
temp_df = pd.concat((temp_df1, temp_df2), axis='columns')

In [None]:
# Append the bag-of-words columns to the hand-crafted test features.
X_test = pd.concat((X_test, temp_df), axis='columns')
print(X_test.shape)
X_test.head()

(8000, 2011)


Unnamed: 0,question1,question2,q1_len,q2_len,common_word_ratio,min_common_word_ratio,max_common_word_ratio,last_word_similar,first_word_similar,abs_len_diff,...,990,991,992,993,994,995,996,997,998,999
210539,best book 2016?,one best book 2016?,3,4,0.43,1.0,0.75,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
81780,i'm nativ speaker. read quora everi day improv...,could read quora improv english significantly?,10,6,0.31,0.83,0.5,1.0,0.0,4.0,...,0,0,0,0,0,0,0,0,0,0
266449,get boyfriend birthday?,gift boyfriend birthday?,3,3,0.33,0.67,0.67,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
221473,"could good altern ""i look forward hear you""?","correct: ""i eagerli look forward work here""?",8,7,0.2,0.43,0.38,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
361649,best way gain confidence?,key confidence?,4,2,0.17,0.5,0.25,1.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The raw text columns are not numeric model inputs; keep only the derived features.
X_train = X_train.drop(columns=['question1', 'question2'])
X_test = X_test.drop(columns=['question1', 'question2'])

In [None]:
X_train.head()

Unnamed: 0,q1_len,q2_len,common_word_ratio,min_common_word_ratio,max_common_word_ratio,last_word_similar,first_word_similar,abs_len_diff,mean_len,0,...,990,991,992,993,994,995,996,997,998,999
227182,5,8,0.31,0.8,0.5,0.0,1.0,3.0,6.5,0,...,0,0,0,0,0,0,0,0,0,0
129055,2,3,0.4,1.0,0.67,1.0,0.0,1.0,2.5,0,...,0,0,0,0,0,0,0,0,0,0
143238,6,6,0.33,0.67,0.67,0.0,0.0,0.0,6.0,0,...,0,0,0,0,0,0,0,0,0,0
182780,4,8,0.17,0.5,0.25,0.0,1.0,4.0,6.0,0,...,0,0,0,0,0,0,0,0,0,0
162536,8,7,0.4,0.86,0.75,1.0,0.0,1.0,7.5,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The count columns carry integer labels while the feature columns carry string
# labels; convert every label to str so the column names share one type.
X_train.columns = X_train.columns.map(str)
X_test.columns = X_test.columns.map(str)

In [None]:
X_train.shape

(32000, 2009)

## Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed the forest so the reported accuracy can be reproduced after a kernel
# restart; an unseeded RandomForestClassifier gives a different score on every run.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)

# Accuracy on the held-out 20% split.
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.7135