In [1]:
import numpy as np
import pandas as pd


import re
from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings('ignore')


In [2]:
from tqdm import tqdm, tqdm_notebook
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`

tqdm.pandas()

In [3]:
df = pd.read_csv('https://github.com/Koorimikiran369/Quora-Question-Pairing/raw/main/train.csv.zip')

In [4]:
df = df.sample(30000,random_state=2)

In [5]:
df.shape

(30000, 6)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 398782 to 312470
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            30000 non-null  int64 
 1   qid1          30000 non-null  int64 
 2   qid2          30000 non-null  int64 
 3   question1     30000 non-null  object
 4   question2     30000 non-null  object
 5   is_duplicate  30000 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 1.6+ MB


In [7]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0
367788,367788,498109,491396,Why do so many people in the U.S. hate the sou...,My boyfriend doesnt feel guilty when he hurts ...,0
151235,151235,237843,50930,Consequences of Bhopal gas tragedy?,What was the reason behind the Bhopal gas trag...,0


In [8]:
df.tail()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
243932,243932,26193,356455,What are some good web scraping tutorials?,What are some good web scraping programs?,1
91980,91980,154063,154064,Can I apply for internet banking in SBI withou...,I have internet banking kit of SBI but it's no...,0
266955,266955,133017,384210,How much HE laundry detergent do you use in a ...,Can I use regular Dawn dishsoap in my dishwash...,0
71112,71112,122427,122428,What is the best way to understand and learn m...,What are some of the best ways to learn math?,1
312470,312470,436915,436916,What would the Modi-led government do in case ...,"If Pakistan mounts a 26/11 type attack again, ...",1


In [9]:
val = df.is_duplicate.value_counts()

In [10]:
# non duplicate value percentage
print('non duplicate question ratio',val[0]/df.shape[0]*100)
# duplicate value percentage
print('duplicate question ratio',val[1]/df.shape[0]*100)

non duplicate question ratio 63.37666666666667
duplicate question ratio 36.623333333333335


In [11]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
398782,398782,496695,532029,What is the best marketing automation tool for...,What is the best marketing automation tool for...,1
115086,115086,187729,187730,I am poor but I want to invest. What should I do?,I am quite poor and I want to be very rich. Wh...,0
327711,327711,454161,454162,I am from India and live abroad. I met a guy f...,T.I.E.T to Thapar University to Thapar Univers...,0
367788,367788,498109,491396,Why do so many people in the U.S. hate the sou...,My boyfriend doesnt feel guilty when he hurts ...,0
151235,151235,237843,50930,Consequences of Bhopal gas tragedy?,What was the reason behind the Bhopal gas trag...,0


In [12]:
from imblearn.under_sampling import RandomUnderSampler

# X: Feature matrix, y: Target variable
X = df.iloc[:,:-1]  # Your feature data
y = df.iloc[:,-1:] # Your target labels

# Create an instance of RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

# Perform undersampling
X_resampled, y_resampled = rus.fit_resample(X, y)

# Print the class distribution after undersampling
unique, counts = np.unique(y_resampled, return_counts=True)
print("Class distribution after undersampling:")
for label, count in zip(unique, counts):
    print("Class {}: {}".format(label, count))

# Continue with your model training using the undersampled data (X_resampled, y_resampled)


Class distribution after undersampling:
Class 0: 10987
Class 1: 10987


In [13]:
type (X_resampled)

pandas.core.frame.DataFrame

In [14]:
type(y_resampled)

pandas.core.frame.DataFrame

In [15]:
df = pd.concat([X_resampled, y_resampled], axis=1)

In [16]:
val = df.is_duplicate.value_counts()
# non duplicate value percentage
print('non duplicate question ratio',val[0]/df.shape[0]*100)
# duplicate value percentage
print('duplicate question ratio',val[1]/df.shape[0]*100)

non duplicate question ratio 50.0
duplicate question ratio 50.0


In [17]:
def preprocess(q):

    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words
    contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()

    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()


    return q



In [18]:
preprocess("I've already! wasn't <b>done</b>?")


'i have already  was not done'

In [19]:
df['question1'] = df['question1'].progress_apply(preprocess)
df['question2'] = df['question2'].progress_apply(preprocess)


100%|██████████| 21974/21974 [00:07<00:00, 3040.15it/s]
100%|██████████| 21974/21974 [00:07<00:00, 2964.48it/s]


In [20]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk import pos_tag, word_tokenize
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
def pre_process(raw_text, flag):

    sentence = raw_text


    # Remove stop words
    clean_tokens = [t for t in sentence.split() if t not in stopwords.words("english")]

    # Stemming/Lemmatization and POS tagging
    if flag == 'stem':
        stemmer = PorterStemmer()
        token_list = []
        for word, tag in pos_tag(clean_tokens):
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            stem = stemmer.stem(word, wntag) if wntag else word
            token_list.append(stem)
    else:
        lemmatizer = WordNetLemmatizer()
        token_list = []
        for word, tag in pos_tag(clean_tokens):
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            lemma = lemmatizer.lemmatize(word, wntag) if wntag else word
            token_list.append(lemma)

    return pd.Series([" ".join(token_list)])

In [22]:
df['cleanQ1_lemma'] = df['question1'].progress_apply(lambda x: pre_process(x, 'lemma'))

100%|██████████| 21974/21974 [01:01<00:00, 356.67it/s]


In [23]:
df['cleanQ2_lemma'] = df['question2'].progress_apply(lambda x: pre_process(x, 'lemma'))

100%|██████████| 21974/21974 [00:52<00:00, 422.51it/s]


In [24]:
df['question1'].values[0:3]

array(['when function prototype is not necessary in c',
       'what does it mean if you see your husband taking some women in a hotel room',
       'law school  which language is more beneficial to learn for a lawyer   german or spanish'],
      dtype=object)

In [25]:
df['cleanQ1_lemma'].values[0:3]

array(['function prototype necessary c',
       'mean see husband take woman hotel room',
       'law school language beneficial learn lawyer german spanish'],
      dtype=object)

In [26]:
df['question2'].values[0:3]

array(['should i learn c before learning c',
       'what would be your very first reaction if you open the newspaper and read this headline   justin bieber found dead in his hotel room',
       'which language is beneficial for an engineer to learn  out of the following  german  french  japanese  or spanish'],
      dtype=object)

In [27]:
df['cleanQ2_lemma'].values[0:3]

array(['learn c learn c',
       'would first reaction open newspaper read headline justin bieber find dead hotel room',
       'language beneficial engineer learn follow german french japanese spanish'],
      dtype=object)

In [28]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...


# feature extraction Basic

In [29]:
df['q1_len'] = df['cleanQ1_lemma'].str.len()
df['q2_len'] = df['cleanQ2_lemma'].str.len()

In [30]:
df['q1_num_words'] = df['cleanQ1_lemma'].apply(lambda row: len(row.split(" ")))
df['q2_num_words'] = df['cleanQ1_lemma'].apply(lambda row: len(row.split(" ")))
df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,q1_num_words,q2_num_words
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,4,4
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,7,7
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,8,8
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,10,10
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,10,10


In [31]:
def common_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['cleanQ1_lemma'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['cleanQ2_lemma'].split(" ")))
    return len(w1 & w2)


In [32]:
df['word_common'] = df.apply(common_words, axis=1)
df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,q1_num_words,q2_num_words,word_common
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,4,4,1
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,7,7,2
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,8,8,5
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,10,10,3
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,10,10,3


In [33]:
def total_words(row):
    w1 = set(map(lambda word: word.lower().strip(), row['cleanQ1_lemma'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['cleanQ2_lemma'].split(" ")))
    return (len(w1) + len(w2))


In [34]:
df['word_total'] = df.apply(total_words, axis=1)
df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,4,4,1,6
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,7,7,2,20
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,8,8,5,17
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,10,10,3,18
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,10,10,3,23


In [35]:
df['word_share'] = round(df['word_common']/df['word_total'],2)
df.head()


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,q1_num_words,q2_num_words,word_common,word_total,word_share
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,4,4,1,6,0.17
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,7,7,2,20,0.1
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,8,8,5,17,0.29
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,10,10,3,18,0.17
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,10,10,3,23,0.13


# Advanced Features

In [36]:
# Advanced Features
from nltk.corpus import stopwords

def fetch_token_features(row):

    q1 = row['question1']
    q2 = row['question2']

    SAFE_DIV = 0.0001

    STOP_WORDS = stopwords.words("english")

    token_features = [0.0]*8

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))

    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))

    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))


    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


In [37]:
token_features = df.apply(fetch_token_features, axis=1)

df["cwc_min"]       = list(map(lambda x: x[0], token_features))
df["cwc_max"]       = list(map(lambda x: x[1], token_features))
df["csc_min"]       = list(map(lambda x: x[2], token_features))
df["csc_max"]       = list(map(lambda x: x[3], token_features))
df["ctc_min"]       = list(map(lambda x: x[4], token_features))
df["ctc_max"]       = list(map(lambda x: x[5], token_features))
df["last_word_eq"]  = list(map(lambda x: x[6], token_features))
df["first_word_eq"] = list(map(lambda x: x[7], token_features))


In [38]:
pip install distance

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting distance
  Downloading Distance-0.1.3.tar.gz (180 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.3/180.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: distance
  Building wheel for distance (setup.py) ... [?25l[?25hdone
  Created wheel for distance: filename=Distance-0.1.3-py3-none-any.whl size=16258 sha256=e97bf8989e4c51408eba86b921f89023654fa01ed9be1d1d52d179923d61cb68
  Stored in directory: /root/.cache/pip/wheels/e8/bb/de/f71bf63559ea9a921059a5405806f7ff6ed612a9231c4a9309
Successfully built distance
Installing collected packages: distance
Successfully installed distance-0.1.3


In [39]:
import distance

def fetch_length_features(row):

    q1 = row['cleanQ1_lemma']
    q2 = row['cleanQ2_lemma']

    length_features = [0.0]*3

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)

    return length_features


In [40]:
length_features = df.apply(fetch_length_features, axis=1)

df['abs_len_diff'] = list(map(lambda x: x[0], length_features))
df['mean_len'] = list(map(lambda x: x[1], length_features))
df['longest_substr_ratio'] = list(map(lambda x: x[2], length_features))


In [41]:
pip install fuzzywuzzy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [42]:
# Fuzzy Features
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):

    q1 = row['question1']
    q2 = row['question2']

    fuzzy_features = [0.0]*4

    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features


In [43]:
fuzzy_features = df.progress_apply(fetch_fuzzy_features, axis=1)

# Creating new feature columns for fuzzy features
df['fuzz_ratio'] = list(map(lambda x: x[0], fuzzy_features))
df['fuzz_partial_ratio'] = list(map(lambda x: x[1], fuzzy_features))
df['token_sort_ratio'] = list(map(lambda x: x[2], fuzzy_features))
df['token_set_ratio'] = list(map(lambda x: x[3], fuzzy_features))


100%|██████████| 21974/21974 [01:31<00:00, 240.75it/s]


In [44]:
print(df.shape)
df.head()


(21974, 30)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,...,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,longest_substr_ratio,fuzz_ratio,fuzz_partial_ratio,token_sort_ratio,token_set_ratio
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,...,0.124998,1.0,0.0,0.0,4.0,0.125,28,33,28,29
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,...,0.291665,1.0,1.0,6.0,10.0,0.282051,52,44,50,57
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,...,0.555552,1.0,0.0,1.0,8.5,0.338983,61,60,68,83
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,...,0.368419,0.0,0.0,0.0,10.0,0.16129,16,35,53,68
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,...,0.199999,0.0,0.0,4.0,12.0,0.151515,42,46,45,51


In [45]:
from sklearn.preprocessing import MinMaxScaler

X = MinMaxScaler().fit_transform(df[['cwc_min', 'cwc_max', 'csc_min', 'csc_max' , 'ctc_min' , 'ctc_max' , 'last_word_eq', 'first_word_eq' , 'abs_len_diff' , 'mean_len' , 'token_set_ratio' , 'token_sort_ratio' ,  'fuzz_ratio' , 'fuzz_partial_ratio' , 'longest_substr_ratio']])
y = df['is_duplicate'].values


In [46]:
# merge texts
corpus = list(df['cleanQ1_lemma']) + list(df['cleanQ2_lemma'])

In [47]:
display(type(corpus))
display(len(corpus))

list

43948

# first we can go with Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(cv.fit_transform(corpus).toarray(),2)


In [None]:
temp_df1 = pd.DataFrame(q1_arr, index= df.index)
temp_df2 = pd.DataFrame(q2_arr, index= df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape


(21974, 6000)

In [None]:
df = pd.concat([df, temp_df], axis=1)
print(df.shape)
df.head()


(21974, 6030)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ2_lemma,cleanQ1_lemma,q1_len,q2_len,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,learn c learn c,function prototype necessary c,30,15,...,0,0,0,0,0,0,0,0,0,0
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,would first reaction open newspaper read headl...,mean see husband take woman hotel room,38,84,...,0,0,0,0,0,0,0,0,0,0
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,language beneficial engineer learn follow germ...,law school language beneficial learn lawyer ge...,58,72,...,0,0,0,0,0,0,0,0,0,0
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,ipad mini 3 crashed run ios 9 skype app open,update old ipad 3 9 1 io 9 3 4,30,44,...,0,0,0,0,0,0,0,0,0,0
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,right time stop run tech business home take ne...,run business run apps wearable etc exploitable...,65,76,...,0,0,0,0,0,0,0,0,0,0


In [None]:
df.iloc[:,5:6].values

array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]])

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,8:].values,df.iloc[:,5:6].values,test_size=0.2,random_state=1)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
rf_y_pred = rf.predict(X_test)
accuracy_score(y_test,rf_y_pred)

0.7840728100113765

In [None]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train,y_train)
xgb_y_pred = xgb.predict(X_test)
accuracy_score(y_test,xgb_y_pred)


0.7929465301478953

# downloding the BOW and Models

In [None]:
import pickle

pickle.dump(rf,open('rf_model.pkl','wb'))
pickle.dump(xgb,open('xgb_model.pkl','wb'))
pickle.dump(cv,open('cv.pkl','wb'))


# lets try with TFIDF before running BOW run from here

In [60]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
Tf = TfidfVectorizer(max_features=3000)
q1_arr, q2_arr = np.vsplit(Tf.fit_transform(corpus).toarray(),2)


In [61]:
display(len(q1_arr))
display(q1_arr[0])
display(len(q2_arr))
display(q2_arr[0])

21974

array([0., 0., 0., ..., 0., 0., 0.])

21974

array([0., 0., 0., ..., 0., 0., 0.])

In [62]:
temp_df1 = pd.DataFrame(q1_arr, index= df.index)
temp_df2 = pd.DataFrame(q2_arr, index= df.index)
temp_df = pd.concat([temp_df1, temp_df2], axis=1)
temp_df.shape


(21974, 6000)

In [63]:
df = pd.concat([df, temp_df], axis=1)
print(df.shape)
df.head()


(21974, 6030)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,cleanQ1_lemma,cleanQ2_lemma,q1_len,q2_len,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,87407,147217,147218,when function prototype is not necessary in c,should i learn c before learning c,0,function prototype necessary c,learn c learn c,30,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,310950,435155,435156,what does it mean if you see your husband taki...,what would be your very first reaction if you ...,0,mean see husband take woman hotel room,would first reaction open newspaper read headl...,38,84,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,206689,310189,310190,law school which language is more beneficial ...,which language is beneficial for an engineer t...,0,law school language beneficial learn lawyer ge...,language beneficial engineer learn follow germ...,58,72,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,165212,256592,256593,should i update my old ipad 3 from 9 1 to ios ...,my ipad mini 3 just crashed it runs ios 9 th...,0,update old ipad 3 9 1 io 9 3 4,ipad mini 3 crashed run ios 9 skype app open,30,44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,304449,427708,427709,is the running business running apps wearabl...,when is the right time to stop running a tech ...,0,run business run apps wearable etc exploitable...,right time stop run tech business home take ne...,65,76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,8:].values,df.iloc[:,5:6].values,test_size=0.2,random_state=1)

In [65]:
from sklearn.metrics import accuracy_score

In [66]:
from sklearn.ensemble import RandomForestClassifier
rf_tf = RandomForestClassifier()
rf_tf.fit(X_train,y_train)
rf_tf_y_pred = rf_tf.predict(X_test)
accuracy_score(y_test,rf_tf_y_pred)

0.783617747440273

In [70]:
from xgboost import XGBClassifier
xgb_tf = XGBClassifier()
xgb_tf.fit(X_train,y_train)
xgb_tf_y_pred = xgb_tf.predict(X_test)
accuracy_score(y_test,xgb_tf_y_pred)


0.79226393629124

# downloding the TFIDF and Models

In [89]:
import pickle

pickle.dump(rf_tf,open('rf_tf_model.pkl','wb'))
pickle.dump(xgb_tf,open('xgb_tf_model.pkl','wb'))
pickle.dump(Tf,open('tfidf.pkl','wb'))


# confussion matrix

In [None]:
from sklearn.metrics import confusion_matrix
# for random forest model with bow
confusion_matrix(y_test,rf_y_pred)

array([[1573,  642],
       [ 307, 1873]])

In [69]:
from sklearn.metrics import confusion_matrix
# for random forest model with tfidf
confusion_matrix(y_test,rf_tf_y_pred)

array([[1574,  641],
       [ 310, 1870]])

In [None]:
# for random xgb with bow
confusion_matrix(y_test,xgb_y_pred)


array([[1613,  602],
       [ 308, 1872]])

In [71]:
# for random xgb with tfidf
confusion_matrix(y_test,xgb_tf_y_pred)

array([[1602,  613],
       [ 300, 1880]])

# query point creation

In [72]:
def preprocess(q):

    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    q = q.replace('€', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words
    contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()

    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()


    return q



In [73]:
def pre_process(q, flag):

    sentence = q


    # Remove stop words
    clean_tokens = [t for t in sentence.split() if t not in stopwords.words("english")]

    # Stemming/Lemmatization and POS tagging
    if flag == 'stem':
        stemmer = PorterStemmer()
        token_list = []
        for word, tag in pos_tag(clean_tokens):
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            stem = stemmer.stem(word, wntag) if wntag else word
            token_list.append(stem)
    else:
        lemmatizer = WordNetLemmatizer()
        token_list = []
        for word, tag in pos_tag(clean_tokens):
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            lemma = lemmatizer.lemmatize(word, wntag) if wntag else word
            token_list.append(lemma)
    return " ".join(token_list)
    #return pd.Series([" ".join(token_list)])

In [74]:
def test_common_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
    return len(w1 & w2)


In [75]:
def test_total_words(q1,q2):
    w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
    return (len(w1) + len(w2))

In [76]:
def test_fetch_token_features(q1,q2):

    SAFE_DIV = 0.0001

    STOP_WORDS = stopwords.words("english")

    token_features = [0.0]*8

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features

    # Get the non-stopwords in Questions
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])

    #Get the stopwords in Questions
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])

    # Get the common non-stopwords from Question pair
    common_word_count = len(q1_words.intersection(q2_words))

    # Get the common stopwords from Question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))

    # Get the common Tokens from Question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))


    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Last word of both question is same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both question is same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features


In [77]:
def test_fetch_length_features(q1,q2):

    length_features = [0.0]*3

    # Converting the Sentence into Tokens:
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return length_features

    # Absolute length features
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    #Average Token Length of both Questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens))/2

    strs = list(distance.lcsubstrings(q1, q2))
    length_features[2] = len(strs[0]) / (min(len(q1), len(q2)) + 1)

    return length_features

In [78]:
def test_fetch_fuzzy_features(q1,q2):

    fuzzy_features = [0.0]*4

    # fuzz_ratio
    fuzzy_features[0] = fuzz.QRatio(q1, q2)

    # fuzz_partial_ratio
    fuzzy_features[1] = fuzz.partial_ratio(q1, q2)

    # token_sort_ratio
    fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)

    # token_set_ratio
    fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)

    return fuzzy_features


In [82]:
def query_point_creator(q1,q2):

    input_query = []

    # preprocess
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    #stemming/lemmatization
    q1 = pre_process(q1,'lemma')
    q2 = pre_process(q2,'lemma')

    # fetch basic features
    input_query.append(len(q1))
    input_query.append(len(q2))

    input_query.append(len(q1.split(" ")))
    input_query.append(len(q2.split(" ")))

    input_query.append(test_common_words(q1,q2))
    input_query.append(test_total_words(q1,q2))
    input_query.append(round(test_common_words(q1,q2)/test_total_words(q1,q2),2))

    # fetch token features
    token_features = test_fetch_token_features(q1,q2)
    input_query.extend(token_features)

    # fetch length based features
    length_features = test_fetch_length_features(q1,q2)
    input_query.extend(length_features)

    # fetch fuzzy features
    fuzzy_features = test_fetch_fuzzy_features(q1,q2)
    input_query.extend(fuzzy_features)

    # bow feature for q1
    q1_bow = Tf.transform([q1]).toarray()

    # bow feature for q2
    q2_bow = Tf.transform([q2]).toarray()



    return np.hstack((np.array(input_query).reshape(1,22),q1_bow,q2_bow))


In [87]:
q1 = 'Where is the capital of India?'
q2 = 'What is the current capital of Pakistan?'
q3 = 'Which city serves as the capital of India?'
q4 = 'What is the business capital of India?'
q5 = 'What is the business capital of Pakistan?'



# with bow and models

In [None]:
display(rf.predict(query_point_creator(q1,q2)))
display(xgb.predict(query_point_creator(q1,q2)))

array([0])

array([1])

In [None]:
display(rf.predict(query_point_creator(q2,q4)))
display(xgb.predict(query_point_creator(q2,q4)))

array([0])

array([0])

In [None]:
display(rf.predict(query_point_creator(q1,q2)))
display(xgb.predict(query_point_creator(q1,q2)))

array([0])

array([1])

# with tfidf and models

In [84]:
display(rf_tf.predict(query_point_creator(q1,q2)))
display(xgb_tf.predict(query_point_creator(q1,q2)))

array([0])

array([0])

In [85]:
display(rf_tf.predict(query_point_creator(q2,q4)))
display(xgb_tf.predict(query_point_creator(q2,q4)))

array([0])

array([0])

In [86]:
display(rf_tf.predict(query_point_creator(q1,q3)))
display(xgb_tf.predict(query_point_creator(q1,q3)))

array([1])

array([1])

In [88]:
display(rf_tf.predict(query_point_creator(q2,q5)))
display(xgb_tf.predict(query_point_creator(q5,q2)))

array([1])

array([1])

In [None]:
#Creating a dataframe that contains the question ID of both qid1 and qid2
qids = pd.Series(list(df['qid1']) + list(df['qid2']))

#Counting the total number of unique questions
unique_qs = len(np.unique(qids))
print('Total number of Unique Questions are: {}\n'.format(unique_qs))

#Counting the number of unique questions that appear more than once
qs_morethan_onetime = np.sum(qids.value_counts() > 1)
percentage = round(qs_morethan_onetime / unique_qs * 100, 2)
print('Number of unique questions that appear more than one time: {} ({}%)\n'.format(qs_morethan_onetime, percentage))

#Finding the maximum number of times a single question is repeated
max_repeated = max(qids.value_counts())
print('Max number of times a single question is repeated: {}\n'.format(max_repeated))

#Printing the frequency of all questions
q_vals = qids.value_counts()
print(q_vals)

Total number of Unique Questions are: 40098

Number of unique questions that appear more than one time: 2794 (6.97%)

Max number of times a single question is repeated: 8

2374      8
11264     8
11397     7
2439      7
25163     7
         ..
199747    1
258433    1
26798     1
440136    1
436916    1
Length: 40098, dtype: int64
