# Quora Question Pairs

## Data Cleaning

### Import packages

In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import nltk
import os

In [2]:
# Folder layout used throughout the notebook (paths are relative to the working directory).
RAW_DATA_FOLDER = "raw_data" # must already exist and contain quora_duplicate_questions.tsv
STAGING_DATA_FOLDER = "staging_data" # intermediate outputs
CLEAN_DATA_FOLDER = "clean_data" # final outputs (vocabulary)
# Create the output folders up front so the to_csv calls later in the notebook do not
# fail on a fresh checkout; exist_ok=True keeps this cell safe to re-run.
for output_folder in (STAGING_DATA_FOLDER, CLEAN_DATA_FOLDER):
    os.makedirs(output_folder, exist_ok=True)
# NOTE(review): newer NLTK releases may need the "punkt_tab" resource for word_tokenize - confirm.
nltk.download("punkt") # This downloads the helper files for word_tokenize

[nltk_data] Downloading package punkt to /Users/nbthakur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#nltk.download("wordnet") # Helper files for Lemmatization

In [4]:
# Fetch NLTK's stop-word lists; stopwords.words('english') is read from them further down.
nltk.download("stopwords") # Helper files for stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nbthakur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Read the Dataset into a Pandas Dataframe

In [5]:
# os.path.join builds the path with the separator of the current OS,
# so the notebook also runs on Windows.
raw_dataset_path = os.path.join(RAW_DATA_FOLDER, "quora_duplicate_questions.tsv")
# The raw file is tab-separated.
df = pd.read_csv(raw_dataset_path, sep="\t")

In [6]:
# Preview the first five rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [7]:
# (rows, columns) of the raw dataset.
df.shape

(404290, 6)

In [8]:
# Summing the label column counts the pairs labelled 1 (assumes the labels are only 0/1).
df["is_duplicate"].sum() # count of label 1

149263

In [9]:
# Total number of pairs minus the pairs labelled 1 (assumes the labels are only 0/1).
len(df)-(df["is_duplicate"].sum()) # Count of label 0

255027

In [10]:
# This is the share of label-0 pairs among all pairs, not the ratio of label 0 to label 1.
(len(df)-(df["is_duplicate"].sum()))/(len(df)) # fraction of rows with label 0

0.630802146973707

## TODO: plot the distribution of the target label here, and show it again after augmentation

## Data Cleaning

### Missing Data

#### 1. Deleted the row with a missing question, because its question text could not be found in any other row.
#### 2. Two other rows shared the same missing question and had similar text in the other question, so we merged them into a single row.

In [11]:
# Count the missing values in every column of the raw data
nulls_per_column = df.isna().sum()

# Header line followed by the per-column counts
print("nulls in each column: ", nulls_per_column, sep="\n")

nulls in each column: 
id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64


In [12]:
# Show every pair in which at least one of the two question texts is missing.
df.loc[df[["question1", "question2"]].isnull().any(axis=1)]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


### Handle Missing Data

Remove the non-resolvable null row

In [13]:
# Pairs whose question1 text is missing.
df.loc[df["question1"].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [14]:
# Look at the pair with id 363362 before removing it.
df.loc[df["id"] == 363362]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
363362,363362,493340,493341,,My Chinese name is Haichao Yu. What English na...,0


In [15]:
# Remove the pair with id 363362: its question1 text is missing and, with only one
# instance of this qid1 in the data, the text cannot be recovered from another row.
df = df.loc[df["id"] != 363362]

In [16]:
# Should return no rows now that the pair has been dropped.
df.loc[df["id"] == 363362]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [17]:
# Re-count the missing values per column after dropping the unresolvable row
nulls_per_column = df.isna().sum()

# Header line followed by the per-column counts
print("nulls in each column: ", nulls_per_column, sep="\n")

nulls in each column: 
id              0
qid1            0
qid2            0
question1       0
question2       2
is_duplicate    0
dtype: int64


### Handle Data Inconsistency 1

Handle resolvable nulls - question text can be obtained from other example

In [18]:
# Pairs whose question2 text is missing.
df.loc[df["question2"].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,174364,How can I develop android app?,,0
201841,201841,303951,174364,How can I create an Android app?,,0


In [19]:
# Rows 105780 and 201841 share the same qid2, whose question text is missing, while their
# question1 texts are very similar. Fold them into a single pair: row 105780 keeps its own
# question1 and takes row 201841's qid1/question1 as its qid2/question2.
donor_qid = df.loc[201841, "qid1"]
donor_text = df.loc[201841, "question1"]
df.loc[105780, "qid2"] = donor_qid
df.loc[105780, "question2"] = donor_text
# Row 201841 is now redundant.
df = df[df["id"] != 201841]

In [20]:
# Should return no rows now that the two rows have been merged.
df.loc[df["question2"].isnull()]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate


In [21]:
# Inspect the merged pair.
df.loc[df["id"] == 105780]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,303951,How can I develop android app?,How can I create an Android app?,0


In [22]:
# The merged pair (id 105780) now holds two phrasings of the same question,
# so its label is set to 1 (duplicate) by hand.
df.loc[105780, "is_duplicate"] = 1

In [23]:
# Confirm the label change on the merged pair.
df.loc[df["id"] == 105780]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
105780,105780,174363,303951,How can I develop android app?,How can I create an Android app?,1


In [24]:
# Final re-count of missing values per column; every column should now be at zero
nulls_per_column = df.isna().sum()

# Header line followed by the per-column counts
print("nulls in each column: ", nulls_per_column, sep="\n")

nulls in each column: 
id              0
qid1            0
qid2            0
question1       0
question2       0
is_duplicate    0
dtype: int64


In [25]:
# One row was dropped and two rows were merged into one above.
df.shape # two rows fewer than the raw dataset

(404288, 6)

### Handle Data Inconsistency 2

Handle possible duplicate data

In [26]:
# Number of rows that repeat an earlier row on every column except "id".
df.duplicated(subset=["qid1", "qid2", "question1", "question2", "is_duplicate"]).sum()

0

In [27]:
# Order-independent key for each pair, so that (a, b) and (b, a) compare equal.
# Built by zipping the two id columns instead of a row-wise apply(axis=1), which
# constructs a Series for every row and is much slower on a frame of this size.
df["uuid"] = pd.Series(
    [tuple(sorted(pair)) for pair in zip(df["qid1"], df["qid2"])],
    index=df.index,  # keep the values aligned with df's (non-contiguous) row labels
)
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,uuid
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,"(1, 2)"
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,"(3, 4)"
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,"(5, 6)"
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,"(7, 8)"
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,"(9, 10)"


In [28]:
# Count the pairs that occur more than once regardless of question order.
# Printed explicitly: a bare expression that is not the last statement of a cell
# is evaluated but never displayed, so the check would otherwise be invisible.
print("repeated (qid1, qid2) pairs:", df["uuid"].duplicated().sum())
# The helper key is not needed beyond this check.
df = df.drop(columns="uuid")

#### Inconsistent Data

#### Step 1. Some question texts have multiple question ids associated with them.
#### Step 2. Select only one question id for each such question text.
#### Step 3. Replace the question id of every occurrence of that question text with the selected id.
#### Step 4. Remove the duplicate ids.
#### Step 5. Store the results in a new file.

#### To check if the data is inconsistent or not we first split the data into question ids and question text.

#### Create question pairs

In [29]:
# Pairs table: only the ids and the label; the question texts go into the question bank.
q_pairs = df.loc[:, ["id", "qid1", "qid2", "is_duplicate"]]

#### Create the question bank

In [30]:
# Bring both (qid, text) column pairs to the common column names "qid" / "question".
df1 = df[["qid1", "question1"]].rename(columns={"qid1": "qid", "question1": "question"})
df2 = df[["qid2", "question2"]].rename(columns={"qid2": "qid", "question2": "question"})
# Stack them into one long table, then drop exact repeats and nulls and order by qid.
qs = (
    pd.concat([df1, df2], axis=0)
    .drop_duplicates()
    .dropna()
    .sort_values("qid", axis=0)
)

### Handle Data Inconsistency 3

Handle question texts with multiple question ids

In [31]:
# Question texts that appear under more than one qid (keep=False marks every occurrence),
# ordered by text and then qid; show the first 30 rows.
qs[qs["question"].duplicated(keep=False)].sort_values(["question", "qid"]).head(n=30)

Unnamed: 0,qid,question
13016,25026,?
20794,39204,?
47056,84068,?
96725,161071,?
104101,171925,?
134403,214814,?
208485,312495,?
273065,391451,?
402423,535899,?
36829,42085,? to be deleted


In [32]:
# Same repeated texts, reduced to one row per text (the lowest qid, since rows are ordered
# by text then qid) and ordered by text length; show the 20 shortest.
qs[qs["question"].duplicated(keep=False)].sort_values(
    ["question", "qid"]
).drop_duplicates("question").sort_values("question", key=lambda x: x.str.len()).head(n=20)

Unnamed: 0,qid,question
13016,25026,?
260779,376791,Why?
25228,47035,What?
17682,33561,deleted
87861,147899,Deleted.
303,606,Do I exist?
151566,238292,Am I crazy?
210668,315300,What is sex?
78579,133960,What is tcs?
25698,47881,What is SIP?


Let's remember to remove the questions above later, using the question ids below, since these questions don't mean anything - 

25026, 42085, 376791, 47035, 33561, 147899

#### Remove the duplicate questions from question bank, and generate mapping for question pairs

In [33]:
# Keep one row per question text. qs is ordered by qid at this point, so the row that is
# kept for each text is the one with the lowest qid.
qs_dedup = qs.drop_duplicates(subset="question", keep="first")
# question text -> qid that was kept for it
qid_map = dict(zip(qs_dedup["question"], qs_dedup["qid"]))
# Series indexed by every original qid, holding the kept qid for that question's text
reverse_map = qs.set_index('qid')['question'].map(qid_map)
# original qid -> kept qid. reverse_map is already indexed by qid, so it converts directly;
# wrapping it in pd.Series(..., index=qs['qid']) would only re-index it onto the same
# labels, which changes nothing and raises if a qid label occurs twice.
final_qid_map = reverse_map.to_dict()
# From here on the question bank holds one row per distinct question text
qs = qs_dedup

example

originally the dataset was:

question bank
| id | question |
| --- | --- |
| 1 | ? |
| 2 | ? |
| 3 | ? |
| 4 | ? to be deleted |
| 5| ? to be deleted |

question pair
| qid1 | qid2 | is_duplicate |
| --- | --- | ---|
| 787 | 1 | 0 |
| 2 | 500 | 1 |
| 101 | 3 | 0 |
| 4 | 7 | 0 |
| 17 | 5 | 0 |


Post transformations:

question bank
| id | question |
| --- | --- |
| 1 | ? |
| 1 | ? |
| 1 | ? |
| 4 | ? to be deleted |
| 4| ? to be deleted |

question pair
| qid1 | qid2 | is_duplicate |
| --- | --- | ---|
| 787 | 1 | 0 |
| 1 | 500 | 1 |
| 101 | 1 | 0 |
| 4 | 7 | 0 |
| 17 | 4 | 0 |

Removing duplicates:

question bank
| id | question |
| --- | --- |
| 1 | ? |
| 4 | ? to be deleted |

question pair
| qid1 | qid2 | is_duplicate |
| --- | --- | ---|
| 787 | 1 | 0 |
| 1 | 500 | 1 |
| 101 | 1 | 0 |
| 4 | 7 | 0 |
| 17 | 4 | 0 |


#### Update question pairs dataframe by renaming the ids 

For example, after this step every question whose text is `?` has the id 25026.

In [34]:
# Rewrite both id columns of the pairs table so that each pair refers to the kept qid.
for id_column in ("qid1", "qid2"):
    q_pairs[id_column] = q_pairs[id_column].map(final_qid_map)

### Handle Data Inconsistency 4

Remove meaningless questions example below

| qid | question |
| --- | --- |
| 25026 | ? |
| 42085 | ? to be deleted |
| 487791 | To be deleted? |
| 247924 | ? (To be deleted) |
| 376791 | Why? |
| 47035 | What? |
| 33561 | deleted |
| 147899 | Deleted. |
| 279869 | [removed] |
| 308701 | Not needed |
| 388799 | Deleted question. |

In [35]:
# Hand-picked qids of placeholder / meaningless questions; the text each id stands for
# is taken from the table in the markdown cell above.
remove_qids = [
    25026,   # "?"
    42085,   # "? to be deleted"
    376791,  # "Why?"
    47035,   # "What?"
    33561,   # "deleted"
    147899,  # "Deleted."
    279869,  # "[removed]"
    308701,  # "Not needed"
    487791,  # "To be deleted?"
    247924,  # "? (To be deleted)"
    388799,  # "Deleted question."
]

# Records the qids that need to be removed; later cells append further qids to this list

#### Remove questions with length less than 4

In [36]:
# Questions whose text is shorter than 4 characters
qs[qs.question.str.len() < 4]

Unnamed: 0,qid,question
3306,6553,.
13016,25026,?
23884,44699,HH
44619,80055,Na
86457,145814,Is?
108978,178936,i
109009,178982,Hh
115347,188110,o
158778,247989,A
169290,262028,111


In [37]:
# Character length of every question text, computed once for both steps below
question_lengths = qs["question"].str.len()
# Remember the qids of the questions shorter than 4 characters ...
remove_qids.extend(qs.loc[question_lengths < 4, "qid"].to_list())
# ... and keep only the questions with at least 4 characters in the question bank
qs = qs[question_lengths >= 4]

#### Remove further questions with meaningless text

In [38]:
# The 30 shortest questions that remain after the length filter above.
qs.sort_values("question", key=lambda x: x.str.len()).head(n=30) # Sorted the questions based on the length of the question

Unnamed: 0,qid,question
102512,169595,ok ?
180461,276676,spam
54029,95429,I'm
216861,323090,Aaas
208199,312129,Edit
164553,255663,Nana
208798,312898,Can?
236655,347631,Spam
109311,179423,What
260779,376791,Why?


In [39]:
# The 30 shortest remaining questions are treated as noise as well:
# remember their qids and take them out of the question bank.
shortest_qids = qs.sort_values("question", key=lambda x: x.str.len()).head(n=30)["qid"]
remove_qids += shortest_qids.to_list()
qs = qs[~qs.qid.isin(shortest_qids)]

#### Make the question bank and question pairs consistent after deleting questions

In [40]:
# Drop every flagged question from the question bank, and every pair that references a
# flagged qid on either side, so that the two tables stay consistent.
# One isin() mask per table gives the same rows as filtering once per qid in a Python
# loop, without re-scanning both frames for every flagged id.
qs = qs[~qs.qid.isin(remove_qids)]
q_pairs = q_pairs[~q_pairs.qid1.isin(remove_qids) & ~q_pairs.qid2.isin(remove_qids)]

#### Save the question bank and question pairs to staging data

In [41]:
# Write the pairs table (CSV) and the question bank (TSV) to the staging folder, without
# the pandas index. NOTE: the final save cell of this notebook writes to these same two
# paths again, so the files produced here are overwritten on a full run.
q_pairs.to_csv(os.path.join(STAGING_DATA_FOLDER, "question_pairs.csv"), index=False)
qs.to_csv(os.path.join(STAGING_DATA_FOLDER, "questions.tsv"), sep="\t", index=False)

# Saving to a file

#### The data is now in 3NF and consistent.

In [42]:
# ### Create Corpus

# We created corpus because, a corpus can be used to train a tokenizer for learnable embeddings in the later stages of the project.

# with open(os.path.join(STAGING_DATA_FOLDER, "corpus.txt"), "w") as fp:
#     for line in qs.iloc[:, 1].tolist():
#         line = line.replace("\n", "")
#         if line.endswith(" "):
#             fp.write(line)
#         else:
#             fp.write(line + " ")

### Tokenization

#### This is a collection of all the words/tokens in our dataset. This is useful for tokenization and embeddings.

#### Need to install package nltk (natural language tool kit). pip install nltk

In [43]:
# Question bank before any text normalisation.
qs.head(30)

Unnamed: 0,qid,question
0,1,What is the step by step guide to invest in sh...
0,2,What is the step by step guide to invest in sh...
1,3,What is the story of Kohinoor (Koh-i-Noor) Dia...
17296,4,What would happen if the Indian government sto...
2,5,How can I increase the speed of my internet co...
2,6,How can Internet speed be increased by hacking...
3,7,Why am I mentally very lonely? How can I solve...
3,8,Find the remainder when [math]23^{24}[/math] i...
4,9,"Which one dissolve in water quikly sugar, salt..."
4,10,Which fish would survive in salt water?


In [44]:
# Lower-case every question text
qs = qs.assign(question=qs["question"].str.lower())

In [45]:
# Check the lower-cased texts.
qs.head()

Unnamed: 0,qid,question
0,1,what is the step by step guide to invest in sh...
0,2,what is the step by step guide to invest in sh...
1,3,what is the story of kohinoor (koh-i-noor) dia...
17296,4,what would happen if the indian government sto...
2,5,how can i increase the speed of my internet co...


In [46]:
# Tokenization: each question string is replaced by the list of tokens
# that nltk's word_tokenize returns for it.

qs["question"] = qs["question"].map(word_tokenize)

In [47]:
# First rows of the tokenized question bank.
qs.head(30)

Unnamed: 0,qid,question
0,1,"[what, is, the, step, by, step, guide, to, inv..."
0,2,"[what, is, the, step, by, step, guide, to, inv..."
1,3,"[what, is, the, story, of, kohinoor, (, koh-i-..."
17296,4,"[what, would, happen, if, the, indian, governm..."
2,5,"[how, can, i, increase, the, speed, of, my, in..."
2,6,"[how, can, internet, speed, be, increased, by,..."
3,7,"[why, am, i, mentally, very, lonely, ?, how, c..."
3,8,"[find, the, remainder, when, [, math, ], 23^, ..."
4,9,"[which, one, dissolve, in, water, quikly, suga..."
4,10,"[which, fish, would, survive, in, salt, water, ?]"


In [48]:
# Last rows of the tokenized question bank.
qs.tail(30)

Unnamed: 0,qid,question
404256,537904,"[what, are, the, most, common, questions, aske..."
404262,537905,"[how, do, you, troubleshoot, a, toshiba, lapto..."
404263,537906,"[why, does, co2, contribute, more, to, global,..."
404264,537907,"[is, it, safe, to, store, an, external, batter..."
404264,537908,"[how, do, i, make, a, safe, and, cheap, power,..."
404268,537909,"[why, do, n't, we, still, do, great, music, li..."
404268,537910,"[should, i, raise, my, young, child, on, 80, '..."
404269,537911,"[how, do, you, diagnose, antisocial, personali..."
404270,537912,"[what, is, the, difference, between, who, and,..."
404271,537913,"[does, stalin, have, any, grandchildren, that,..."


### Removing Noise

### Removing Stop Words
#### Stop words carry little meaning on their own, so removing them keeps the words that preserve the original meaning of the sentence and discards the rest.

In [49]:
# English stop words as a set, so the membership tests in the filter below are fast.
stop_words = set(stopwords.words('english'))

In [50]:
# Peek at 30 stop words; a set has no defined order, so the sample can differ between runs.
list(stop_words)[:30]

['so',
 'very',
 're',
 'didn',
 'those',
 'its',
 'were',
 'whom',
 'there',
 "won't",
 'both',
 'through',
 'your',
 'than',
 'themselves',
 'yours',
 "she's",
 'being',
 'with',
 'and',
 'not',
 "aren't",
 'under',
 'am',
 'above',
 'have',
 'does',
 'again',
 'they',
 "shouldn't"]

In [51]:
def _without_stop_words(tokens):
    """Return the tokens of one question that are not in stop_words, in their original order."""
    return [token for token in tokens if token not in stop_words]


# Filter the token list of every question.
qs["question"] = qs["question"].map(_without_stop_words)

In [52]:
# Token lists after stop-word removal.
qs.head(30)

Unnamed: 0,qid,question
0,1,"[step, step, guide, invest, share, market, ind..."
0,2,"[step, step, guide, invest, share, market, ?]"
1,3,"[story, kohinoor, (, koh-i-noor, ), diamond, ?]"
17296,4,"[would, happen, indian, government, stole, koh..."
2,5,"[increase, speed, internet, connection, using,..."
2,6,"[internet, speed, increased, hacking, dns, ?]"
3,7,"[mentally, lonely, ?, solve, ?]"
3,8,"[find, remainder, [, math, ], 23^, {, 24, }, [..."
4,9,"[one, dissolve, water, quikly, sugar, ,, salt,..."
4,10,"[fish, would, survive, salt, water, ?]"


In [53]:
# Column 1 is "question"; show the first five token lists as plain Python lists.
qs.iloc[:, 1].tolist()[:5]

[['step', 'step', 'guide', 'invest', 'share', 'market', 'india', '?'],
 ['step', 'step', 'guide', 'invest', 'share', 'market', '?'],
 ['story', 'kohinoor', '(', 'koh-i-noor', ')', 'diamond', '?'],
 ['would',
  'happen',
  'indian',
  'government',
  'stole',
  'kohinoor',
  '(',
  'koh-i-noor',
  ')',
  'diamond',
  'back',
  '?'],
 ['increase', 'speed', 'internet', 'connection', 'using', 'vpn', '?']]

### Remove questions that are potentially meaningless now

In [54]:
# Questions left with fewer than 2 tokens after tokenization and stop-word removal
# (.str.len() on a column of lists gives the length of each list)
qs[qs.question.str.len() < 2]

Unnamed: 0,qid,question
493,984,[study]
151378,13922,[?]
9581,18608,[?]
9581,18609,[?]
29040,53770,[?]
29040,53771,[?]
35641,65095,[?]
35641,65096,[?]
39405,71436,[]
42947,77322,[?]


In [55]:
# qids of the questions left with fewer than 2 tokens after stop-word removal
short_qids = qs[qs.question.str.len() < 2]["qid"].to_list()
# Remove them from the question bank, and remove every pair that references one of them.
# One isin() mask per table gives the same rows as filtering once per qid in a Python
# loop, without re-scanning both frames for every id.
qs = qs[~qs.qid.isin(short_qids)]
q_pairs = q_pairs[~q_pairs.qid1.isin(short_qids) & ~q_pairs.qid2.isin(short_qids)]

### Create vocabulary

The vocabulary is created because these words will be used to look up the corresponding vectors in pre-trained embeddings such as GloVe and fastText.

In [56]:
# Distinct tokens across the whole question bank, in sorted order
sorted_tokens = sorted({token for line in qs.iloc[:, 1].tolist() for token in line})
# ids should start with 1, 0 is reserved for padding
vocab = pd.DataFrame({"id": range(1, len(sorted_tokens) + 1), "token": sorted_tokens})
# The vocabulary goes to the clean data folder, without the pandas index
vocab.to_csv(os.path.join(CLEAN_DATA_FOLDER, "vocab.tsv"), sep="\t", index=False)

In [57]:
# First vocabulary entries; ids start at 1.
vocab.head()

Unnamed: 0,id,token
0,1,!
1,2,#
2,3,$
3,4,%
4,5,&


### Convert question tokens to IDs

In [58]:
# Lookup from token/word to its id, so that each token in a question can be replaced by
# its id. Zipping the two columns builds the same dictionary as walking the frame with
# iterrows(), without creating a Series object for every vocabulary row.
vocab_mapping = dict(zip(vocab["token"], vocab["id"]))
# Replace every token list by the list of the corresponding ids
qs["question"] = qs["question"].map(lambda x: [vocab_mapping[token] for token in x])

### Save the cleaned questions

In [59]:
# Save the question bank (questions now stored as lists of token ids) and the pairs table.
# NOTE(review): both files go to the staging folder, at the same paths the earlier staging
# save cell wrote to, so those files are overwritten; the commented-out line below shows
# the clean data folder was the previous target for questions.tsv - confirm this is intended.
# qs.to_csv(os.path.join(CLEAN_DATA_FOLDER, "questions.tsv"), sep="\t", index=False)
qs.to_csv(os.path.join(STAGING_DATA_FOLDER, "questions.tsv"), sep="\t", index=False) # updated
q_pairs.to_csv(os.path.join(STAGING_DATA_FOLDER, "question_pairs.csv"), index=False)

In [60]:
# Lemmatization - Not sure we should do this because it changes some words into words with a different meaning
# lemmatizer = WordNetLemmatizer()
# clean_qs["question"] = clean_qs["question"].map(lambda row : [lemmatizer.lemmatize(x) for x in row])
# clean_qs.head(30)
# clean_qs.tail(30)
#### We can see after performing lemmatization the words got changed to their root form like,

##### 1. physics - physic however Physics was unchanged
##### 2. slits - slit
##### 3. keeps - keep
##### 4. was - wa
##### 5. laws - law
##### 6. questions - question
##### 7. does - doe