In [None]:
!mkdir data

In [1]:
!wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv -P data

--2021-03-21 13:34:07--  http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv
Resolving qim.fs.quoracdn.net (qim.fs.quoracdn.net)... 151.101.9.2
Connecting to qim.fs.quoracdn.net (qim.fs.quoracdn.net)|151.101.9.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 58176133 (55M) [text/tab-separated-values]
Saving to: ‘data/quora_duplicate_questions.tsv’


2021-03-21 13:34:11 (16.6 MB/s) - ‘data/quora_duplicate_questions.tsv’ saved [58176133/58176133]



In [32]:
import warnings

import pandas as pd


def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load the sentence pairs carrying a given label from a tab-separated file.

    Args:
        file_path: Path of the TSV file to read.
        input_text_column: Name of the column that becomes ``sentence1``.
        target_text_column: Name of the column that becomes ``sentence2``.
        label_column: Name of the column compared against ``keep_label``.
        keep_label: Only rows whose label equals this value are kept.

    Returns:
        A DataFrame holding exactly the columns ``sentence1`` and
        ``sentence2``. The row index of the source file is preserved.
    """
    try:
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0.
        # ``on_bad_lines="warn"`` is the equivalent of the former
        # ``error_bad_lines=False`` (malformed rows are reported and skipped).
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not accept ``on_bad_lines``; use the legacy flag.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "sentence1", target_text_column: "sentence2"}
    )
    df = df[["sentence1", "sentence2"]]

    return df


def clean_unnecessary_spaces(out_string):
    """Remove the spaces a tokenizer leaves around punctuation and contractions.

    A non-string input triggers a warning and is converted with ``str()``
    before cleaning. The cleaned string is returned.
    """
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)

    # Applied strictly in this order; each pair is (spaced form, compact form).
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, compact in substitutions:
        out_string = out_string.replace(spaced, compact)
    return out_string

In [35]:
from sklearn.model_selection import train_test_split

# Keep only the pairs labelled as duplicates (is_duplicate == 1, the default keep_label).
df = load_data(
    "data/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
)
# Fix the seed so the train/test split (and the files written below) is the
# same on every run of the notebook.
q_train, q_test = train_test_split(df, random_state=42)

# The index is written as the first column of each TSV.
q_train.to_csv("data/quora_train.tsv", sep="\t")
q_test.to_csv("data/quora_test.tsv", sep="\t")

In [31]:
# Preview the first rows of the training split.
q_train.head()

Unnamed: 0,sentence1,sentence2
208006,What are the best books for IIT JAM math and w...,What are the best books for preparing for the ...
353720,How do I know if I'm pregnant?,How do you know I'd your pregnant?
221132,What are some crazy and hard riddles?,What are the crazy riddles you know?
360995,Do Nazis still exist?,Are there still Nazis?
374244,How do I survive in Phd?,How do I survive PhD?


In [20]:

# The code block above only needs to be run once.
# After that, the two lines below are sufficient to load the Quora dataset.
# (index_col=0 restores the index that was written as the first TSV column.)

# q_train = pd.read_csv("data/quora_train.tsv", sep="\t", index_col=0)
# q_test = pd.read_csv("data/quora_test.tsv", sep="\t", index_col=0)

# Drop pairs with a missing question before cleaning.
q_train = q_train.dropna()
q_test = q_test.dropna()

# Clean BOTH text columns of BOTH splits. Previously sentence1 was cleaned
# twice on train and sentence2 twice on test, leaving the other columns raw.
for split in (q_train, q_test):
    for column in ("sentence1", "sentence2"):
        split[column] = split[column].apply(clean_unnecessary_spaces)

In [27]:
import os

# Directory, under the current working directory, that receives the CSV exports.
save_path = os.path.join(os.getcwd(), "data")

# Destination paths for the train and test CSV exports.
q_train_save = os.path.join(save_path, "q_train.csv")
q_test_save = os.path.join(save_path, "q_test.csv")

In [34]:
# Write each split to its CSV destination, omitting the index column.
for frame, destination in ((q_train, q_train_save), (q_test, q_test_save)):
    frame.to_csv(destination, index=False)

In [36]:
# Take the first 100 rows (by position) of each split as a small sample.
small_q_train = q_train.iloc[:100]
small_q_test = q_test.iloc[:100]

In [39]:
# Destination paths for the small train/test samples, inside save_path.
small_q_train_save, small_q_test_save = (
    os.path.join(save_path, file_name)
    for file_name in ("small_q_train.csv", "small_q_test.csv")
)

In [40]:
# Write the small samples without the index column so they share the schema
# (sentence1, sentence2) of the full q_train.csv / q_test.csv exports.
small_q_train.to_csv(small_q_train_save, index=False)
small_q_test.to_csv(small_q_test_save, index=False)