In [3]:
import os
import pandas as pd

from datasets import load_dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


# Dataset description

Where else but Quora can a physicist help a chef with a math problem and get cooking tips in return? 

Quora is **a place to gain and share knowledge—about anything**. It’s a platform to ask questions and connect with people who contribute unique insights and quality answers. This empowers people to learn from each other and to better understand the world.

Over 100 million people visit Quora every month, so it's no surprise that many people ask similarly worded questions. Multiple questions with the same intent can cause seekers to spend more time finding the best answer to their question, and make writers feel they need to answer multiple versions of the same question. Quora values canonical questions because they provide a better experience to active seekers and writers, and offer more value to both of these groups in the long term.

Currently, Quora uses a Random Forest model to identify duplicate questions. In this competition, Kagglers are challenged to tackle this natural language processing problem by applying advanced techniques to classify whether question pairs are duplicates or not. Doing so will make it easier to find high-quality answers to questions, resulting in an improved experience for Quora writers, seekers, and readers.

In [4]:
# Load the Quora question-pairs dataset through Hugging Face `datasets`.
# The first call downloads and prepares the data; subsequent calls reuse the local cache.
dataset = load_dataset("quora")

Downloading builder script: 2.38kB [00:00, 734kB/s]                                                       
Downloading metadata: 1.13kB [00:00, 729kB/s]                                                             
Using custom data configuration default


Downloading and preparing dataset quora/default (download: 55.48 MiB, generated: 55.46 MiB, post-processed: Unknown size, total: 110.94 MiB) to /Users/lea.naccache/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04...


Downloading data files:   0%|                                                       | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                                                       | 0.00/58.2M [00:00<?, ?B/s][A
Downloading data:   0%|                                               | 92.2k/58.2M [00:00<01:05, 887kB/s][A
Downloading data:   1%|▏                                              | 296k/58.2M [00:00<00:40, 1.41MB/s][A
Downloading data:   1%|▍                                              | 521k/58.2M [00:00<00:32, 1.77MB/s][A
Downloading data:   1%|▌                                              | 750k/58.2M [00:00<00:29, 1.95MB/s][A
Downloading data:   2%|▊                                              | 946k/58.2M [00:00<00:30, 1.89MB/s][A
Downloading data:   2%|▉                                             | 1.19M/58.2M [00:00<00:27, 2.06MB/s][A
Downloading data:   2%|█                                             | 1.40M/58.2M [00:00<00:29, 1.90MB/s][A
Downloading d

Downloading data:  25%|███████████▍                                  | 14.5M/58.2M [00:08<00:25, 1.69MB/s][A
Downloading data:  25%|███████████▌                                  | 14.7M/58.2M [00:08<00:25, 1.72MB/s][A
Downloading data:  26%|███████████▊                                  | 14.9M/58.2M [00:08<00:24, 1.75MB/s][A
Downloading data:  26%|███████████▉                                  | 15.1M/58.2M [00:08<00:24, 1.76MB/s][A
Downloading data:  26%|████████████                                  | 15.2M/58.2M [00:08<00:24, 1.77MB/s][A
Downloading data:  27%|████████████▏                                 | 15.4M/58.2M [00:09<00:23, 1.83MB/s][A
Downloading data:  27%|████████████▎                                 | 15.6M/58.2M [00:09<00:23, 1.83MB/s][A
Downloading data:  27%|████████████▌                                 | 15.8M/58.2M [00:09<00:23, 1.84MB/s][A
Downloading data:  28%|████████████▋                                 | 16.0M/58.2M [00:09<00:23, 1.83MB/s][A
Downloadin

Downloading data:  52%|███████████████████████▋                      | 30.0M/58.2M [00:16<00:13, 2.15MB/s][A
Downloading data:  52%|███████████████████████▉                      | 30.2M/58.2M [00:16<00:13, 2.12MB/s][A
Downloading data:  52%|████████████████████████                      | 30.4M/58.2M [00:16<00:12, 2.15MB/s][A
Downloading data:  53%|████████████████████████▏                     | 30.7M/58.2M [00:16<00:12, 2.13MB/s][A
Downloading data:  53%|████████████████████████▍                     | 30.9M/58.2M [00:16<00:12, 2.15MB/s][A
Downloading data:  53%|████████████████████████▌                     | 31.1M/58.2M [00:16<00:12, 2.14MB/s][A
Downloading data:  54%|████████████████████████▊                     | 31.3M/58.2M [00:16<00:12, 2.13MB/s][A
Downloading data:  54%|████████████████████████▉                     | 31.5M/58.2M [00:16<00:12, 2.13MB/s][A
Downloading data:  55%|█████████████████████████                     | 31.7M/58.2M [00:16<00:12, 2.11MB/s][A
Downloadin

Downloading data:  79%|████████████████████████████████████▌         | 46.2M/58.2M [00:23<00:05, 2.13MB/s][A
Downloading data:  80%|████████████████████████████████████▋         | 46.4M/58.2M [00:23<00:05, 2.15MB/s][A
Downloading data:  80%|████████████████████████████████████▊         | 46.6M/58.2M [00:23<00:05, 2.13MB/s][A
Downloading data:  80%|█████████████████████████████████████         | 46.8M/58.2M [00:24<00:05, 2.13MB/s][A
Downloading data:  81%|█████████████████████████████████████▏        | 47.0M/58.2M [00:24<00:05, 2.10MB/s][A
Downloading data:  81%|█████████████████████████████████████▎        | 47.3M/58.2M [00:24<00:05, 2.14MB/s][A
Downloading data:  82%|█████████████████████████████████████▌        | 47.5M/58.2M [00:24<00:05, 2.13MB/s][A
Downloading data:  82%|█████████████████████████████████████▋        | 47.7M/58.2M [00:24<00:04, 2.12MB/s][A
Downloading data:  82%|█████████████████████████████████████▉        | 47.9M/58.2M [00:24<00:04, 2.13MB/s][A
Downloadin

Dataset quora downloaded and prepared to /Users/lea.naccache/.cache/huggingface/datasets/quora/default/0.0.0/36ba4cd42107f051a158016f1bea6ae3f4685c5df843529108a54e42d86c1e04. Subsequent calls will reuse this data.


100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.00it/s]


In [5]:
# Check which container type `load_dataset` returned.
type(dataset)

datasets.dataset_dict.DatasetDict

In [6]:
# List the splits available in the loaded dataset.
dataset.keys()

dict_keys(['train'])

In [7]:
# Show the summary (feature names and row count) of the "train" split.
dataset["train"]

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 404290
})

In [8]:
# Look at one raw record to see how the question pair is nested.
dataset["train"][0]

{'questions': {'id': [1, 2],
  'text': ['What is the step by step guide to invest in share market in india?',
   'What is the step by step guide to invest in share market?']},
 'is_duplicate': False}

In [9]:
# Flatten the nested `questions` field into one flat table:
# one id column and one text column per question, plus the 0/1 label.
raw_df = dataset["train"].to_pandas()

# Each `questions` entry holds an `id` pair and a `text` pair.
questions_df = pd.DataFrame(raw_df["questions"].tolist())
id_pairs = pd.DataFrame(questions_df["id"].tolist())
text_pairs = pd.DataFrame(questions_df["text"].tolist())

# Cast the boolean label to an integer.
labels = raw_df["is_duplicate"].astype(int)

df = pd.concat([id_pairs, text_pairs, labels], axis=1)
df.columns = ["q1_id", "q2_id", "q1_text", "q2_text", "is_duplicate"]

df.head()

Unnamed: 0,q1_id,q2_id,q1_text,q2_text,is_duplicate
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [10]:
# Split the question pairs into two equally sized halves, stratified on the
# label so both halves keep the same share of duplicates; the fixed
# random_state makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.5,
    stratify=df["is_duplicate"],
    random_state=42,
)

In [11]:
# Sanity-check the stratification: the share of duplicates should be
# nearly identical in both halves.
train_dup_rate = train_df["is_duplicate"].mean()
test_dup_rate = test_df["is_duplicate"].mean()

print("Proportion of duplicates in TRAIN:", format(train_dup_rate, ".4%"))
print("Proportion of duplicates in TEST:", format(test_dup_rate, ".4%"))

Proportion of duplicates in TRAIN: 36.9200%
Proportion of duplicates in TEST: 36.9195%


In [12]:
# Preview the first rows of the training half.
train_df.head()

Unnamed: 0,q1_id,q2_id,q1_text,q2_text,is_duplicate
171789,265395,265396,Which is more dangerous the far right or the f...,"What is far-right, centre-left, labor etc. and...",0
165943,236311,257592,"How did you run $1,000 into $100,000 in cattle...",How much capital do you need to buy enough sto...,0
355458,484691,484692,Is there any word that can be used to describe...,"In which languages, other than Hebrew is the w...",0
68808,29461,85270,Why do children choose parents teach swim?,Who need to teach children swim?,1
230214,28318,110667,"If the universe is everything, and scientists ...",They say that the universe is forever expandin...,1


In [13]:
# Preview the first rows of the test half.
test_df.head()

Unnamed: 0,q1_id,q2_id,q1_text,q2_text,is_duplicate
240472,214176,44437,What are the strongest majors in terms of job ...,What are the strongest majors in terms of job ...,0
231115,340838,340839,How can I stop binge eating without outside he...,How does one stop binge eating without using p...,1
36276,66161,66162,I am B Tech holder in biotechnology. I need a ...,Can we Create meeting room reservation in Shar...,0
350020,54397,220620,What is the best tank in World of Tanks?,"What is the best tank in World of Tanks, and why?",1
228348,267316,337401,"What is the meaning of ""You are pretty, yourse...",What does it mean to cut yourself short? What'...,0


In [14]:
# Make sure the output directory exists. `exist_ok=True` replaces the
# check-then-create pattern, so the cell is safe to re-run and does not fail
# if the directory appears between the check and the creation.
os.makedirs("../data", exist_ok=True)

# Persist both halves; index=False keeps the shuffled row index out of the CSVs.
train_df.to_csv("../data/train.csv", index=False)
test_df.to_csv("../data/test.csv", index=False)