In [1]:
%run setup.py  ← adds project root to sys.path
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from pathlib import Path

# 1. Load full dataset
df = pd.read_csv("../data/quora.csv")

# 2. Build a *group* key so that a physical question
#    (qid appears as qid1 or qid2) cannot leak across folds.
#
#    min(qid1, qid2) is NOT sufficient: the pairs (1, 5) and (5, 7) would
#    receive keys 1 and 5, so question 5 could end up in two different
#    splits.  Instead, treat every row as an undirected edge qid1 -- qid2
#    and use the connected component of the resulting question graph as
#    the group: all pairs that share a question, directly or through a
#    chain of other pairs, then carry the same key.
#
#    NOTE(review): if the graph contains very large components, the
#    realised split sizes can drift from the nominal 80/10/10 -- check
#    the row counts of the resulting frames.
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components

if df[["qid1", "qid2"]].isna().any().any():
    raise ValueError("qid1/qid2 contain missing values; cannot build group keys")

n_pairs = len(df)

# Map raw question ids to dense node indices 0..n_nodes-1.  Both columns are
# factorized together so the same qid gets the same index in either column.
node_codes, node_ids = pd.factorize(
    pd.concat([df["qid1"], df["qid2"]], ignore_index=True)
)
src, dst = node_codes[:n_pairs], node_codes[n_pairs:]

# One edge per row; boolean data keeps repeated pairs from overflowing
# when duplicate entries are merged.
pair_graph = coo_matrix(
    (np.ones(n_pairs, dtype=bool), (src, dst)),
    shape=(len(node_ids), len(node_ids)),
)
_, component_of_node = connected_components(pair_graph, directed=False)

# Both endpoints of a row are in the same component, so either one works.
df["group"] = component_of_node[src]

# 3. Group-aware 80 / 10 / 10 partition (train / valid / test), done in two
#    stages: first hold out 20 % as a temporary set, then halve that hold-out
#    into validation and test.  Per the scikit-learn docs, test_size is a
#    proportion of *groups*, so the row-level ratios are only approximate.
gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, temp_idx = next(
    gss.split(X=df, y=df["is_duplicate"], groups=df["group"])
)

# Positional selection, then a fresh 0..n-1 index on each part.
df_train, df_temp = (
    df.iloc[rows].reset_index(drop=True) for rows in (train_idx, temp_idx)
)

# Second stage works on the re-indexed hold-out, so the returned positions
# refer to df_temp, not to df.
gss2 = GroupShuffleSplit(test_size=0.5, n_splits=1, random_state=42)
valid_idx, test_idx = next(
    gss2.split(X=df_temp, y=df_temp["is_duplicate"], groups=df_temp["group"])
)
df_valid, df_test = (
    df_temp.iloc[rows].reset_index(drop=True) for rows in (valid_idx, test_idx)
)

# 4. Save each split as a CSV (without the index column) under ../data/splits,
#    creating the directory, and any missing parents, if needed.
out_dir = Path("../data/splits")
out_dir.mkdir(exist_ok=True, parents=True)

for filename, frame in (
    ("train.csv", df_train),
    ("valid.csv", df_valid),
    ("test.csv", df_test),
):
    frame.to_csv(out_dir / filename, index=False)

# Lists everything currently in the output directory, not only the files
# written above.
print("Saved:", list(out_dir.iterdir()))


Saved: [PosixPath('../data/splits/train.csv'), PosixPath('../data/splits/valid.csv'), PosixPath('../data/splits/test.csv')]
