# Dataset splitter

### Boilerplate 

In [11]:
import math
import json
import numpy as np
import pandas as pd

from datetime import datetime
from pprint import pprint
from os.path import join
from pathlib import Path

def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    # From: https://stackoverflow.com/a/38251063/5099361
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

## > Params

In [12]:
dataset_info = {}
use_kaggle_test, drop_purchases = True, True
ratio_train = 0.7
ratio_eval = 0.2
ratio_test = 0.1

kaggle_train_path = "/home/jupyter/mlspec-blackfriday/dataset/raw/train.csv"
kaggle_test_path = "/home/jupyter/mlspec-blackfriday/dataset/raw/test.csv"

output_path = "/home/jupyter/mlspec-blackfriday/dataset/parsed/" # + <timestamp>

assert math.isclose(ratio_train + ratio_eval + ratio_test, 1.0), "Ratio must have sum equal to 1"\

## Load the data

In [13]:
df_train=pd.read_csv(kaggle_train_path)
df_test=pd.read_csv(kaggle_test_path)

dataset_info["kaggle_train_path"] = kaggle_train_path
dataset_info["kaggle_test_path"] = kaggle_test_path

print(f"df_train shape:{df_train.shape}")
print(f"df_test shape:{df_test.shape}")

df_train shape:(550068, 12)
df_test shape:(233599, 11)


## Prepare the dataset

In [14]:
df = df_train.append(df_test) if use_kaggle_test else df_train
df = df.drop("Purchase", 1) if drop_purchases else df

dataset_info["use_kaggle_test"] = use_kaggle_test
dataset_info["dataset_len"] = len(df)
dataset_info["drop_purchases"] = drop_purchases

print(f"Dataset total len: {len(df)}")

Dataset total len: 783667


## Shuffle & split

In [15]:
df_train, df_eval, df_test = train_validate_test_split(df, ratio_train, ratio_eval, seed=42) # shuffle also

dataset_info["ratio_train"] = ratio_train
dataset_info["ratio_eval"] = ratio_eval
dataset_info["ratio_test"] = ratio_test

print(f"df_train len: {len(df_train):,}")
print(f"df_eval len: {len(df_eval):,}")
print(f"df_test len: {len(df_test):,}")

df_train len: 548,566
df_eval len: 156,733
df_test len: 78,368


## Store

In [16]:
timestamp = datetime.today().strftime('%Y%m%d%H%M')
output_folder = join(output_path, timestamp)
Path(output_folder).mkdir(parents=True, exist_ok=True)

out_train = join(output_folder, "train.csv") 
out_eval = join(output_folder, "eval.csv")
out_test = join(output_folder, "test.csv")
out_info = join(output_folder, "info.json")

df_train.to_csv(out_train)
df_eval.to_csv(out_eval)
df_test.to_csv(out_test)
open(out_info, "w").write(json.dumps(dataset_info, indent=2, sort_keys=True))

304

### Utility: sync with GCS

In [17]:
!bash /home/jupyter/mlspec-blackfriday/dataset/sync.sh

Copying './parsed/' folder to GCS path 'gs://mlteam-ml-specialization-2021-blackfriday/dataset/'
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104191544/test.csv
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104191544/eval.csv
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/info.json
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/eval.csv
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/train.csv
Skipping existing item: gs://mlteam-ml-specialization-2021-blackfriday/dataset/parsed/202104130952/.ipynb_checkpoints/info-checkpoint.json
Copying file://./parsed/202104191549/eval.csv [Content-Type=text/csv]...
Copying file://./parsed/202104191549/test.csv [Content-Type=text/csv]...        
Copying file://./parsed/202104191549/info.json [Content-Type=applica

#### ~ End