# Splitting into train, dev, and test set

Splits are defined according to criteria in `20210527_initial_eda.ipynb`:

- remove books with fewer than 5 ratings ($< 5$)
- remove users with fewer than 5 ratings ($< 5$)
- training set ranges from August 2006 to June 2017
- dev set ranges from July to August 2017
- test set ranges from September to October 2017

Basic statistics about the different splits are printed at the end of the notebook.

In [1]:
from datetime import date
import gzip
import json
from pathlib import Path

import pandas as pd

from recreads.data.load import load_ratings_raw

In [2]:
# Directory that the raw ratings of each genre are loaded from.
input_dir = Path("../data/raw/20210527_initial_data_children_biography/")

# Directory that the per-genre train/dev/test parquet files are written to.
# It is created on first run, together with any missing parent directories.
output_dir = Path("../data/interim/20210602_initial_data_children_biography_train_dev_test/")
output_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Genres whose ratings are loaded and split, processed one after the other.
genres = [
    "children",
    "history_biography",
]

In [4]:
# Books and users with fewer ratings than this are dropped before splitting.
rating_cutoff = 5

# First day of the dev period and of the test period. Ratings before
# dev_cutoff_date are training data; the test period has no upper bound.
# NOTE(review): the notebook intro says the test set ends in October 2017 --
# presumably the raw data simply ends there; confirm.
dev_cutoff_date = date(year=2017, month=7, day=1)
test_cutoff_date = date(year=2017, month=9, day=1)

# Fixed seed for the per-split row shuffle, so that re-running the notebook
# on the same input reproduces the same row order in the written files.
shuffle_seed = 42


def _drop_below_cutoff(frame: pd.DataFrame, id_column: str, cutoff: int) -> pd.DataFrame:
    """Drop the rows of ``frame`` whose ``id_column`` value occurs fewer than ``cutoff`` times.

    Args:
        frame: Ratings table with one row per rating.
        id_column: Name of the column holding the id to count ratings for.
        cutoff: Minimum number of rows an id needs in order to be kept.

    Returns:
        The remaining rows of ``frame``; the original index is preserved.
    """
    ratings_per_id = frame.groupby(id_column).size()
    rare_ids = ratings_per_id.index[ratings_per_id < cutoff]
    return frame.loc[~frame[id_column].isin(rare_ids), :]


for genre in genres:
    ratings = load_ratings_raw(data_dir=input_dir, genre=genre)

    complete_user_count = ratings["User_id"].nunique(dropna=False)
    complete_book_count = ratings["Book_id"].nunique(dropna=False)

    # Single pass each, books first and users second. Removing users can push
    # a book's rating count below the cutoff again; the filter is deliberately
    # not repeated, matching the criteria stated at the top of the notebook.
    ratings = _drop_below_cutoff(ratings, "Book_id", rating_cutoff)
    ratings = _drop_below_cutoff(ratings, "User_id", rating_cutoff)

    remaining_user_count = ratings["User_id"].nunique(dropna=False)
    remaining_book_count = ratings["Book_id"].nunique(dropna=False)

    print(f"{genre}: {remaining_book_count} of {complete_book_count} ({remaining_book_count/complete_book_count:%}) books remaining.")
    print(f"{genre}: {remaining_user_count} of {complete_user_count} ({remaining_user_count/complete_user_count:%}) users remaining.")

    # temporal split into train, dev, test
    # NOTE(review): "Month" presumably holds datetime.date values, which is why
    # the comparison is done element-wise instead of vectorised -- confirm
    # against load_ratings_raw before simplifying.
    is_train = ratings["Month"].apply(lambda m: m < dev_cutoff_date)
    is_dev = ratings["Month"].apply(lambda m: dev_cutoff_date <= m < test_cutoff_date)
    is_test = ratings["Month"].apply(lambda m: test_cutoff_date <= m)

    # Every rating must land in exactly one split; a missing "Month" value
    # would match none of the three masks and trip this check.
    assert is_train.sum() + is_dev.sum() + is_test.sum() == len(ratings), (
        f"{genre}: train/dev/test masks do not cover every rating exactly once"
    )

    # Shuffle the rows of each split with a fixed seed and renumber them.
    train_ratings_df = ratings.loc[is_train, :].sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    dev_ratings_df = ratings.loc[is_dev, :].sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)
    test_ratings_df = ratings.loc[is_test, :].sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

    # print basic statistics over splits and save to disk
    datasets = ["train", "dev", "test"]
    dfs = [train_ratings_df, dev_ratings_df, test_ratings_df]
    for ds, df in zip(datasets, dfs):
        # The local names below appear verbatim in the printed line (f"{name=}").
        user_count = df["User_id"].nunique(dropna=False)
        book_count = df["Book_id"].nunique(dropna=False)
        rating_count = len(df)
        # Share of the user x book matrix that is filled; undefined (NaN) for an
        # empty split instead of raising ZeroDivisionError.
        rated_fraction = rating_count/(user_count * book_count) if rating_count else float("nan")
        print(f"{genre} {ds}: {book_count=} {user_count=} {rating_count=} {rated_fraction=}")
        out_path = output_dir / f"{genre}-{ds}.parquet"
        df.to_parquet(out_path, compression="snappy")
    print("----")

children: 71155 of 122741 (57.971664%) books remaining.
children: 195318 of 462164 (42.261621%) users remaining.
children train: book_count=70741 user_count=193332 rating_count=5561568 rated_fraction=0.0004066514387285944
children dev: book_count=26275 user_count=22109 rating_count=109579 rated_fraction=0.00018863206036659732
children test: book_count=24753 user_count=20056 rating_count=94836 rated_fraction=0.00019102977545795057
----
history_biography: 164563 of 299484 (54.948845%) books remaining.
history_biography: 403225 of 660364 (61.061021%) users remaining.
history_biography train: book_count=163304 user_count=399234 rating_count=11012897 rated_fraction=0.00016891850723214463
history_biography dev: book_count=61618 user_count=95387 rating_count=286669 rated_fraction=4.877350243938103e-05
history_biography test: book_count=55771 user_count=84206 rating_count=233790 rated_fraction=4.978224313343219e-05
----
