# Splitting into train, dev, and test sets

Splits are defined according to criteria in `20210527_initial_eda.ipynb`:

- remove books with fewer than 5 ($< 5$) ratings
- remove users with fewer than 5 ($< 5$) ratings
- training set ranges from August 2006 to June 2017
- dev set ranges from July to August 2017
- test set ranges from September to October 2017

In [1]:
from datetime import date
import gzip
import json
from pathlib import Path

import pandas as pd

from recreads.data.load import load_ratings_raw

In [2]:
# Directory holding the raw ratings data, relative to the notebook location.
input_dir = Path("..") / "data" / "raw" / "20210527_initial_data_children_biography"

In [3]:
# Genres to process; "history_biography" is currently disabled.
genres = [
    "children",
    # "history_biography",
]

In [4]:
# Filtering threshold and split boundaries are identical for every genre,
# so they are defined once instead of on each loop iteration.
rating_cutoff = 5
dev_cutoff_date = date(year=2017, month=7, day=1)
test_cutoff_date = date(year=2017, month=9, day=1)

for genre in genres:
    # Load the ratings of the genre currently being processed. (Previously
    # this always loaded genres[0], so every iteration would have re-read the
    # first genre while reporting the numbers under the current genre's name.)
    ratings = load_ratings_raw(data_dir=input_dir, genre=genre)

    # Counts before filtering, used for the summary printed below.
    complete_user_count = len(ratings["User_id"].unique())
    complete_book_count = len(ratings["Book_id"].unique())

    # Remove books with < rating_cutoff ratings.
    nrating_per_book = ratings.groupby("Book_id").size()
    remove_books = nrating_per_book.index[nrating_per_book < rating_cutoff]
    ratings = ratings.loc[~ratings["Book_id"].isin(remove_books), :]

    # Remove users with < rating_cutoff ratings. This runs once, after the
    # book filter, so user counts are based on the already-reduced ratings;
    # the book filter is not re-applied afterwards.
    nrating_per_user = ratings.groupby("User_id").size()
    remove_users = nrating_per_user.index[nrating_per_user < rating_cutoff]
    ratings = ratings.loc[~ratings["User_id"].isin(remove_users), :]

    remaining_user_count = len(ratings["User_id"].unique())
    remaining_book_count = len(ratings["Book_id"].unique())

    print(f"{genre}: {remaining_book_count} of {complete_book_count} ({remaining_book_count/complete_book_count:%}) books remaining.")
    print(f"{genre}: {remaining_user_count} of {complete_user_count} ({remaining_user_count/complete_user_count:%}) users remaining.")

    # Temporal split into train, dev, test: train is everything before
    # dev_cutoff_date, dev is [dev_cutoff_date, test_cutoff_date), test is
    # everything from test_cutoff_date onwards.
    # NOTE(review): assumes the "Month" column holds values comparable with
    # datetime.date -- confirm against load_ratings_raw.
    is_train = ratings["Month"].apply(lambda m: m < dev_cutoff_date)
    is_dev = ratings["Month"].apply(lambda m: dev_cutoff_date <= m < test_cutoff_date)
    is_test = ratings["Month"].apply(lambda m: test_cutoff_date <= m)

    # Every rating must land in one of the three splits.
    assert is_train.sum() + is_dev.sum() + is_test.sum() == len(ratings)

    # Select each split, then shuffle its rows and renumber the index.
    # NOTE: the shuffle is unseeded, so row order differs between runs.
    train_ratings_df = ratings.loc[is_train, :].sample(frac=1).reset_index(drop=True)
    dev_ratings_df = ratings.loc[is_dev, :].sample(frac=1).reset_index(drop=True)
    test_ratings_df = ratings.loc[is_test, :].sample(frac=1).reset_index(drop=True)

TODO: # ratings, # books, # users; check; save