# Train-Test-Validation Split

In this notebook, we split the data into train, test, and validation sets. The split follows the one performed by chainRec, so that each interaction lands in the same set as it does there.

In [1]:
import json
import os
import random
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Seed the standard-library and NumPy global random number generators.
random.seed(42)
np.random.seed(42)

### Loading Data

In [2]:
# Directory that holds the interaction CSVs read and written by this notebook.
OUTPUT_DATA_DIR = "../output_data/"

# Create the directory if it is missing; `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [3]:
# Read the three interaction files, in training / validation / testing order.
train_df, val_df, test_df = (
    pd.read_csv(OUTPUT_DATA_DIR + csv_name)
    for csv_name in (
        "interactions_training.csv",
        "interactions_validation.csv",
        "interactions_testing.csv",
    )
)

# Stack the three frames row-wise into a single interactions table.
full_interactions = pd.concat([train_df, val_df, test_df])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Loading Mappings

We map user IDs and book IDs to the integer numbers used by chainRec, so that our identifiers are consistent with the chainRec mapping.

In [4]:
def load_mapping(mapping_file, mapping_dir="../mappings"):
    """Loads the mapping from `mapping_file`.

    Parameters
    ----------
    mapping_file: str
        The name of the mapping file to import, without the
        ``.csv`` extension.
    mapping_dir: str, optional
        The directory containing the mapping files. Defaults to
        ``"../mappings"``.

    Returns
    -------
    pd.DataFrame
        The DataFrame created from the mapping.

    """
    mapping_path = os.path.join(mapping_dir, "{}.csv".format(mapping_file))
    return pd.read_csv(mapping_path)

In [5]:
# Lookup tables used below to attach numeric IDs to the interactions.
user_map = load_mapping(mapping_file="user_map")
book_map = load_mapping(mapping_file="book_map")

In [6]:
# Render `book_id` as a string in both frames so the join key has the same
# representation on each side of the merge on `book_id`.
full_interactions['book_id'] = full_interactions['book_id'].map(str)
book_map['book_id'] = book_map['book_id'].map(str)

In [7]:
# Attach the user mapping, then the book mapping; left joins keep every
# interaction row even when a mapping entry is missing.
full_interactions = (
    full_interactions
    .merge(user_map, how="left", on=["user_id"])
    .merge(book_map, how="left", on=["book_id"])
)

In [9]:
def create_user_book_id(data_df):
    """Creates a user_book_id in `data_df` for joining.

    The ID is the string form of `user_number`, a hyphen, and the
    string form of `book_number`. Values are rendered with `str`, so
    float-typed numbers produce IDs such as "1.0-2.0".

    The input DataFrame is left untouched; the result is a new
    DataFrame.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame for which the ID field is created. Must
        contain `user_number` and `book_number` columns.

    Returns
    -------
    pd.DataFrame
        The DataFrame obtained from `data_df` after adding
        a user_book_id field and dropping the `user_number` and
        `book_number` columns.

    """
    # Build the key from string renderings without writing the converted
    # columns back into the caller's DataFrame.
    user_part = data_df['user_number'].map(str)
    book_part = data_df['book_number'].map(str)
    return (
        data_df
        .drop(columns=['user_number', 'book_number'])
        .assign(user_book_id=user_part + "-" + book_part)
    )

In [10]:
# Swap the numeric user/book columns for the combined `user_book_id` join key.
full_interactions = create_user_book_id(data_df=full_interactions)

In [11]:
# Load each split mapping and give it the same `user_book_id` key as the
# interactions table.
train_set, val_set, test_set = (
    create_user_book_id(load_mapping(mapping_name))
    for mapping_name in ("goodreads_train", "goodreads_val", "goodreads_test")
)

### Splitting Data

In [12]:
# Tag every row of each split with a membership flag; after the left joins
# below, the flag is 1 only where an interaction matched that split.
train_set['is_train'], val_set['is_val'], test_set['is_test'] = 1, 1, 1

In [14]:
# Left-join the three flagged splits onto the interactions one after another,
# so each interaction row picks up its is_train / is_val / is_test flag.
inter_train = full_interactions.merge(train_set, how='left', on=['user_book_id'])
inter_train_val = inter_train.merge(val_set, how='left', on=['user_book_id'])
inter_all = inter_train_val.merge(test_set, how='left', on=['user_book_id'])

In [15]:
# Keep, for each split, only the rows whose membership flag is set.
train_df = inter_all.loc[inter_all['is_train'].eq(1)]
val_df = inter_all.loc[inter_all['is_val'].eq(1)]
test_df = inter_all.loc[inter_all['is_test'].eq(1)]

In [16]:
# The join key and the membership flags were only needed to do the split.
drop_cols = ['user_book_id', 'is_train','is_val', 'is_test']
train_df, val_df, test_df = (
    split_df.drop(columns=drop_cols) for split_df in (train_df, val_df, test_df)
)

In [40]:
# Per-user totals of each interaction level, computed on the training split.
train_chain = train_df.loc[:, ['user_id', 'shelved', 'read', 'rated', 'recommended']]
train_chain_users = train_chain.groupby('user_id')[['shelved', 'read', 'rated', 'recommended']].sum()

In [41]:
# Copy each per-user total into a `<level>_count` column and keep only those.
train_chain_users = train_chain_users.assign(
    shelved_count=train_chain_users['shelved'],
    read_count=train_chain_users['read'],
    rated_count=train_chain_users['rated'],
    recommended_count=train_chain_users['recommended'],
)[['read_count', 'rated_count', 'recommended_count', 'shelved_count']]

In [42]:
# Move the `user_id` group index back into an ordinary column so it can be
# used as a merge key.
train_chain_users = train_chain_users.reset_index(drop=False)

In [43]:
# Summary statistics of the per-user training counts.
train_chain_users.describe()

Unnamed: 0,read_count,rated_count,recommended_count,shelved_count
count,108389.0,108389.0,108389.0,108389.0
mean,1.583509,1.545775,1.351383,2.37704
std,2.309948,2.25107,1.851384,5.053172
min,0.0,0.0,0.0,1.0
25%,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0
75%,2.0,2.0,1.0,2.0
max,133.0,133.0,133.0,296.0


In [44]:
def merge_user_stats(data_df, user_stats_df, is_train):
    """Augment `data_df` with per-user interaction counts.

    The counts in `user_stats_df` are joined onto `data_df` by
    `user_id` with a left join, so rows whose user has no entry in
    `user_stats_df` get missing values for the count columns.

    Parameters
    ----------
    data_df: pd.DataFrame
        The interactions to augment. Must contain `user_id`; when
        `is_train` is true it must also contain the `shelved`,
        `read`, `rated`, and `recommended` columns.
    user_stats_df: pd.DataFrame
        One row per user with `user_id` and the `shelved_count`,
        `read_count`, `rated_count`, and `recommended_count`
        columns.
    is_train: bool
        Whether `data_df` is the data the counts were computed
        from. If so, each row's own `shelved` / `read` / `rated` /
        `recommended` value is subtracted from the matching count,
        so the count reflects the user's other interactions only.

    Returns
    -------
    pd.DataFrame
        A new DataFrame holding the columns of `data_df` followed by
        the count columns.

    """
    merged = data_df.merge(user_stats_df, how='left', on=['user_id'])
    if not is_train:
        return merged

    # Remove the row's own contribution from each per-user total.
    for interaction in ('shelved', 'read', 'rated', 'recommended'):
        count_col = "{}_count".format(interaction)
        merged[count_col] = merged[count_col] - merged[interaction]
    return merged

In [45]:
# All three splits get counts derived from the training data; only the
# training split has its own rows subtracted from those counts.
train_df_aug = merge_user_stats(train_df, train_chain_users, is_train=True)
val_df_aug = merge_user_stats(val_df, train_chain_users, is_train=False)
test_df_aug = merge_user_stats(test_df, train_chain_users, is_train=False)

### Saving Data

In [53]:
def save_interaction_data_to_csv(interaction_df, suffix):
    """Write `interaction_df` to a csv file in the output directory.

    The file is named ``interactions_<suffix>.csv`` and its path is
    formed by prefixing that name with `OUTPUT_DATA_DIR`. The
    DataFrame index is not written.

    Parameters
    ----------
    interaction_df: pd.DataFrame
        The DataFrame being saved to csv.
    suffix: str
        The part of the file name that identifies which
        interactions data is being saved.

    Returns
    -------
    None

    """
    path_template = "{0}interactions_{1}.csv"
    interaction_df.to_csv(path_template.format(OUTPUT_DATA_DIR, suffix), index=False)

In [54]:
# NOTE(review): these are the same paths the notebook reads its inputs from
# at the top, so running this cell overwrites the input files with the
# augmented versions. Confirm that is intended before re-running the notebook.
save_interaction_data_to_csv(interaction_df=train_df_aug, suffix="training")
save_interaction_data_to_csv(interaction_df=test_df_aug, suffix="testing")
save_interaction_data_to_csv(interaction_df=val_df_aug, suffix="validation")