# ChainRec Test-Validation Split

In this notebook we create a CSV file containing the interactions that belong to the validation and test sets. The chainRec source code was modified to load this file, which ensures that the data is split in the same way on every run.

In [1]:
import json
import os
import random
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import scipy.sparse as sp

# Seed both the stdlib and NumPy global RNGs with the same fixed value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

### Loading Data

In [6]:
# Directory holding the validation/testing interaction CSVs. The trailing
# slash is kept on purpose: other cells build paths by concatenating onto
# this constant.
OUTPUT_DATA_DIR = "../output_data/"

# Build the paths with os.path.join (the same approach load_mapping uses) so
# the reads do not silently depend on OUTPUT_DATA_DIR ending with a separator.
val_df = pd.read_csv(os.path.join(OUTPUT_DATA_DIR, "interactions_validation.csv"))
test_df = pd.read_csv(os.path.join(OUTPUT_DATA_DIR, "interactions_testing.csv"))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Loading Mappings

chainRec expects the user IDs and item IDs to form an increasing sequence of integers starting from 0. We therefore load the mapping files, use them to convert each user ID and book ID to a number, and then create the file.

In [4]:
def load_mapping(mapping_file):
    """Read a mapping CSV from the ``../mappings`` directory.

    Parameters
    ----------
    mapping_file: str
        Base name of the mapping file, without the ``.csv`` extension.

    Returns
    -------
    pd.DataFrame
        The contents of ``../mappings/<mapping_file>.csv`` as parsed by
        ``pd.read_csv`` with default options.

    """
    csv_name = "{}.csv".format(mapping_file)
    csv_path = os.path.join("../mappings", csv_name)
    return pd.read_csv(csv_path)

In [5]:
# Load the user and book mapping tables (one CSV each under ../mappings).
user_map, book_map = [load_mapping(name) for name in ("user_map", "book_map")]

In [7]:
# Convert every book_id to its string form in all three frames so that the
# column has the same representation wherever it is used as a merge key.
for _frame in (val_df, test_df, book_map):
    _frame['book_id'] = _frame['book_id'].map(str)

### Creating chainRec File

In [8]:
# Left-join each interaction frame with the user mapping (on user_id) and then
# the book mapping (on book_id). A left join keeps every interaction row; rows
# without a match in a mapping get NaN in that mapping's columns.
# NOTE(review): presumably the mappings contribute `user_number` and
# `book_number`, which later cells select — confirm against the mapping files.
val_df = (
    val_df
    .merge(user_map, how="left", on=["user_id"])
    .merge(book_map, how="left", on=["book_id"])
)

test_df = (
    test_df
    .merge(user_map, how="left", on=["user_id"])
    .merge(book_map, how="left", on=["book_id"])
)

In [12]:
# The max stage is the element-wise sum of the read, rated and recommended
# columns (NaN in any of them propagates to the result).
# NOTE(review): assumes these are 0/1 indicator columns — confirm upstream.
val_df['max_stage_vali'] = (
    val_df['read'].add(val_df['rated']).add(val_df['recommended'])
)
test_df['max_stage_test'] = (
    test_df['read'].add(test_df['rated']).add(test_df['recommended'])
)

In [13]:
# Keep only the numeric user/book identifiers and the max stage, then rename
# them: `user_number` becomes `user_id` in both frames, while `book_number`
# gets a split-specific name so the two frames can later sit side by side.
val_df = val_df[['user_number', 'book_number', 'max_stage_vali']].rename(
    columns={'user_number': 'user_id', 'book_number': 'item_id_vali'}
)

test_df = test_df[['user_number', 'book_number', 'max_stage_test']].rename(
    columns={'user_number': 'user_id', 'book_number': 'item_id_test'}
)

In [16]:
# Column order of the combined validation/test table.
_val_test_columns = [
    'user_id',
    'item_id_vali',
    'max_stage_vali',
    'item_id_test',
    'max_stage_test',
]

# Inner join on user_id: only users present in both frames are kept, and a
# user with several rows in either frame yields every validation x test pairing.
val_test_df = val_df.merge(test_df, how='inner', on=['user_id'])
val_test_df = val_test_df[_val_test_columns]

### Saving File

In [18]:
# Write the combined validation/test table next to the input CSVs, without the
# DataFrame index.
_goodreads_test_path = OUTPUT_DATA_DIR + "goodreads.test.csv"
val_test_df.to_csv(_goodreads_test_path, index=False)