In [35]:
import json
import os
import random
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Seed Python's `random` module and NumPy's global RNG with one shared value.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [36]:
# Directory holding the interaction CSVs read and written by this notebook.
# The trailing slash matters: file paths are built from it by plain string
# concatenation / formatting rather than os.path.join.
OUTPUT_DATA_DIR = "./output_data/"

# exist_ok=True creates the directory only when it is missing, replacing the
# separate existence check (which could race with another process creating it).
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

In [37]:
# Read the three existing interaction split files from OUTPUT_DATA_DIR.
train_df, val_df, test_df = (
    pd.read_csv(OUTPUT_DATA_DIR + csv_name)
    for csv_name in (
        "interactions_training.csv",
        "interactions_validation.csv",
        "interactions_testing.csv",
    )
)

# Stack the splits row-wise into one frame (original row labels are kept).
full_interactions = pd.concat([train_df, val_df, test_df])

In [38]:
def load_mapping(mapping_file):
    """Read ``mappings/<mapping_file>.csv`` into a DataFrame.

    Parameters
    ----------
    mapping_file: str
        Base name of the mapping file, without the ``.csv`` extension.
        The file is looked up inside the relative ``mappings`` directory.

    Returns
    -------
    pd.DataFrame
        The contents of the mapping CSV as parsed by ``pd.read_csv``.

    """
    csv_name = "{}.csv".format(mapping_file)
    mapping_path = os.path.join("mappings", csv_name)
    return pd.read_csv(mapping_path)

In [39]:
# Load the user and book lookup tables; they are joined onto the interactions
# further down on `user_id` and `book_id` respectively.
user_map, book_map = (
    load_mapping(map_name) for map_name in ("user_map", "book_map")
)

In [42]:
# Convert `book_id` to str in both frames so that both sides of the later
# `book_id` merge hold string keys.
book_map['book_id'] = book_map['book_id'].map(str)
full_interactions['book_id'] = full_interactions['book_id'].map(str)

In [43]:
# Attach the user and book mapping columns to every interaction row.
# Left joins keep all interactions, even those without a mapping entry.
full_interactions = (
    full_interactions
    .merge(user_map, on="user_id", how="left")
    .merge(book_map, on="book_id", how="left")
)

In [44]:
# Show the merged frame as the cell output to inspect the joined columns.
full_interactions

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at,...,year_month_added,year_month_updated,pub_date,is_translated,main_author,is_in_series,series_length,title_description,user_number,book_number
0,55e33e664f052f3858d8df5d9cd38597,2547,a3a8cf3cead1f647c240ce56d394090d,True,5,,Fri Oct 18 14:39:29 -0700 2013,Fri Oct 18 14:39:29 -0700 2013,,,...,2013-10,2013-10,2010-01,0,6466154.0,0,1,"The Prophet Kahlil Gibran's masterpiece, The P...",36322,0
1,be813d0bb8f4cd8c4bd77870df3c8138,2696,12774740d9a76a6dc5e2f60289358cf2,True,2,,Tue Feb 07 01:27:13 -0800 2012,Tue Feb 07 01:27:13 -0800 2012,,,...,2012-02,2012-02,2003-01,0,1838.0,0,1,The Canterbury Tales The procession that cross...,80911,1
2,695d92b05d837223608e5effcbefa14c,30119,9cc215a3037c26944369c2b839c657fc,True,4,,Fri Apr 23 06:36:42 -0700 2010,Fri Apr 23 06:36:42 -0700 2010,,,...,2010-04,2010-04,2002-11,0,435477.0,0,1,Where the Sidewalk Ends Where the Sidewalk End...,44651,2
3,841232985badbdc753f6179ab2dcf899,26596,42b2bb4a3396fd2c63ee3fdba160fb03,False,0,,Thu Sep 25 23:17:27 -0700 2014,Thu Sep 25 23:17:28 -0700 2014,,,...,2014-09,2014-09,1994-00,0,10547.0,0,1,"Complete Poems, 1904-1962 At the time of his d...",55994,3
4,08ff9ebbf8f94cf1d53b1fd87f077d6b,406373,0ee7eb8e9346ca6805f355fc9d8ad806,True,4,,Tue Sep 18 15:41:21 -0700 2007,Wed Feb 13 07:23:38 -0800 2013,,,...,2007-09,2013-02,-00,0,285217.0,1,2,Faust Goethe's Faust reworks the late medieval...,3813,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326900,eb86d2a29fb389c9a26e0306652f0d81,112200,bd3ff9bc0836e5d06cbb6ab457364d2d,True,3,,Sun Jul 07 13:56:06 -0700 2013,Sun Jul 07 13:56:06 -0700 2013,,,...,2013-07,2013-07,-00,0,7440.0,0,1,Selected Poems Emily Dickinson was born into a...,99763,325
326901,8693da07e6f95d5537d2b544e1c051ae,1420,75f06830563075abf4938fcac32ba957,True,4,,Thu May 06 18:37:53 -0700 2010,Thu May 06 18:37:53 -0700 2010,,,...,2010-05,2010-05,2005-08,0,947.0,0,1,"Hamlet One of the greatest plays of all time, ...",57123,8
326902,28a11d79a703bf050e18e98bb5bdc8c6,27418,46a85baae8a0d34ace85c09c8639480e,False,0,,Fri Oct 10 11:07:12 -0700 2014,Fri Oct 10 11:07:14 -0700 2014,,,...,2014-10,2014-10,1999-05,0,1691.0,0,1,"Theogony/Works and Days Hesiod, one of the old...",17008,562
326903,a69ffb3a1c02d55e23f2f4d1d4ec6334,287656,4d8e9d4ab91aeba2dfa90a5f5383516c,True,3,,Sat Jun 03 11:44:26 -0700 2017,Tue Jun 20 08:26:21 -0700 2017,Sun Jun 18 11:49:01 -0700 2017,Sat Jun 03 11:44:27 -0700 2017,...,2017-06,2017-06,2004-09,0,947.0,0,1,Sonnets (No Fear Shakespeare) No Fear Shakespe...,70773,5262


In [45]:
def create_user_book_id(data_df):
    """Builds a `user_book_id` join key from `data_df`.

    The key is ``"<user_number>-<book_number>"``, with both parts converted
    using ``str``. The input frame is not modified; earlier versions of this
    function rewrote `user_number` / `book_number` as strings and added
    `user_book_id` on the caller's frame as a side effect.

    Parameters
    ----------
    data_df: pd.DataFrame
        The DataFrame for which the ID field is created. Must contain
        `user_number` and `book_number` columns.

    Returns
    -------
    pd.DataFrame
        A new DataFrame with the columns of `data_df` minus `user_number`
        and `book_number`, plus a `user_book_id` column.

    Raises
    ------
    KeyError
        If `user_number` or `book_number` is missing from `data_df`.

    Notes
    -----
    Values are stringified as-is, so a float-typed number column yields
    keys such as ``"1.0-2.0"`` rather than ``"1-2"``.

    """
    # Build the key parts as new Series so the caller's columns stay untouched.
    user_part = data_df['user_number'].map(str)
    book_part = data_df['book_number'].map(str)

    # drop() returns a new frame; assign() then adds the key to that new frame.
    return (
        data_df
        .drop(columns=['user_number', 'book_number'])
        .assign(user_book_id=user_part + "-" + book_part)
    )

In [46]:
# Swap the user_number / book_number columns for a single `user_book_id` key.
full_interactions = full_interactions.pipe(create_user_book_id)

In [52]:
# Load the three split definition files and give each a `user_book_id` key.
train_set, val_set, test_set = (
    create_user_book_id(load_mapping(split_file))
    for split_file in ("goodreads_train", "goodreads_val", "goodreads_test")
)

In [53]:
# Tag every row of each split table with that split's indicator column (1).
train_set = train_set.assign(is_train=1)
val_set = val_set.assign(is_val=1)
test_set = test_set.assign(is_test=1)

In [54]:
# Left-join the training flag onto every interaction via `user_book_id`.
# NOTE(review): the same merge is recomputed in the later cell that builds
# `inter_all`, so this cell looks redundant.
inter_train = full_interactions.merge(train_set, on='user_book_id', how='left')

Unnamed: 0,user_book_id,is_train
0,36322-0,1
1,36322-8,1
2,80911-1,1
3,80911-1344,1
4,80911-455,1
...,...,...
257640,48969-101,1
257641,37151-208,1
257642,88081-35,1
257643,58962-73,1


In [58]:
# Left-join the three split tables onto the interactions one after another.
# A row that is absent from a split gets NaN in that split's indicator column.
inter_train = full_interactions.merge(train_set, on='user_book_id', how='left')
inter_train_val = inter_train.merge(val_set, on='user_book_id', how='left')
inter_all = inter_train_val.merge(test_set, on='user_book_id', how='left')

In [60]:
# Keep, for each split, only the rows whose indicator equals 1
# (NaN indicators compare False and are excluded).
train_df = inter_all.loc[inter_all['is_train'].eq(1)]
val_df = inter_all.loc[inter_all['is_val'].eq(1)]
test_df = inter_all.loc[inter_all['is_test'].eq(1)]

In [65]:
# Remove the join key and the indicator columns, which were only needed to
# assign rows to splits.
drop_cols = ['user_book_id', 'is_train', 'is_val', 'is_test']
train_df, val_df, test_df = (
    split_df.drop(columns=drop_cols) for split_df in (train_df, val_df, test_df)
)

In [66]:
def save_interaction_data_to_csv(interaction_df, suffix):
    """Writes `interaction_df` to ``interactions_<suffix>.csv``.

    The file is placed under ``OUTPUT_DATA_DIR``; the path is built by
    string formatting, so ``OUTPUT_DATA_DIR`` must end with a path separator.
    The DataFrame index is not written.

    Parameters
    ----------
    interaction_df: pd.DataFrame
        The DataFrame being saved to csv.
    suffix: str
        Identifies which interactions data is being saved; it becomes the
        part of the file name after ``interactions_``.

    Returns
    -------
    None

    """
    interaction_df.to_csv(
        "{0}interactions_{1}.csv".format(OUTPUT_DATA_DIR, suffix),
        index=False,
    )

In [67]:
# Persist the three splits. These are the same file names that were read at
# the top of the notebook, so the original split files are overwritten.
for split_df, split_suffix in (
    (train_df, "training"),
    (test_df, "testing"),
    (val_df, "validation"),
):
    save_interaction_data_to_csv(split_df, split_suffix)