In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval
import time
from datetime import timedelta

from scipy.sparse import hstack, vstack, csr_matrix, load_npz, save_npz

# Let pandas use up to 1400 characters per line when printing frames.
pd.options.display.width = 1400

#### Functions

In [None]:
def get_train_and_test_data(df, test_size=0.2, random_state=42):
    """Split every user's ratings into a training frame and a test frame.

    Each (recipe, rating) pair is treated as one sample: the parallel list
    columns ``rated_recipes`` and ``rating_list`` are exploded to one row per
    rating, the rows are split at random, and each part is grouped back into
    one row of lists per user.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per user; the index identifies the user. Must contain the
        list columns ``rated_recipes`` and ``rating_list`` (same length in
        every row, otherwise ``explode`` raises) and an ``ingredients``
        column, which is left out of the result.
    test_size : float or int, default 0.2
        Forwarded to ``sklearn.model_selection.train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``; the default keeps the split
        reproducible.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Both frames are indexed by ``df.index`` and hold every column of
        ``df`` except ``ingredients``, each cell being a list. A user with no
        rating in one part gets an empty list in every column of that part.

    Raises
    ------
    KeyError
        If ``df`` has no ``ingredients`` column.

    Notes
    -----
    ``explode`` turns an empty list into a single NaN row, so a user whose
    lists are empty contributes one NaN sample to the split.
    """
    from sklearn.model_selection import train_test_split

    # 'ingredients' is not part of the result. Dropping it before the explode
    # avoids copying it once per rating and aggregating it again afterwards;
    # the split itself only depends on the number of rows and the seed.
    ratings = df.drop(columns='ingredients')
    exploded = ratings.explode(['rated_recipes', 'rating_list'])
    train_rows, test_rows = train_test_split(
        exploded, test_size=test_size, random_state=random_state
    )

    def _regroup(rows):
        # Collapse the samples back to one row of lists per user, then
        # re-index so that users with no sample in this part are kept too.
        grouped = rows.groupby(level=0).agg(list).reindex(df.index)
        for column in grouped.columns:
            # Cells added by reindex are NaN; give each one its OWN empty list
            # (a shared fill value would alias a single list across cells).
            grouped[column] = grouped[column].map(
                lambda cell: cell if isinstance(cell, list) else []
            )
        return grouped

    return _regroup(train_rows), _regroup(test_rows)

#### Load Data

In [None]:
# Read the recipe tables from disk.
# These recipe columns are stored as strings in the CSV; literal_eval turns
# each of them back into the array it represents (e.g. tags).
converters = dict.fromkeys(('tags', 'ingredients', 'steps', 'nutrition'), literal_eval)

df_recipes = pd.read_csv(
    'dataset/RAW_recipes.csv',
    index_col='id',
    converters=converters,
)
df_recipe_reviews = pd.read_csv(
    'dataset/Recipe_Reviews.csv',
    index_col='id',
)

# Currently disabled loads, kept for reference:
# df_interact = pd.read_csv('dataset/RAW_interactions.csv', dtype={'review': str})
# converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
# df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')

In [None]:
# Create or load training and test data.
# The split is cached as two CSV files so it only has to be computed once.
converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
train_fn = 'dataset/User_Data_Train.csv'
test_fn = 'dataset/User_Data_Test.csv'
# Use the cache only when BOTH files are present. If just one exists (e.g. a
# previous run was interrupted between the two writes) the split is rebuilt
# and both files are rewritten, so train and test always come from one split.
if os.path.exists(train_fn) and os.path.exists(test_fn):
    print('Loading train and test userdata ...')
    df_train = pd.read_csv(train_fn, converters=converters, index_col='user_id')
    df_test = pd.read_csv(test_fn, converters=converters, index_col='user_id')
else:
    print('Reading userdata dataframe ...')
    df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')
    print('Splitting userdata into training and test data ...')
    # head(None) keeps every row; pass an int instead to run on the first few users only.
    df_train, df_test = get_train_and_test_data(df_userdata.head(None).copy())
    df_train.to_csv(train_fn)
    df_test.to_csv(test_fn)
print('Done.')

In [None]:
# Create User-Item Matrix
# One row per user of df_train, one column per recipe of df_recipes; each
# stored value is a rating taken from the training data.

# NOTE(review): no earlier cell defines these id -> position lookups, so on a
# fresh kernel this cell stopped with a NameError. They are built here from row
# order, which is what the matrix shape (len(df_train), len(df_recipes))
# implies -- confirm this matches any definition elsewhere in the notebook.
id_to_index_USERS = {user_id: idx for idx, user_id in enumerate(df_train.index)}
id_to_index_RECIPES = {recipe_id: idx for idx, recipe_id in enumerate(df_recipes.index)}

data, rows, cols = [], [], []
n_users = len(df_train)

# Walk the two list columns in parallel instead of using iterrows(), which
# would build a full Series object for every user.
per_user = zip(df_train.index, df_train['rated_recipes'], df_train['rating_list'])
for i, (user_id, recipe_ids, ratings) in enumerate(per_user):
    print('\r{:_}/{:_}'.format(i+1, n_users), end='')
    user_IDX = id_to_index_USERS[user_id]  # same for all of this user's ratings
    for recipe_id, rating in zip(recipe_ids, ratings):
        # Raises KeyError if a rated recipe is not present in df_recipes.
        recipe_IDX = id_to_index_RECIPES[recipe_id]
        data.append(rating)
        rows.append(user_IDX)
        cols.append(recipe_IDX)
print('\nDone.')

# csr_matrix sums entries that share the same (row, col) pair, so a recipe
# rated twice by one user ends up with the sum of both ratings.
user_item_matrix = csr_matrix((data, (rows, cols)), shape=(n_users, len(df_recipes)))
print('Matrix made!')