In [629]:
import pandas as pd

In [630]:
# Load the raw recipe table from the CSV file in the working directory.
recipes_df = pd.read_csv("RAW_recipes.csv")

In [631]:
# Rename the recipe key column so it matches the 'recipe_id' column that the
# interactions table is filtered and joined on.
recipes_df = recipes_df.rename(columns={'id': 'recipe_id'})

In [632]:
# Load the raw user/recipe interaction table from the CSV file in the working directory.
interactions_df = pd.read_csv("RAW_interactions.csv")

In [633]:
# Number of distinct users before any filtering (a missing id, if present, counts as one value).
interactions_df['user_id'].nunique(dropna=False)

226570

In [634]:
# Discard every recipe row that has a missing value in any column.
recipes_df = recipes_df.dropna()

In [635]:
# Boolean mask: True where the interaction's recipe is still present in recipes_df.
distinct_interactions_in_recipes = interactions_df['recipe_id'].isin(recipes_df['recipe_id'])
# Keep only those interactions, so every remaining recipe_id refers to a known recipe.
interactions_df = interactions_df.loc[distinct_interactions_in_recipes]

In [636]:
# Number of distinct users left after dropping interactions with unknown recipes.
interactions_df['user_id'].nunique(dropna=False)

223316

In [637]:
# Number of interaction rows remaining after filtering.
interactions_df.shape[0]

1108856

In [638]:
#interactions_df.to_csv('reduced_interactions.csv')

In [639]:
# One-column table of the distinct user ids, in order of first appearance.
user_ids_df = pd.DataFrame({'user_id': pd.unique(interactions_df['user_id'])})

In [640]:
from builder import PandasGraphBuilder

In [641]:
# Assemble a graph from the pandas tables using the project-local PandasGraphBuilder.
graph_builder = PandasGraphBuilder()

# Register the user and recipe tables, each with its id column and a type name.
graph_builder.add_entities(user_ids_df, 'user_id', 'user')
graph_builder.add_entities(recipes_df, 'recipe_id', 'recipe')
# Register the interaction table twice, once per direction, under separate relation names.
# NOTE(review): presumably edges are created in interactions_df row order; the later
# per-edge feature assignment relies on that. Confirm against builder.py.
graph_builder.add_binary_relations(interactions_df, 'user_id', 'recipe_id', 'interacted')
graph_builder.add_binary_relations(interactions_df, 'recipe_id', 'user_id', 'interacted-by')

In [642]:
# Build the graph object from the entities and relations registered on the builder.
g = graph_builder.build()

In [643]:
import numpy as np

In [644]:
# Convert each interaction date to whole seconds since the Unix epoch.
#
# Dividing the offset from the epoch by a one-second Timedelta gives the same
# answer whatever resolution pandas stores the datetimes in (s/ms/us/ns).
# Casting to int64 and dividing by 10 ** 9 is only correct for nanosecond data.
#
# `assign` builds a new frame instead of writing a column into the filtered
# slice of interactions_df, which avoids pandas' SettingWithCopyWarning.
# Assumes the 'date' column parses to timezone-naive timestamps.
interactions_df = interactions_df.assign(
    date_processed=(
        pd.to_datetime(interactions_df['date']) - pd.Timestamp("1970-01-01")
    ) // pd.Timedelta("1s")
)

In [645]:
# Earlier preallocation attempt, kept for reference:
#nutrition_tensor = np.empty((len(recipes_df['nutrition']), 3))
# Module-level accumulator: convert_to_array appends one list of string tokens
# per recipe here; it is converted to a float array afterwards.
str_nutrition_tensor = []

In [646]:
def convert_to_array(x):
    """Split a bracketed, comma-separated string into tokens and record them.

    The surrounding ``[`` / ``]`` characters are stripped from ``x`` and the
    remainder is split on commas. Tokens are kept as strings exactly as split,
    so any whitespace after a comma is preserved. The resulting list is
    appended to the module-level ``str_nutrition_tensor``.

    Returns None; the function is called only for its side effect.
    """
    # Appending mutates the existing list, so no `global` declaration is needed.
    tokens = x.strip('[]').split(',')
    str_nutrition_tensor.append(tokens)


In [647]:
# Empty the accumulator first so that re-running this cell does not append a
# second copy of every row, which would double the parsed data.
str_nutrition_tensor.clear()
# `apply` is used only for convert_to_array's side effect of filling
# str_nutrition_tensor; the displayed Series of None carries no information.
recipes_df['nutrition'].astype('str').apply(convert_to_array)

0         None
1         None
2         None
3         None
4         None
          ... 
231632    None
231633    None
231634    None
231635    None
231636    None
Name: nutrition, Length: 226657, dtype: object

In [648]:
# Turn the accumulated string tokens into a 2-D array, then parse them as 64-bit floats.
nutrition_tensor = np.asarray(str_nutrition_tensor).astype(np.float64)

In [649]:
# Inspect the parsed array's dimensions: (number of recipes, values per recipe).
nutrition_tensor.shape

(226657, 7)

In [650]:
import torch

# Attach the rating and the processed timestamp to both edge directions.
# A fresh tensor is created for every assignment, so the two edge types never
# share storage.
# NOTE(review): this assumes the edges of each type are stored in
# interactions_df row order. Confirm against the graph builder.
for edge_type in ('interacted', 'interacted-by'):
    g.edges[edge_type].data['rating'] = torch.LongTensor(interactions_df['rating'].values)
    g.edges[edge_type].data['date_processed'] = torch.LongTensor(interactions_df['date_processed'].values)

#g.nodes['recipe'].data['nutrition'] = torch.FloatTensor(nutrition_tensor)

In [651]:
# Text features per recipe. Only the name is included for now; the description
# column (recipes_df['description'].values) is deliberately left out.
recipe_textual_dataset = dict(name=recipes_df['name'].values)

In [652]:
import json
 
# Load the precomputed split indices. The context manager closes the file
# handle once the JSON is parsed; the original left the handle open.
with open('new_indices(4).json') as f:
    data = json.load(f)

In [653]:
# Materialize the three index lists from the JSON payload as NumPy arrays.
train_indices, test_indices, val_indices = (
    np.array(data[split_name]) for split_name in ('train', 'test', 'val')
)

In [654]:
from data_utils import *

In [655]:
#train_indices, val_indices, test_indices = train_test_split_by_time(interactions_df, 'date_processed', 'user_id')

In [656]:
# Derive the training graph from the full graph and the training indices.
# NOTE(review): build_train_graph presumably comes from the data_utils star import
# and presumably keeps only the edges listed in train_indices (for both edge
# directions). Confirm against data_utils.
train_g = build_train_graph(g, train_indices, 'user', 'recipe', 'interacted', 'interacted-by')

In [657]:
# Build the validation and test structures from the held-out indices.
# NOTE(review): build_val_test_matrix presumably comes from the data_utils star
# import; the format of the returned matrices is not visible here. Confirm
# against data_utils.
val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'recipe', 'interacted')

In [658]:
# Restrict the training graph to the 'interacted' edge type.
graph_slice = train_g.edge_type_subgraph(['interacted'])
# Select one outgoing edge per node, ranked by the 'date_processed' edge feature
# (per DGL's select_topk, the largest value is kept by default).
# NOTE(review): `dgl` is not imported explicitly in the visible cells; presumably
# it enters the namespace through `from data_utils import *`. Consider an explicit import.
latest_interactions = dgl.sampling.select_topk(graph_slice, 1, 'date_processed', edge_dir='out')
# Source and destination node ids of the selected edges, sorted by source then destination.
user, latest_items = latest_interactions.all_edges(form='uv', order='srcdst')

In [659]:
# Sanity check: the selected source ids must be exactly 0..N-1 in order, where N is
# the number of 'user' nodes in train_g, i.e. presumably one selected edge per user.
torch.equal(user, torch.arange(train_g.number_of_nodes('user')))

True

In [660]:
import pickle

# 'item-texts' currently holds only the recipe names (no other text fields yet); the key is always present so model.py doesn't complain

# Bundle the training graph, the evaluation matrices, the item text features and
# the node/edge type names into a single dictionary.
dataset = {
    'train-graph': train_g,
    'val-matrix': val_matrix,
    'test-matrix': test_matrix,
    'item-texts': recipe_textual_dataset,
    'item-images': None,
    'user-type': 'user',
    'item-type': 'recipe',
    'user-to-item-type': 'interacted',
    'item-to-user-type': 'interacted-by',
    'timestamp-edge-column': 'date_processed',
}

# Serialize the bundle to disk; the context manager closes the file afterwards.
with open("text_data_6.pkl", 'wb') as f:
    pickle.dump(dataset, f)