In [1]:
import argparse
import ast 
import dgl
import gensim 
import logging
import os
# import re
import pickle
import sys
import torch
import tqdm

import dgl.function as nn
import matplotlib.animation as animation
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 
import pygraphviz as pgv
import scipy.sparse as ssp
import torch.nn.functional as F

from collections import defaultdict 

from dgl.data import DGLDataset
from dgl.dataloading import negative_sampler
from dgl.nn import SAGEConv 
from dgl.nn.pytorch import RelGraphConv

from gensim import corpora
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess

from pandas.plotting import scatter_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

# local imports
sys.path.insert(0, 'graph-rec/src/pinsage')
from builder import PandasGraphBuilder
from data_utils import *


Using backend: pytorch


In [3]:
logging.basicConfig(format='%(asctime)s: ', level=logging.INFO)

logger = logging.getLogger('bum')

In [2]:
from platform import python_version

print("Version: {}".format(python_version()))

Version: 3.8.10


In [5]:
# function to remove items in list not in another given list
def filter_list(list_to_filter: list, set_to_check: frozenset):
    return list(filter(lambda item: item in set_to_check, list_to_filter))

## Raw interactions data

In [6]:
interactions_data = pd.read_csv('gcn/archive/RAW_interactions.csv')
interactions_data.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [None]:
"""
    Some formatting work on interactions data
"""
interactions_data.iloc[1].review

In [None]:
# get all users that have reviewed a recipe
users_with_interactions = frozenset(interactions_data.user_id)

In [7]:
interactions = interactions_data.dropna()

## Raw recipe data


In [15]:
recipe_data = pd.read_csv('gcn/archive/RAW_recipes.csv')
recipe_data.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13


In [13]:
# how many unique recipes do we have? 
len(recipe_data.id.unique())

231637

In [14]:
recipe_data.describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients
count,231637.0,231637.0,231637.0,231637.0,231637.0
mean,222014.708984,9398.546,5534885.0,9.765499,9.051153
std,141206.635626,4461963.0,99791410.0,5.995128,3.734796
min,38.0,0.0,27.0,0.0,1.0
25%,99944.0,20.0,56905.0,6.0,6.0
50%,207249.0,40.0,173614.0,9.0,9.0
75%,333816.0,65.0,398275.0,12.0,11.0
max,537716.0,2147484000.0,2002290000.0,145.0,43.0


In [17]:
"""
    Some formatting work on recipe data
"""
# rename columns to have matching column names between datasets
recipe_data = recipe_data.rename(columns={"id": "recipe_id", "contributor_id": "user_id"})

# turn strings to lists
# recipe_data['tags'] = recipe_data.tags.transform(ast.literal_eval)


In [10]:
# users that contributed a recipe
users_that_contributed_a_recipe = recipe_data.user_id.unique()
print(f"Num users that contributed a recipe = {len(users_that_contributed_a_recipe)}")

# Users that both reviewed and contributed a recipe
users_that_reviewed_and_submitted = users_with_interactions.intersection(users_that_contributed_a_recipe)
print(f'Num users that both reviewed and contributed = {len(users_that_reviewed_and_contributed)}')

Num users that contributed a recipe = 27926


NameError: name 'users_with_interactions' is not defined

In [12]:
""" keep only recipes that have received at least one review from 
    a user who has contributed at least one recipe
"""
# reviewed_recipes = recipes
recipes['recipe_id'] = recipes.recipe_id.apply(lambda x: x if x in recipes_with_reviews else np.nan)
recipes.dropna(inplace=True)
recipes = recipes.reset_index(drop=True)

AttributeError: 'DataFrame' object has no attribute 'recipe_id'

In [25]:
recipes = recipe_data
recipes.head(1)

Unnamed: 0,name,recipe_id,minutes,user_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7


In [11]:
# all recipes that have recieved at least one review
recipes_with_reviews = frozenset(interactions_data.recipe_id)

In [13]:
# rename some columns to have matching column names between dataframes
recipes_df = recipes.rename(columns={'id': 'recipe_id',
                                     'contributor_id': 'user_id'}).set_index('recipe_id')

# order columns in the same way they appear in interactions dataframe
recipes_df = recipes_df.reindex(index=interactions.recipe_id)
recipes_df = recipes_df.dropna().astype(str).drop_duplicates().reset_index()

# remove recipes that haven't been interacted with? todo

In [24]:
recipes = recipes_df

NameError: name 'recipes_df' is not defined

In [None]:
# Create a user-submitted-recipe df
submitted_df = recipes_df[['user_id', 'recipe_id']].drop_duplicates().dropna().reset_index(drop=True)

len(set(np.concatenate((review_df.user_id.unique(), submitted_df.user_id.unique()))))

In [12]:
torch.LongTensor(recipes.user_id[:5].values)

tensor([ 47892,  26278, 196586,  68585,  41706])

### Get all recipes
Get all unique recipes. And if a recipe hasnb't been interacted with, we can remove it for now

In [12]:

unique_users = frozenset(interactions_data.user_id.unique())

In [18]:
# remove recipes without reviews
# interactions_data[~interactions_data.recipe_id.isin(unique_recipes)]
# user_data = user_data.copy()[~user_data.u.isin(unique_users)]
recipe_data[~recipe_data.recipe_id.isin(recipes_with_reviews)]
len(recipe_data)

231637

In [None]:
# filter users without ratings. 
users_in_interactions = frozenset(recipe_data.user_id.unique())
print(len(users_in_interactions))

In [None]:
# print(len(unique_users))
print(len(users_in_interactions))
# print(len(unique_users-users_in_interactions))

In [None]:
# remove users

### Processing for GraphSAGE

In [None]:
interactions_data

In [None]:
recipe_data.head(3)

In [None]:
interactions_data.head(3)

In [21]:
# keep users that contributed AND also reviewed at least one recipe
#interactions['user_id'] = interactions['user_id'].apply(lambda x: x if x in users_that_reviewed_and_contributed else np.nan)
#interactions.dropna(inplace=True)
interactions[~interactions.recipe_id.isin(recipe_data.recipe_id.unique())]

Unnamed: 0,user_id,recipe_id,date,rating,review


In [None]:
interactions.head(4)
print(len(interactions))

In [22]:
interactions

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
...,...,...,...,...,...
1132362,116593,72730,2003-12-09,0,Another approach is to start making sauce with...
1132363,583662,386618,2009-09-29,5,These were so delicious! My husband and I tru...
1132364,157126,78003,2008-06-23,5,WOW! Sometimes I don't take the time to rate ...
1132365,53932,78003,2009-01-11,4,Very good! I used regular port as well. The ...


In [26]:
# convert string data to actual lists
recipes['ingredients'] = recipes.ingredients.apply(ast.literal_eval)
recipes.ingredients.iloc[1]

['prepared pizza crust',
 'sausage patty',
 'eggs',
 'milk',
 'salt and pepper',
 'cheese']

In [None]:
recipes.description.iloc[1]
recipes.ingredients.iloc[1]

In [None]:
interactions.head(3)

In [12]:
# df for rating given (and review?)
review_df = interactions[['user_id', 'recipe_id', 'rating', 'review']].astype({"user_id": int,
                                                                               "recipe_id": int,
                                                                               "rating": int})

In [18]:
interactions.head(3)

Unnamed: 0,user_id,recipe_id,date,rating,review,user_id_contig
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,19262
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",122399
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,4482


### We need to remap IDs for recipes and users to contiguous numbers so that the resultilng graph has a representative number of nodes to number of items in the dataset. 

This is because dgl uses the ids provided to figure out how many vertices are in the graph. A graph created with 2 vertices with IDs 10 and 11 will result in `|V| = max(10, 11) + 1` = `12` vertices. So we need to start with IDs from `0` or `1`.

In [27]:
'''
    Map IDs in order they appear to numbers 0 to 1.
'''
_id = 0
# first get the unique user IDs. Remember some users may have 
# only reviewed or contributed a recipe but not both, so we need 
# to look at both datasets to get all users
# all_users = set(np.concatenate((review_df.user_id.unique(), submitted_df.user_id.unique())))
all_users = set(np.concatenate((interactions.user_id.unique(), recipes.user_id.unique())))

user_id_to_contig_num_map = {}
contig_id_to_user_id_map = {}
for user in all_users:
    user_id_to_contig_num_map[user] = _id
    contig_id_to_user_id_map[_id] = user
    
    _id = _id + 1

print(_id)

236568


In [28]:
'''
    Assign contiguous IDs to dataframe 
'''
recipes['user_id_contig'] = recipes.user_id.apply(lambda x: user_id_to_contig_num_map[x])
interactions['user_id_contig'] = interactions.user_id.apply(lambda x: user_id_to_contig_num_map[x])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions['user_id_contig'] = interactions.user_id.apply(lambda x: user_id_to_contig_num_map[x])


In [29]:
'''
    Create a similar mapping for recipes
'''
r_id = 0
all_recipes = set(np.concatenate((recipes.recipe_id.unique(), interactions.recipe_id.unique())))

recipe_id_to_contig_map = {}
contig_id_to_recipe_id_map = {}

for recipe in all_recipes:
    recipe_id_to_contig_map[recipe] = r_id
    contig_id_to_recipe_id_map[r_id] = recipe
    
    r_id = r_id + 1
r_id

In [30]:
''' Assign contiguos IDs '''
recipes['recipe_id_contig'] = recipes['recipe_id'].apply(lambda x: recipe_id_to_contig_map[x])
interactions['recipe_id_contig'] = interactions['recipe_id'].apply(lambda x: recipe_id_to_contig_map[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interactions['recipe_id_contig'] = interactions['recipe_id'].apply(lambda x: recipe_id_to_contig_map[x])


In [68]:
recipes.head(1)
[TaggedDocument(simple_preprocess(str(row[1]['name'])),
                                 [row[1]['recipe_id_contig']])
                  for row in recipes[:3].sort_values(by=['recipe_id_contig']).iterrows()]
# recipes.name.min()

[TaggedDocument(words=['bit', 'different', 'breakfast', 'pizza'], tags=[17776]),
 TaggedDocument(words=['all', 'in', 'the', 'kitchen', 'chili'], tags=[66030]),
 TaggedDocument(words=['arriba', 'baked', 'winter', 'squash', 'mexican', 'style'], tags=[80434])]

### Recipe vertex features: recipe names

In [67]:
# order recipes by id, we'll be assigning these features in the order of graph creation
recipes[:30].sort_values(by=['recipe_id_contig'])

Unnamed: 0,name,recipe_id,minutes,user_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id_contig,recipe_id_contig
5,apple a day milk shake,5289,0,1533,1999-12-06,"[15-minutes-or-less, time-to-make, course, mai...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4,553,2239
18,chinese chop suey,8559,70,4481,2001-01-27,"[weeknight, time-to-make, course, main-ingredi...","[395.4, 31.0, 20.0, 29.0, 51.0, 33.0, 8.0]",8,"['brown ground meat and onion in a large pot',...",easy one-pot dinner.,"['celery', 'onion', 'ground pork', 'soy sauce'...",7,2248,3577
19,cream of cauliflower soup vegan,23850,110,3288,2002-03-28,"[lactose, weeknight, time-to-make, course, mai...","[174.2, 4.0, 24.0, 1.0, 15.0, 1.0, 10.0]",10,['heat the oil or margarine in a soup pot and ...,this is a dairy free,"['canola oil', 'onion', 'garlic', 'cauliflower...",16,1601,13093
17,chinese candy,23933,15,35268,2002-03-29,"[15-minutes-or-less, time-to-make, course, pre...","[232.7, 21.0, 77.0, 4.0, 6.0, 38.0, 8.0]",4,['melt butterscotch chips in heavy saucepan ov...,"a little different, and oh so good. i include ...","['butterscotch chips', 'chinese noodles', 'sal...",3,17631,13140
21,cream of spinach soup vegan,24701,55,3288,2002-04-08,"[60-minutes-or-less, time-to-make, course, mai...","[64.8, 3.0, 13.0, 54.0, 4.0, 2.0, 3.0]",10,"['in a 3 qt saucepan over medium high heat , s...",thickened with a mix of cooked oats and vegies...,"['onion', 'scallion', 'apple juice', 'olive oi...",12,1601,13613
6,aww marinated olives,25274,15,21730,2002-04-14,"[15-minutes-or-less, time-to-make, course, mai...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9,10577,13966
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,12889,17776
28,fried potatoes,37073,40,1533,2002-08-13,"[60-minutes-or-less, time-to-make, course, mai...","[132.6, 8.0, 4.0, 3.0, 4.0, 5.0, 6.0]",14,"['preheat oven to 400 degrees', 'cut the potat...","my husband made these up last week, very tasty...","['red potatoes', 'margarine', 'rosemary']",3,553,21164
14,calm your nerves tonic,39959,5,37449,2002-09-10,"[15-minutes-or-less, time-to-make, preparation...","[8.2, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",6,"['combine herbs', 'dosage: one-half teaspoonfu...",this will prove a blessing to everyone who tak...,"['gentian root', 'scullcap herb', 'burnet root...",5,18871,22936
11,better than sex strawberries,42198,1460,41531,2002-10-03,"[weeknight, time-to-make, course, main-ingredi...","[734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]",8,['crush vanilla wafers into fine crumbs and li...,simple but sexy. this was in my local newspape...,"['vanilla wafers', 'butter', 'powdered sugar',...",7,21277,24281


In [31]:
''' Name representation: We're using doc2vec '''
name_documents = [TaggedDocument(simple_preprocess(str(row[1]['name'])),
                                 [row[1]['recipe_id_contig']])
                  for row in recipes.sort_values(by=['recipe_id_contig']).iterrows()]
name_model = Doc2Vec(name_documents, vector_size=20, window=2, min_count=1, workers=4)

2021-12-01 19:04:46,371: 
2021-12-01 19:04:46,372: 
2021-12-01 19:04:46,388: 
2021-12-01 19:04:46,402: 
2021-12-01 19:04:46,416: 
2021-12-01 19:04:46,429: 
2021-12-01 19:04:46,441: 
2021-12-01 19:04:46,453: 
2021-12-01 19:04:46,466: 
2021-12-01 19:04:46,478: 
2021-12-01 19:04:46,490: 
2021-12-01 19:04:46,502: 
2021-12-01 19:04:46,515: 
2021-12-01 19:04:46,527: 
2021-12-01 19:04:46,539: 
2021-12-01 19:04:46,551: 
2021-12-01 19:04:46,565: 
2021-12-01 19:04:46,577: 
2021-12-01 19:04:46,589: 
2021-12-01 19:04:46,601: 
2021-12-01 19:04:46,614: 
2021-12-01 19:04:46,626: 
2021-12-01 19:04:46,638: 
2021-12-01 19:04:46,651: 
2021-12-01 19:04:46,663: 
2021-12-01 19:04:46,671: 
2021-12-01 19:04:46,672: 
2021-12-01 19:04:46,752: 
2021-12-01 19:04:46,752: 
2021-12-01 19:04:46,860: 
2021-12-01 19:04:46,861: 
2021-12-01 19:04:46,862: 
2021-12-01 19:04:47,062: 
2021-12-01 19:04:47,062: 
2021-12-01 19:04:47,081: 
2021-12-01 19:04:48,442: 
2021-12-01 19:04:49,750: 
2021-12-01 19:04:50,889: 
2021-12-01 1

In [60]:
name_model.dv[33]
name_model.dv.get_vector(33)

# get tensor of recipe ids
torch.Tensor(recipes.recipe_id.values).int()

# get tensor of name vectors
#name_model.dv.get_vector(330)


tensor([137739,  31490, 112140,  ..., 308080, 298512, 298509],
       dtype=torch.int32)

In [None]:
# name_documents
name_model.dv.get_vector(199190)

In [None]:
recipe_id_to_contig_map[recipes_df.head(2).recipe_id.iloc[0]]

In [None]:
# torch.from_numpy(name_model.dv.vectors[0])
hg.etypes

### User vertex features: 
The options here could be:
1. The reviews users left for recipes
2. Trainable embeddings


In [None]:
''' Features for users '''


### Edge features: user review and rating

Assign each user's rating (as a `torch.Tensor([<rating_value>]`) of a recipe to the edge connecting them with that recipe 


In [None]:
print(len(interactions.user_id.unique()))
# interactions.head(4)
# review_df.head(4)
# experimenting to get it right. But this is what we send to the esdge
torch.from_numpy(review_df.rating.to_numpy().reshape(-1, 1), device=cuda).to(cuda)

In [13]:
print(torch.flip(torch.LongTensor(interactions.recipe_id_contig.values), dims=[0]))
print(torch.LongTensor(interactions.recipe_id_contig.values))

tensor([45472, 45472, 45472,  ..., 25579, 23488, 23488])
tensor([23488, 23488, 25579,  ..., 45472, 45472, 45472])


In [32]:
# Sort interactions by recipe id
interactions = interactions.sort_values(by=['user_id_contig'])

# sort recipes by user_id
recipes = recipes.sort_values(by=['recipe_id_contig'])

In [145]:
"""
    Try to build a (heterogeneous) graph, use the 
    columns with sorted contigous IDs for recipes and user IDs 
"""
hg = dgl.heterograph({
#     ('user', 'reviewed', 'recipe'): (torch.LongTensor(interactions.user_id_contig.values),
#                                      torch.LongTensor(interactions.recipe_id_contig.values)),
#     ('recipe', 'wasReviewedBy', 'user'): (torch.flip(torch.LongTensor(interactions.recipe_id_contig.values), dims=[0]),
#                                           torch.flip(torch.LongTensor(interactions.user_id_contig.values), dims=[0])),
#     ('user', 'submitted', 'recipe'): (torch.LongTensor(recipes.user_id_contig.values),
#                                       torch.LongTensor(recipes.recipe_id_contig.values)),
    ('user', 'rated', 'recipe'): (torch.LongTensor(interactions.user_id_contig.values),
                                  torch.LongTensor(interactions.recipe_id_contig.values)),
    ('recipe', 'isRated', 'user'): (torch.flip(torch.LongTensor(interactions.recipe_id_contig.values), dims=[0]),
                                    torch.flip(torch.LongTensor(interactions.user_id_contig.values), dims=[0])),
})  # , device='cuda:0')
hg

Graph(num_nodes={'recipe': 231637, 'user': 236568},
      num_edges={('recipe', 'isRated', 'user'): 1132198, ('user', 'rated', 'recipe'): 1132198},
      metagraph=[('recipe', 'user', 'isRated'), ('user', 'recipe', 'rated')])

In [None]:
# FIXME: also add reverse nodes/relationships for each relationship 

In [146]:
hg.num_nodes('user') # 25000

236568

In [None]:
# add reversed edge
# hgg = hg.add_reverse_edges(copy_edata=True)

In [None]:
print(len(set(submitted_df.user_id.values).difference(set(review_df.user_id.values))))
len(review_df.recipe_id.values)

### Assign features associated with the nodes and edges. 
These features (at this time) include: <br>
1. Recipe name ( as doc2vec embeddings ) 
2. Rating (user -> recipe (reviewed) edge) <br>

#### TBD:
3. Description (recipe) <br>
4. Ingredients (recipe) <br>
5. Steps <br>
6. calorie level <br>
7. Review (user -> recipe (reviewed) edge) <br>

In [147]:
# assign name features 
hg.ndata['name'] = {'recipe': torch.from_numpy(name_model.dv.vectors).float()}  # .to('cuda:0')}
# hg.nodes['recipe'].data['name'][torch.Tensor(recipes.recipe_id.values).int()] = torch.from_numpy(name_model.dv.vectors).float().to(hg.device)


In [28]:
hg.ndata

# torch.from_numpy(name_model.dv.vectors).float()

defaultdict(<class 'dict'>, {'name': {'recipe': tensor([[-0.2057, -0.0272, -0.0763,  ...,  0.0300, -0.1241,  0.1883],
        [ 0.0272, -0.0428, -0.0711,  ..., -0.0867,  0.0113,  0.0484],
        [-0.1144,  0.0363,  0.1403,  ..., -0.0060, -0.0717, -0.0547],
        ...,
        [ 0.0390, -0.0190,  0.0812,  ..., -0.0467, -0.0260,  0.0459],
        [ 0.0142,  0.1425, -0.0207,  ..., -0.0095,  0.0040, -0.0791],
        [-0.0166,  0.0559, -0.0348,  ..., -0.0575,  0.0448, -0.0595]])}})

In [148]:
''' assign rating features to reviewed edge'''
# hg.edata['rating'] = {'reviewed' : torch.from_numpy(review_df.rating.to_numpy().reshape(-1, 1)).to(hg.device)}
hg.edata['rating'] = {'rated': torch.LongTensor(interactions.rating.values)}  # .to('cuda:0')}
hg.edata['rating'] = {'isRated': torch.flip(torch.LongTensor(interactions.rating.values), dims=[0])}  # .to('cuda:0')}
# FIXME: ratings will be different edge types  not edge features 

In [151]:
hg.edata['rating']

{('recipe', 'isRated', 'user'): tensor([5, 5, 5,  ..., 5, 5, 5]),
 ('user', 'rated', 'recipe'): tensor([5, 5, 5,  ..., 5, 5, 5])}

In [None]:
hg.num_edges('reviewed')
torch.zeros(5, dtype=torch.bool).bernoulli(0.7).to('cpu')

In [152]:
'''
    Randomly generate masks training masks on recipe 
    and user nodes and reviewed edges 
'''
hg.nodes['user'].data['train_mask'] = torch.zeros(hg.num_nodes('user'),
                                                  dtype=torch.bool).bernoulli(0.7).to(hg.device)
hg.nodes['recipe'].data['train_mask'] = torch.zeros(hg.num_nodes('recipe'),
                                                  dtype=torch.bool).bernoulli(0.7).to(hg.device)

hg.edges['rated'].data['train_mask'] = torch.zeros(hg.num_edges('rated'),
                                                   dtype=torch.bool).bernoulli(0.7).to(hg.device)

### Assign calories as features to recipes vertices

In [132]:
recipes.ingredients.head(2).iloc[1]
# simple_preprocess("this is just another sentence")

['ground beef',
 'onion',
 'diced tomatoes',
 'red kidney beans',
 'taco seasoning',
 'corn',
 'green chilies',
 'jalapeno',
 'ranch dressing',
 'bow tie pasta',
 'sour cream']

In [None]:
"""
    We need change ingredient representation from their string form to a 
    tensor representation. Since ingredients is a list of string tokens,
    we can represent the ingredients as a document vector using doc2vec 
"""
# 1. format ingredients column
recipes['ingredients'] = recipes.ingredients.apply(ast.literal_eval)

In [37]:
''' 2. Create embeddings for ingredient tokens in every recipe '''

token_documents = [TaggedDocument(row[1]['ingredients'],
                                  [row[1]['recipe_id_contig']])
                  for row in recipes.sort_values(by=['recipe_id_contig']).iterrows()]
token_model = Doc2Vec(token_documents, vector_size=70, window=2, min_count=1, workers=4)

2021-12-01 19:08:08,710: 
2021-12-01 19:08:08,710: 
2021-12-01 19:08:08,742: 
2021-12-01 19:08:08,769: 
2021-12-01 19:08:08,794: 
2021-12-01 19:08:08,819: 
2021-12-01 19:08:08,843: 
2021-12-01 19:08:08,868: 
2021-12-01 19:08:08,892: 
2021-12-01 19:08:08,917: 
2021-12-01 19:08:08,942: 
2021-12-01 19:08:08,967: 
2021-12-01 19:08:08,992: 
2021-12-01 19:08:09,020: 
2021-12-01 19:08:09,045: 
2021-12-01 19:08:09,070: 
2021-12-01 19:08:09,093: 
2021-12-01 19:08:09,119: 
2021-12-01 19:08:09,143: 
2021-12-01 19:08:09,168: 
2021-12-01 19:08:09,192: 
2021-12-01 19:08:09,217: 
2021-12-01 19:08:09,243: 
2021-12-01 19:08:09,268: 
2021-12-01 19:08:09,296: 
2021-12-01 19:08:09,306: 
2021-12-01 19:08:09,306: 
2021-12-01 19:08:09,343: 
2021-12-01 19:08:09,345: 
2021-12-01 19:08:09,404: 
2021-12-01 19:08:09,405: 
2021-12-01 19:08:09,405: 
2021-12-01 19:08:09,515: 
2021-12-01 19:08:09,515: 
2021-12-01 19:08:09,597: 
2021-12-01 19:08:10,631: 
2021-12-01 19:08:11,641: 
2021-12-01 19:08:12,660: 
2021-12-01 1

In [153]:
''' 3. Assign ingredient embeddings as recipe features '''
hg.ndata['ingredients'] = {'recipe': torch.from_numpy(token_model.dv.vectors).float()}  # .to('cuda:0')}

In [162]:
''' For user features, I should try 2 options:
    1. use the review they left for the recipe
    2. Use trainable embeddings
'''
interactions.head(2)

Unnamed: 0,user_id,recipe_id,date,rating,review,user_id_contig,recipe_id_contig
279197,1572865,84247,2010-06-11,5,What a winner!!! I didn't want to heat up the...,0,49001
302320,524293,90031,2007-06-25,5,"This is a great recipe. However, I substituted...",1,52586


In [310]:
hg.edges(etype='rated')[1]

tensor([49001, 52586, 87356,  ..., 12439, 83168, 14645], device='cuda:0')

In [59]:
# FIXME instead of, just add train, test, val masks on vertices 
# and train/test/validate with those vertices
# memory intensive

def train_test_split(graph: dgl.heterograph):
    '''
        Split graph for training and testing
    '''
    u, v = graph.edges(etype='rated')
    
    eids = np.arange(graph.number_of_edges('rated'))
    eids = np.random.permutation(eids)
    
    test_size = int(len(eids)*0.15)  # 15% for testing
    train_size = graph.number_of_edges('rated') - test_size  # 85% for training
    
    test_pos_u, test_pos_v = u[eids[:test_size]], v[eids[:test_size]]
    train_pos_u, train_pos_v = u[eids[test_size:]], v[eids[test_size:]]
    
    # Find all negative edges and split them for training and testing
    adj = ssp.coo_matrix((np.ones(len(u)), (u.cpu().numpy(), v.cpu().numpy())))
    # adj_neg = 1 - adj.todense() - np.eye(graph.number_of_nodes())
    adj_neg_u = 1 - adj.todense() - np.eye(graph.number_of_src_nodes('user'))
    adj_neg_v = 1 - adj.todense() - np.eye(graph.number_of_src_nodes('recipe'))
    neg_u = np.where(adj_neg_u != 0)
    neg_v = np.where(adj_neg_v != 0)
    
    neg_eids = np.random.choice(len(neg_u), graph.number_of_edges('rated') // 2)
    test_neg_u, test_neg_v = neg_u[neg_eids[:test_size]], neg_v[neg_eids[:test_size]]
    train_neg_u, train_neg_v = neg_u[neg_eids[test_size:]], neg_v[neg_eids[test_size:]]
    
    train_g = graph.remove_edges(eids[:test_size], etype='rated')
    
    # oops
    train_pos_g = dgl.heterograph({('user', 'rated', 'recipe'), (train_pos_u, train_pos_v)}, device='cuda:0')
    train_net_g = dgl.heterograph({('user', 'rated', 'recipe'), (train_neg_u, train_neg_v)}, device='cuda:0')
    
    return train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g
    

In [60]:
train_g, train_pos_g, train_neg_g, test_pos_g, test_neg_g = train_test_split(hg)

MemoryError: Unable to allocate 408. GiB for an array with shape (236568, 231637) and data type float64

In [97]:
hg.edges['rated'].data['train_mask'][2] == True

tensor(True)

In [111]:
# train test split 
# train_eid_dict = {etype: hg.edges(etype=etype, form='eid') for etype in hg.etypes }

#train_eid_dict = {'rated': (graph.edges['rated'].data['train_mask'] == 1).nonzero(as_tuple=True)[0] for etype in hg.etypes}
#val_eid_dict   = {'rated': (graph.edges['rated'].data['test_mask'] == 2).nonzero(as_tuple=True)[0] for etype in hg.etypes}
# {'rated': (hg.edges['rated'].data['train_mask'] == 2).nonzero(as_tuple=True)[0] for etype in hg.etypes}
# {'rated': (hg.edges['rated'].data['train_mask'] == True).nonzero(as_tuple=True)[0] for etype in hg.etypes}
{'rated': (hg.edges['rated'].data['train_mask'] == True).nonzero(as_tuple=True)[0] for etype in hg.etypes}

{'rated': tensor([      0,       1,       2,  ..., 1132193, 1132194, 1132197])}

In [31]:
# visualize a graph
def plot_graph(nxg):
    ag = pgv.AGraph(strict=False, directed=True)
    
    for u, v, k in nxg.edges(keys=True):
        ag.add_edge(u, v, label=k)

    ag.layout('dot')
    ag.draw('graph.png')

In [82]:
plot_graph(hg.metagraph())

In [39]:
'''
    Construct a negative edge graph for an edge type being sampled
'''
def construct_negative_graph(graph: dgl.heterograph, k, etype):
    utype, _, v_type = etype
    src, dst = graph.edges(etype=etype)
    
    neg_src = src.repeat_interleave(k)
    # neg_dst = 
    neg_dst = torch.randint(0, graph.num_nodes(v_type), (len(src) * k,), device='cuda:0') # shouldn't be random

    return dgl.heterograph({etype: (neg_src, neg_dst)},
                           num_nodes_dict = {ntype: graph.num_nodes(ntype) for ntype in graph.ntypes},
                           device='cuda:0')


In [40]:
''' Compute scores for an edge type in prediction '''
class HeteroDotProductPredictor(torch.nn.Module):
    ''' Dot predictor for heterograph edges '''
    
    def forward(self, graph, h, etype):
        print(etype)
        # assert(h)
        with graph.local_scope():
            # 'h' contains the node representations computed previously
            graph.ndata['h'] = h
            # print(graph.ntypes)
            graph.apply_edges(dgl.function.u_dot_v('h', 'h', 'score'), etype=etype)

            return graph.edges[etype].data['score']


class ScorePredictor(torch.nn.Module):
    def forward(self, edge_subgraph, x):
        with edge_subgraph.local_scope():
            edge_subgraph.ndata['x'] = x
            for etype in edge_subgraph.canonical_etypes:
                edge_subgraph.apply_edges(dgl.function.u_dot_v('x', 'x', 'score'), etype=etype)
            return edge_subgraph.edata['score']

In [116]:
''' Dot predictor for edges '''
class DotPredictor(torch.nn.Module):
    
    def forward(self, graph, feat):
        with graph.local_scope():
            # using the name feature
            g.ndata['h'] = h

            # compute a new edge feature named 'score' as a dot-product
            # between source vertex feature 'feature' and destination
            # vertex feature 'ingredients'
            graph.apply_edges(fn.u_dot_v('h', 'h', 'score'))
            
            return graph.edata['score'][:, 0]

In [41]:
# Loss computation
def compute_loss(pos_score, neg_score):
    # Margin loss 
    n_edges = pos_score.shape[0]
    return (1 - pos_score.unsqueeze(1) + neg_score.view(n_edges, -1)).clamp(min=0).mean()

# Heterogenoeuos graph loss computation
def compute_loss_hetero(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores.squeeze(1), labels)

# hinge loss
def compute_hinge_loss(pos_score, neg_score):
    # an example hinge loss
    n = pos_score.shape[0]
    return (neg_score.view(n, -1) - pos_score.view(n, -1) + 1).clamp(min=0).mean()

# auc
def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat([torch.ones(pos_score.shape[0]),
                        torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [99]:
''' 2 layer GraphSAGE model'''
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')
        
    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [4]:
'''
    Heterograph Relational Conv Model with 3 layers to 
    learn the representation of the nodes (embeddings)
'''
class RGCN(torch.nn.Module):
    def __init__(self, in_feat_size, hid_feat_size, out_feat_size, rel_names):
        super().__init__()

        self.conv1 = dgl.nn.HeteroGraphConv({rel: dgl.nn.GraphConv(in_feat_size,
                                                                   hid_feat_size,
                                                                  norm='right') for rel in rel_names},
                                            aggregate='sum')

        self.conv2 = dgl.nn.HeteroGraphConv({rel: dgl.nn.GraphConv(hid_feat_size,
                                                                   out_feat_size,
                                                                  norm='right') for rel in rel_names},
                                            aggregate='sum')


#     def forward(self, graph, node_features):
#         h = self.conv1(graph, node_features)
#         h = {k: F.leaky_relu(v) for k, v in h.items()}
#         h = self.conv2(graph, h)

#         return h
    def forward(self, blocks, in_feat):

        x = self.conv1(blocks[0], in_feat)
        x = {k: F.leaky_relu(v) for k, v in x.items()}
        x = self.conv2(blocks[1], x)
        return x

In [3]:
dgl.__version__

'0.7.2'

In [43]:
''' 
    Link Prediction Model
'''
class Model(torch.nn.Module):
    def __init__(self, graph, in_features, hidden_features, out_features, etypes):
        super().__init__()
        self.rgcn = RGCN(in_features, hidden_features, out_features, etypes)
        # self.rgcn = HeteroRGCN(graph, in_features, hidden_features, out_features)
        # self.pred = HeteroDotProductPredictor()
        self.pred = ScorePredictor()

#     def forward(self, g, neg_g, node_feats, etype):
#         h = self.rgcn(g, node_feats)  # features stored as self.embed in RGCN
#         assert(h)
#         # edge_sg = dgl.edge_subgraph(g, )

#         return self.pred(g, h, etype), self.pred(neg_g, h, etype)
    def forward(self, pos_g, neg_g, blocks, in_feat):
        in_feat = self.rgcn(blocks, in_feat)
        pos_score = self.pred(pos_g, in_feat)  #, 'rated')
        neg_score = self.pred(neg_g, in_feat)  #, 'isRated')
        
        return pos_score, neg_score
        

In [44]:
class NegativeSampler(object):
    def __init__(self, g, k):
        # caches the probability distribution
        self.weights = {
            etype: g.in_degrees(etype=etype).float() ** 0.75
            for _, etype, _ in g.canonical_etypes
        }
        self.k = k

    def __call__(self, g, eids_dict):
        result_dict = {}
        for etype, eids in eids_dict.items():
            src, _ = g.find_edges(eids, etype=etype)
            src = src.repeat_interleave(self.k)
            dst = self.weights[etype].multinomial(len(src), replacement=True)
            result_dict[etype] = (src, dst)
        return result_dict

In [154]:
embed_dict = {'user' : torch.nn.Parameter(torch.FloatTensor(hg.num_nodes('user'), 70))}  # .to('cuda:0'))}
torch.nn.init.xavier_uniform_(embed_dict['user'])

hg.ndata['feat'] = {'user': embed_dict['user']}
# hg.ndata['ingredients'] = {'recipe': torch.from_numpy(token_model.dv.vectors).float().to(hg.device)}

In [155]:
hg

Graph(num_nodes={'recipe': 231637, 'user': 236568},
      num_edges={('recipe', 'isRated', 'user'): 1132198, ('user', 'rated', 'recipe'): 1132198},
      metagraph=[('recipe', 'user', 'isRated'), ('user', 'recipe', 'rated')])

In [156]:
k=5

model = Model(hg, 70, 40, 1, hg.etypes)  # .to('cuda:0')
opt = torch.optim.Adam(model.parameters())
# train_mask = hg.edges['rated'].data['train_mask']

In [47]:
user_feat = hg.ndata['feat']['user']
rec_feat = hg.ndata['ingredients']['recipe']

node_features = {'user': user_feat, 'recipe': rec_feat}

In [157]:
train_eid_dict = {etype: hg.edges(etype=etype, form='eid') for etype in hg.etypes }
train_eid_dict

{'isRated': tensor([      0,       1,       2,  ..., 1132195, 1132196, 1132197]),
 'rated': tensor([      0,       1,       2,  ..., 1132195, 1132196, 1132197])}

In [158]:
# validation set
val_eid_dict = {'rated': (hg.edges['rated'].data['train_mask'] == False).nonzero(as_tuple=True)[0] for etype in hg.etypes}

In [159]:
# training set
train_eid_dict = {'rated': (hg.edges['rated'].data['train_mask'] == True).nonzero(as_tuple=True)[0] for etype in hg.etypes}

In [160]:
sampler = dgl.dataloading.MultiLayerFullNeighborSampler(2)

dataloader2 = dgl.dataloading.EdgeDataLoader(hg,
                                             train_eid_dict,
                                             sampler,
                                             negative_sampler=NegativeSampler(hg, 4),
                                             batch_size=128,
                                             shuffle=True,
                                             drop_last=False,
                                             device='cpu')

In [161]:
hg = hg.to('cpu')

In [144]:
# for epoch in range(10):
#     negative_graph = construct_negative_graph(hg, k, ('user', 'rated', 'recipe'))
#     # print("negative graph: {}".format(negative_graph))

#     pos_score, neg_score = model(hg, negative_graph, node_features, ('user', 'rated', 'recipe'))
    
#     # loss
#     loss = compute_loss(pos_score, neg_score)

#     opt.zero_grad()
#     loss.backward()
#     opt.step()

#     print(loss.item())
    
#     # validation accuracy

# score[('user', 'rated', 'recipe')].shape
score

Block(num_src_nodes={'recipe': 163588, 'user': 113675},
      num_dst_nodes={'recipe': 44803, 'user': 7699},
      num_edges={('recipe', 'isRated', 'user'): 516855, ('recipe', 'wasReviewedBy', 'user'): 516855, ('user', 'rated', 'recipe'): 515003, ('user', 'reviewed', 'recipe'): 515003, ('user', 'submitted', 'recipe'): 44803},
      metagraph=[('recipe', 'user', 'isRated'), ('recipe', 'user', 'wasReviewedBy'), ('user', 'recipe', 'rated'), ('user', 'recipe', 'reviewed'), ('user', 'recipe', 'submitted')])

In [162]:
epoch = 200
score = {}
for input_nodes, pos_graph, neg_graph, blocks in tqdm.tqdm(dataloader2):
    if epoch > 0:
    # move to gpu
#     blocks = [b.to(torch.device('cuda')) for b in blocks]
#     pos_graph = pos_graph.to(torch.device('cuda'))
#     neg_graph = neg_graph.to(torch.device('cuda'))
    
        node_feat = {'user': blocks[0].srcdata['feat']['user'], 
                     'recipe': blocks[0].srcdata['ingredients']['recipe']}
        # print(node_feat)

        pos_score, neg_score = model(pos_graph, neg_graph, blocks, node_feat)

        loss = compute_loss_hetero(pos_score[('user', 'rated', 'recipe')],
                                   neg_score[('user', 'rated', 'recipe')])
        opt.zero_grad()
        loss.backward()
        opt.step()

        # print(loss.item())
        epoch = epoch - 1
#        score = blocks[0]
    else:
        break
        print("Done!")

  0%|          | 2/6194 [00:00<20:47,  4.96it/s]

0.6931690573692322
0.6931028366088867


  0%|          | 4/6194 [00:00<19:30,  5.29it/s]

0.6930379271507263
0.6929725408554077


  0%|          | 6/6194 [00:01<19:27,  5.30it/s]

0.6928800940513611
0.6927483081817627


  0%|          | 8/6194 [00:01<20:40,  4.98it/s]

0.6925919651985168
0.6924353837966919


  0%|          | 10/6194 [00:01<19:35,  5.26it/s]

0.6921769380569458
0.6919278502464294


  0%|          | 12/6194 [00:02<20:07,  5.12it/s]

0.6916764974594116
0.691350519657135


  0%|          | 13/6194 [00:02<20:36,  5.00it/s]

0.6909536719322205


  0%|          | 15/6194 [00:02<21:00,  4.90it/s]

0.6905575394630432
0.6901613473892212


  0%|          | 16/6194 [00:03<20:37,  4.99it/s]

0.6895794868469238


  0%|          | 17/6194 [00:03<20:57,  4.91it/s]

0.6889418363571167


  0%|          | 19/6194 [00:03<20:25,  5.04it/s]

0.6884115934371948
0.6878353357315063


  0%|          | 21/6194 [00:04<20:25,  5.04it/s]

0.686949610710144
0.6861189007759094


  0%|          | 23/6194 [00:04<21:05,  4.88it/s]

0.6852006316184998
0.6844111680984497


  0%|          | 25/6194 [00:04<20:28,  5.02it/s]

0.6834040880203247
0.6821137070655823


  0%|          | 27/6194 [00:05<20:03,  5.12it/s]

0.6807791590690613
0.6798431873321533


  0%|          | 29/6194 [00:05<20:04,  5.12it/s]

0.6780838370323181
0.6770498156547546


  1%|          | 31/6194 [00:06<19:14,  5.34it/s]

0.6746453046798706
0.6727421283721924


  1%|          | 33/6194 [00:06<19:23,  5.30it/s]

0.6708905100822449
0.669196367263794


  1%|          | 35/6194 [00:06<19:26,  5.28it/s]

0.6663357019424438
0.6643860936164856


  1%|          | 36/6194 [00:07<19:31,  5.26it/s]

0.6616536974906921


  1%|          | 38/6194 [00:07<19:52,  5.16it/s]

0.6603516936302185
0.6562414169311523


  1%|          | 40/6194 [00:07<19:55,  5.15it/s]

0.65362948179245
0.6507973670959473


  1%|          | 41/6194 [00:08<20:22,  5.04it/s]

0.6471969485282898


  1%|          | 43/6194 [00:08<20:29,  5.00it/s]

0.6451069712638855
0.640617847442627


  1%|          | 44/6194 [00:08<20:17,  5.05it/s]

0.6382246613502502


  1%|          | 46/6194 [00:09<20:07,  5.09it/s]

0.6325049996376038
0.6305027604103088


  1%|          | 48/6194 [00:09<19:58,  5.13it/s]

0.625185489654541
0.6204826831817627


  1%|          | 50/6194 [00:09<19:33,  5.23it/s]

0.6157106161117554
0.6110453605651855


  1%|          | 52/6194 [00:10<21:19,  4.80it/s]

0.6062970161437988
0.6004830598831177


  1%|          | 54/6194 [00:10<21:28,  4.76it/s]

0.5957850813865662
0.5898532271385193


  1%|          | 56/6194 [00:11<20:31,  4.99it/s]

0.582658588886261
0.5776313543319702


  1%|          | 58/6194 [00:11<19:46,  5.17it/s]

0.5716444253921509
0.5642405152320862


  1%|          | 60/6194 [00:11<20:08,  5.07it/s]

0.5574548244476318
0.5518512725830078


  1%|          | 62/6194 [00:12<19:35,  5.21it/s]

0.5411519408226013
0.5361733436584473


  1%|          | 63/6194 [00:12<19:38,  5.20it/s]

0.528210461139679


  1%|          | 65/6194 [00:12<20:38,  4.95it/s]

0.5213608741760254
0.5097223520278931


  1%|          | 66/6194 [00:13<20:06,  5.08it/s]

0.5025468468666077


  1%|          | 67/6194 [00:13<20:17,  5.03it/s]

0.4962252974510193


  1%|          | 68/6194 [00:13<22:36,  4.52it/s]

0.4896339774131775
0.47957363724708557

  1%|          | 70/6194 [00:13<21:00,  4.86it/s]


0.46856486797332764


  1%|          | 72/6194 [00:14<19:59,  5.10it/s]

0.45964598655700684
0.458944171667099


  1%|          | 74/6194 [00:14<19:29,  5.23it/s]

0.44320690631866455
0.4317178726196289


  1%|          | 75/6194 [00:14<19:31,  5.22it/s]

0.41821983456611633


  1%|          | 76/6194 [00:15<20:49,  4.90it/s]

0.4169366657733917


  1%|▏         | 78/6194 [00:15<20:12,  5.05it/s]

0.40397390723228455
0.39737051725387573


  1%|▏         | 80/6194 [00:15<19:19,  5.27it/s]

0.38507184386253357
0.37736207246780396


  1%|▏         | 82/6194 [00:16<19:46,  5.15it/s]

0.3632936179637909
0.3552424907684326


  1%|▏         | 83/6194 [00:16<20:09,  5.05it/s]

0.34703829884529114


  1%|▏         | 84/6194 [00:16<20:56,  4.86it/s]

0.33227095007896423


  1%|▏         | 86/6194 [00:17<20:30,  4.96it/s]

0.3213173449039459
0.3129938542842865


  1%|▏         | 88/6194 [00:17<19:51,  5.12it/s]

0.3026718199253082
0.2939436137676239


  1%|▏         | 90/6194 [00:17<20:01,  5.08it/s]

0.2837427258491516
0.27459847927093506


  1%|▏         | 92/6194 [00:18<19:34,  5.20it/s]

0.2574712932109833
0.25336116552352905


  2%|▏         | 94/6194 [00:18<19:11,  5.30it/s]

0.2417244017124176
0.2362724244594574


  2%|▏         | 96/6194 [00:19<19:37,  5.18it/s]

0.22280970215797424
0.2155151069164276


  2%|▏         | 98/6194 [00:19<18:51,  5.39it/s]

0.21918240189552307
0.1991904079914093


  2%|▏         | 100/6194 [00:19<19:10,  5.30it/s]

0.1898353397846222
0.18166542053222656


  2%|▏         | 102/6194 [00:20<19:38,  5.17it/s]

0.1845439076423645
0.1679472029209137


  2%|▏         | 104/6194 [00:20<19:53,  5.10it/s]

0.1613769233226776
0.15784598886966705


  2%|▏         | 106/6194 [00:20<19:38,  5.16it/s]

0.15053808689117432
0.14339357614517212


  2%|▏         | 107/6194 [00:21<19:47,  5.13it/s]

0.13456463813781738


  2%|▏         | 109/6194 [00:21<20:05,  5.05it/s]

0.1291411966085434
0.12117983400821686


  2%|▏         | 110/6194 [00:21<20:01,  5.06it/s]

0.11745069175958633


  2%|▏         | 112/6194 [00:22<20:42,  4.89it/s]

0.10911988466978073
0.11058544367551804


  2%|▏         | 114/6194 [00:22<20:19,  4.99it/s]

0.10736220329999924
0.09697875380516052


  2%|▏         | 116/6194 [00:23<21:18,  4.75it/s]

0.09126818925142288
0.09512175619602203


  2%|▏         | 118/6194 [00:23<20:50,  4.86it/s]

0.08918921649456024
0.07915637642145157


  2%|▏         | 120/6194 [00:23<20:08,  5.03it/s]

0.07703850418329239
0.0747142881155014


  2%|▏         | 122/6194 [00:24<20:36,  4.91it/s]

0.06649245321750641
0.06917354464530945


  2%|▏         | 124/6194 [00:24<19:53,  5.09it/s]

0.06211234629154205
0.05906194821000099


  2%|▏         | 125/6194 [00:24<20:17,  4.98it/s]

0.0674847662448883


  2%|▏         | 126/6194 [00:25<20:41,  4.89it/s]

0.05425361916422844


  2%|▏         | 128/6194 [00:25<20:19,  4.97it/s]

0.06112289056181908
0.05734077841043472


  2%|▏         | 129/6194 [00:25<20:07,  5.02it/s]

0.048056792467832565


  2%|▏         | 131/6194 [00:26<20:24,  4.95it/s]

0.05315603315830231
0.049149610102176666


  2%|▏         | 133/6194 [00:26<19:58,  5.06it/s]

0.04329296201467514
0.04337528720498085


  2%|▏         | 134/6194 [00:26<19:57,  5.06it/s]

0.0379391647875309


  2%|▏         | 136/6194 [00:27<20:03,  5.03it/s]

0.046033747494220734
0.04334692284464836


  2%|▏         | 138/6194 [00:27<20:27,  4.93it/s]

0.036803435534238815
0.03517022728919983


  2%|▏         | 140/6194 [00:27<20:00,  5.04it/s]

0.033649422228336334
0.03494492173194885


  2%|▏         | 142/6194 [00:28<19:43,  5.11it/s]

0.031358614563941956
0.03132996708154678


  2%|▏         | 143/6194 [00:28<20:34,  4.90it/s]

0.03294241055846214


  2%|▏         | 144/6194 [00:28<23:11,  4.35it/s]

0.029981229454278946


  2%|▏         | 146/6194 [00:29<21:09,  4.76it/s]

0.03009893000125885
0.027961838990449905


  2%|▏         | 148/6194 [00:29<19:53,  5.07it/s]

0.029480500146746635
0.026359079405665398


  2%|▏         | 150/6194 [00:29<19:54,  5.06it/s]

0.026375140994787216
0.025053003802895546


  2%|▏         | 151/6194 [00:30<19:45,  5.10it/s]

0.024078264832496643


  2%|▏         | 153/6194 [00:30<20:43,  4.86it/s]

0.02807074412703514
0.025230342522263527


  3%|▎         | 155/6194 [00:30<20:04,  5.01it/s]

0.022569457069039345
0.021402625367045403


  3%|▎         | 157/6194 [00:31<19:43,  5.10it/s]

0.020999561995267868
0.021452363580465317


  3%|▎         | 159/6194 [00:31<19:58,  5.04it/s]

0.021516527980566025
0.019638625904917717


  3%|▎         | 161/6194 [00:32<20:04,  5.01it/s]

0.018879469484090805
0.017032839357852936


  3%|▎         | 163/6194 [00:32<19:11,  5.24it/s]

0.018605709075927734
0.01858840137720108


  3%|▎         | 165/6194 [00:32<19:18,  5.20it/s]

0.01841989904642105
0.017416153103113174


  3%|▎         | 166/6194 [00:33<19:46,  5.08it/s]

0.017562780529260635


  3%|▎         | 167/6194 [00:33<20:27,  4.91it/s]

0.017239725217223167


  3%|▎         | 168/6194 [00:33<20:53,  4.81it/s]

0.018652815371751785


  3%|▎         | 170/6194 [00:34<21:50,  4.60it/s]

0.014114596880972385
0.020769156515598297


  3%|▎         | 172/6194 [00:34<20:52,  4.81it/s]

0.014692728407680988
0.01708448864519596


  3%|▎         | 174/6194 [00:34<20:11,  4.97it/s]

0.013770339079201221
0.013550725765526295


  3%|▎         | 176/6194 [00:35<19:56,  5.03it/s]

0.013142875395715237
0.014062770642340183


  3%|▎         | 178/6194 [00:35<18:57,  5.29it/s]

0.012532751075923443
0.014915835112333298


  3%|▎         | 180/6194 [00:35<19:09,  5.23it/s]

0.015007569454610348
0.015900570899248123


  3%|▎         | 182/6194 [00:36<19:15,  5.20it/s]

0.011218897998332977
0.011463647708296776


  3%|▎         | 183/6194 [00:36<20:06,  4.98it/s]

0.012379529885947704


  3%|▎         | 185/6194 [00:36<19:52,  5.04it/s]

0.013941925950348377
0.0108179971575737


  3%|▎         | 187/6194 [00:37<18:55,  5.29it/s]

0.011215431615710258
0.016557976603507996


  3%|▎         | 189/6194 [00:37<18:52,  5.30it/s]

0.011217441409826279
0.014858797192573547


  3%|▎         | 190/6194 [00:37<19:00,  5.26it/s]

0.009061486460268497


  3%|▎         | 191/6194 [00:38<20:09,  4.96it/s]

0.012140271253883839


  3%|▎         | 192/6194 [00:38<20:11,  4.95it/s]

0.010831507854163647


  3%|▎         | 193/6194 [00:38<20:09,  4.96it/s]

0.008095336146652699


  3%|▎         | 195/6194 [00:38<20:22,  4.91it/s]

0.012925926595926285
0.011224363930523396


  3%|▎         | 197/6194 [00:39<20:28,  4.88it/s]

0.01201404258608818
0.010883968323469162


  3%|▎         | 199/6194 [00:39<20:33,  4.86it/s]

0.010220387950539589
0.008281300775706768


  3%|▎         | 200/6194 [00:40<20:00,  4.99it/s]

0.01226048357784748





## Attempt to make dataset, rather than make the graph from scratch with dataframes

In [1]:
torch.cuda.empty_cache()

NameError: name 'torch' is not defined

In [8]:
torch.cuda.device_count()
torch.cuda.set_device(1)

In [32]:
train_eids = hg.edge_ids(hg.edges(etype='rated')[0], hg.edges(etype='rated')[1], etype='rated')
# hg.edges(etype='rated')[1]

In [85]:
dataloader = dgl.dataloading.EdgeDataLoader(hg, 
                                            {'rated': train_eids},
                                            dgl.dataloading.MultiLayerFullNeighborSampler(3),
                                            batch_size=64,
                                            shuffle=True,
                                            device='cpu')

In [55]:
model


Model(
  (rgcn): RGCN(
    (conv1): HeteroGraphConv(
      (mods): ModuleDict(
        (isRated): GraphConv(in=70, out=40, normalization=right, activation=None)
        (wasReviewedBy): GraphConv(in=70, out=40, normalization=right, activation=None)
        (rated): GraphConv(in=70, out=40, normalization=right, activation=None)
        (reviewed): GraphConv(in=70, out=40, normalization=right, activation=None)
        (submitted): GraphConv(in=70, out=40, normalization=right, activation=None)
      )
    )
    (conv2): HeteroGraphConv(
      (mods): ModuleDict(
        (isRated): GraphConv(in=40, out=1, normalization=right, activation=None)
        (wasReviewedBy): GraphConv(in=40, out=1, normalization=right, activation=None)
        (rated): GraphConv(in=40, out=1, normalization=right, activation=None)
        (reviewed): GraphConv(in=40, out=1, normalization=right, activation=None)
        (submitted): GraphConv(in=40, out=1, normalization=right, activation=None)
      )
    )
  )
  (p

In [56]:
model

Model(
  (rgcn): RGCN(
    (conv1): HeteroGraphConv(
      (mods): ModuleDict(
        (isRated): GraphConv(in=70, out=40, normalization=right, activation=None)
        (wasReviewedBy): GraphConv(in=70, out=40, normalization=right, activation=None)
        (rated): GraphConv(in=70, out=40, normalization=right, activation=None)
        (reviewed): GraphConv(in=70, out=40, normalization=right, activation=None)
        (submitted): GraphConv(in=70, out=40, normalization=right, activation=None)
      )
    )
    (conv2): HeteroGraphConv(
      (mods): ModuleDict(
        (isRated): GraphConv(in=40, out=1, normalization=right, activation=None)
        (wasReviewedBy): GraphConv(in=40, out=1, normalization=right, activation=None)
        (rated): GraphConv(in=40, out=1, normalization=right, activation=None)
        (reviewed): GraphConv(in=40, out=1, normalization=right, activation=None)
        (submitted): GraphConv(in=40, out=1, normalization=right, activation=None)
      )
    )
  )
  (p

In [132]:
def get_hits(edges_df, h, h_test):
    ''' Get list of hits'''
    hist = []
    edges = edges_df
    for i in range(h.shape[0]):
        true_edges = list(edges[edges.asin == i].recipe_id)
        dist = torch.cdist(h_test[[i]], h)
        top_k = torch.topk(dist, k=500, largest=False)[1]
        hit = 0
        for j in true_edges:
            if j in top_k:
                hit = 1
                break
        hits.apend(hit)
    return hits

In [133]:
# if hit:
# hits = 
get_hits(val_eid_dict, h, model(test_pos_g, node_features))
print(np.mean(hits))

NameError: name 'h' is not defined

In [71]:
hg.edge_ids(hg.nodes('user'), hg.nodes('recipe'), etype=('user', 'rated', 'recipe'))

DGLError: [19:57:54] /opt/dgl/src/array/cpu/csr_get_data.cc:47: Check failed: (rowlen == collen) || (rowlen == 1) || (collen == 1): Invalid row and col id array.
Stack trace:
  [bt] (0) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x4f) [0x7f7128db93bf]
  [bt] (1) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::runtime::NDArray dgl::aten::impl::CSRGetData<(DLDeviceType)1, long, long>(dgl::aten::CSRMatrix, dgl::runtime::NDArray, dgl::runtime::NDArray, bool, dgl::runtime::NDArray, long)+0xdd) [0x7f7128df083d]
  [bt] (2) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::runtime::NDArray dgl::aten::impl::CSRGetData<(DLDeviceType)1, long>(dgl::aten::CSRMatrix, dgl::runtime::NDArray, dgl::runtime::NDArray)+0x13f) [0x7f7128db4c7f]
  [bt] (3) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::aten::CSRGetData(dgl::aten::CSRMatrix, dgl::runtime::NDArray, dgl::runtime::NDArray)+0x7d2) [0x7f7128dad592]
  [bt] (4) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::UnitGraph::CSR::EdgeIdsOne(unsigned long, dgl::runtime::NDArray, dgl::runtime::NDArray) const+0x56) [0x7f712970d596]
  [bt] (5) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::UnitGraph::EdgeIdsOne(unsigned long, dgl::runtime::NDArray, dgl::runtime::NDArray) const+0x8d) [0x7f712970752d]
  [bt] (6) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(dgl::HeteroGraph::EdgeIdsOne(unsigned long, dgl::runtime::NDArray, dgl::runtime::NDArray) const+0x66) [0x7f712960aac6]
  [bt] (7) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(+0xe0c25c) [0x7f712961225c]
  [bt] (8) /home/trique/.local/lib/python3.8/site-packages/dgl/libdgl.so(DGLFuncCall+0x48) [0x7f712959f518]



In [80]:
hg.edata['rating'][('user', 'rated', 'recipe')]

tensor([5, 5, 5,  ..., 5, 5, 5])

In [84]:
hg.edges(form='all', etype='score')

DGLError: Edge type "score" does not exist.

In [83]:
hg[dgl.ETYPE]

DGLError: Invalid key "_TYPE". Must be one of the edge types.