In [1]:
# !pip install -q tensorflow-recommenders
# !pip install -q --upgrade tensorflow-datasets
# !pip install -q scann

In [2]:
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import pandas as pd

In [3]:
import tensorflow_recommenders as tfrs

In [4]:
import pickle

# Constants 

In [5]:
RATINGS_SMALL = "../EDA_files/ratings_small.parquet"
RECIPES_SMALL = "../EDA_files/recipes_small.parquet"
INDEX_TO_RECIPE_OBJ = "../EDA_files/index_to_recipe.obj"
RECIPE_TO_INDEX_OBJ = "../EDA_files/recipe_to_index.obj"

# Load data 

In [6]:
recipes_small = pd.read_parquet(RECIPES_SMALL)
ratings_small = pd.read_parquet(RATINGS_SMALL)

In [7]:
with open('../EDA_files/ratings_small.obj', 'wb') as pickle_file:
     pickle.dump(ratings_small, pickle_file)

In [8]:
with open('../EDA_files/recipes_small.obj', 'wb') as pickle_file:
     pickle.dump(recipes_small, pickle_file)

# Prepare dataset

In [9]:
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401982 entries, 0 to 1401981
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype              
---  ------         --------------    -----              
 0   RecipeId       1401982 non-null  int32              
 1   AuthorId       1401982 non-null  int32              
 2   Rating         1401982 non-null  int32              
 3   Review         1401982 non-null  object             
 4   DateSubmitted  1401982 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int32(3), object(1)
memory usage: 37.4+ MB


In [29]:
features = ['AuthorId', 'RecipeId']
ratings = tf.data.Dataset.from_tensor_slices((
            tf.cast(ratings_small[features].values, tf.int32),
            tf.cast(ratings_small['Rating'].values, tf.float32))
    )


In [24]:
ratings_dict = ratings_small[['AuthorId', 'RecipeId', 'Rating']].to_dict()

In [30]:
# ratings = (tf.data.Dataset.from_tensor_slices(ratings_dict))

In [35]:
ratings = ratings.map(lambda x : x)

TypeError: in user code:


    TypeError: <lambda>() takes 1 positional argument but 2 were given


In [32]:
for x in ratings.take(10).as_numpy_iterator():
  pprint.pprint(x)

(array([2008,  992]), 5.0)
(array([1634, 4384]), 4.0)
(array([2046, 4523]), 2.0)
(array([1773, 7435]), 5.0)
(array([2085,   44]), 5.0)
(array([2046, 5221]), 4.0)
(array([ 2046, 13307]), 5.0)
(array([2156,  148]), 0.0)
(array([2046,  517]), 5.0)
(array([2046, 4684]), 5.0)


In [53]:
interactions_dict = ratings_small.groupby(['AuthorId', 'RecipeId'])['Rating'].sum().reset_index()

In [54]:
interactions_dict

Unnamed: 0,AuthorId,RecipeId,Rating
0,1533,2137,5
1,1533,2585,1
2,1533,4213,5
3,1533,8857,5
4,1533,10332,5
...,...,...,...
1401977,2002901749,153642,5
1401978,2002901759,363862,5
1401979,2002901787,128248,5
1401980,2002901848,264191,5


In [55]:
ratings_small[ratings_small.RecipeId==2137]

Unnamed: 0,RecipeId,AuthorId,Rating,Review,DateSubmitted
10040,2137,27643,5,I went searching the site for a fresh rhubarb ...,2002-04-29 19:42:58+00:00
10772,2137,40237,5,just the recipe I am looking for - thanks,2002-05-09 15:26:22+00:00
11466,2137,10033,5,!!!!!!This was fantastic!!!!!!!! The whole pie...,2002-05-19 16:25:57+00:00
145848,2137,1533,5,OUTSTANDING was the only comments my guests ma...,2005-02-27 09:30:26+00:00
152512,2137,195076,5,AWESOME and surprisingly easy. I used vegan b...,2005-03-30 09:48:16+00:00
162142,2137,69474,5,This is the only rhubarb pie I will ever use!!...,2005-05-12 13:30:14+00:00
229774,2137,215898,5,Wonderful! Easy to make and very flavorful. I ...,2006-02-07 11:20:37+00:00
265215,2137,283787,5,My husband and son loved it. We did add Straw...,2006-06-18 16:18:14+00:00
270426,2137,254415,5,Rated this recipe 5 stars for ease of preparat...,2006-07-03 19:12:53+00:00
272458,2137,74987,5,Loved this pie. My favorite rhubarb pie is Str...,2006-07-09 13:48:21+00:00


In [56]:
interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)


In [57]:
interactions = interactions.map(lambda x: {'user_id' : x['AuthorId'], 
                                            'product_id' : x['RecipeId'], 
                                            'quantity' : float(x['Rating']),})

In [48]:
for x in interactions.take(10).as_numpy_iterator():
  pprint.pprint(x)

{'product_id': 2137, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 2585, 'quantity': 1.0, 'user_id': 1533}
{'product_id': 4213, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 8857, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 10332, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 10457, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 10554, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 10721, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 11914, 'quantity': 5.0, 'user_id': 1533}
{'product_id': 12027, 'quantity': 5.0, 'user_id': 1533}


In [58]:
tf.random.set_seed(42)
shuffled = interactions.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.take(80_000).take(20_000)