# Download

In [1]:
# !pip install kaggle
# !kaggle datasets download -d irkaal/foodcom-recipes-and-reviews -f recipes.parquet
# !kaggle datasets download -d irkaal/foodcom-recipes-and-reviews -f reviews.parquet
# !unzip recipes.parquet.zip
# !rm recipes.parquet.zip
# !unzip reviews.parquet.zip
# !rm reviews.parquet.zip
# !wget https://raw.githubusercontent.com/Lord-Kelsier/datasets/refs/heads/master/mappers.json


# Libs

In [2]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
import pickle as pkl
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Dataset

In [3]:
# Load the train/test review splits and the recipe table directly from their raw GitHub URLs.
# NOTE(review): the ids in these CSVs look already re-mapped to contiguous integers
# (RecipeId spans 0..N-1 in the check further down) — confirm against mappers.json.
reviews_train = pd.read_csv("https://raw.githubusercontent.com/Lord-Kelsier/datasets/refs/heads/master/review_train.csv")
reviews_test = pd.read_csv("https://raw.githubusercontent.com/Lord-Kelsier/datasets/refs/heads/master/review_test.csv")
recipes = pd.read_csv("https://raw.githubusercontent.com/Lord-Kelsier/datasets/refs/heads/master/recipes.csv")

In [4]:
# Number of distinct users and recipes that appear in the training reviews;
# these values later size the embedding tables.
users_amount = reviews_train["AuthorId"].nunique(dropna=False)
recipes_amount = reviews_train["RecipeId"].nunique(dropna=False)
users_amount, recipes_amount

(271906, 238658)

In [5]:
# Sanity check: smallest id, largest id, row count and number of distinct ids of the recipe table.
recipes["RecipeId"].min(), recipes["RecipeId"].max(), recipes["RecipeId"].size, recipes["RecipeId"].nunique(dropna=False)

(0, 238657, 238658, 238658)

In [6]:
# Index the recipe table by RecipeId.
# Guarded so that re-running this cell is a no-op instead of raising KeyError
# once the RecipeId column has already been moved into the index.
if recipes.index.name != "RecipeId":
    recipes = recipes.set_index("RecipeId")
recipes.head()

Unnamed: 0_level_0,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,"[4686, 4448, 1850, 2281]",170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2
1,"[5798, 3172, 4039, 1824, 2488, 1101, 2985, 111...",1110.7,58.8,16.6,372.8,368.4,84.4,9.0,20.4,63.4
2,"[684, 1038, 5984, 4536, 484]",311.1,0.2,0.0,0.0,1.8,81.5,0.4,77.2,0.3
3,"[230, 1127, 2827, 581, 3810, 2547, 3371, 4036,...",536.1,24.0,3.8,0.0,1558.6,64.2,17.3,32.1,29.3
4,"[4765, 2695, 3039, 3395, 3641]",103.6,0.4,0.1,0.0,959.3,25.1,4.8,17.7,4.3


In [7]:
# Names of the nutrition columns of `recipes`; RecipeDataset uses this list to select
# the numeric part of each recipe's feature vector.
nutrition_facts = [
    'Calories', 'FatContent','SaturatedFatContent', 'CholesterolContent', 'SodiumContent',
    'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent'
]

## Parsing

In [8]:
# Split the textual list representation (e.g. "[4686, 4448, 1850]") into raw string tokens.
# Values that are already lists are passed through unchanged, so re-running this cell
# after the column has been parsed no longer fails with AttributeError ('list' has no 'strip').
recipes["RecipeIngredientParts"] = recipes["RecipeIngredientParts"].apply(
    lambda x: x if isinstance(x, list) else x.strip("[]").split(",")
)
recipes["RecipeIngredientParts"].head()

RecipeId
0                          [4686,  4448,  1850,  2281]
1    [5798,  3172,  4039,  1824,  2488,  1101,  298...
2                     [684,  1038,  5984,  4536,  484]
3    [230,  1127,  2827,  581,  3810,  2547,  3371,...
4                   [4765,  2695,  3039,  3395,  3641]
Name: RecipeIngredientParts, dtype: object

In [9]:
# Splitting an empty list literal leaves a single empty token ([""]); turn that into a real empty list.
recipes["RecipeIngredientParts"] = recipes["RecipeIngredientParts"].apply(lambda tokens: [] if tokens == [""] else tokens)

In [10]:
# Convert every ingredient token to an integer id (int() tolerates the surrounding whitespace).
recipes["RecipeIngredientParts"] = recipes["RecipeIngredientParts"].apply(lambda tokens: list(map(int, tokens)))
recipes["RecipeIngredientParts"].head()

RecipeId
0                             [4686, 4448, 1850, 2281]
1    [5798, 3172, 4039, 1824, 2488, 1101, 2985, 111...
2                         [684, 1038, 5984, 4536, 484]
3    [230, 1127, 2827, 581, 3810, 2547, 3371, 4036,...
4                       [4765, 2695, 3039, 3395, 3641]
Name: RecipeIngredientParts, dtype: object

In [11]:
# Collect every ingredient id that occurs in at least one recipe.
all_ingredients = {
    ingredient
    for parts in recipes["RecipeIngredientParts"]
    for ingredient in parts
}
len(all_ingredients)

6157

In [12]:
recipes.head(1)

Unnamed: 0_level_0,RecipeIngredientParts,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,"[4686, 4448, 1850, 2281]",170.9,2.5,1.3,8.0,29.8,37.1,3.6,30.2,3.2


## Dataset and dataloader

In [13]:
class RecipeDataset(Dataset):
    """Per-recipe feature vectors: multi-hot ingredients followed by scaled nutrition facts.

    Relies on three notebook-level names: ``all_ingredients`` (set of ingredient ids),
    ``nutrition_facts`` (list of nutrition column names) and ``device``.

    Args:
        recipes: DataFrame holding a ``RecipeIngredientParts`` column (list of int
            ingredient ids per row) plus the columns listed in ``nutrition_facts``.
    """

    def __init__(self, recipes):
        self.recipes = recipes
        # Number of nutrition columns appended after the ingredient slots.
        self.nutrition_facts = len(nutrition_facts)
        self.all_ingredients = all_ingredients

    def __len__(self):
        return len(self.recipes)

    def __getitem__(self, idx):
        """Return the 1-D float feature tensor of the recipe stored at position ``idx``."""
        assert 0 <= idx < len(self.recipes), f"Index {idx} out of range"
        # The recipes frame is expected to be indexed by recipe id; the lookup itself is positional.
        # NOTE(review): position == id only holds if the ids are 0..N-1 in row order — confirm.
        recipe = self.recipes.iloc[idx]

        # Scale the nutrition values of this recipe by their own maximum
        # (the 1e-6 floor avoids a division by zero for all-zero rows).
        nutrition = recipe[nutrition_facts].values.astype(float)
        nutrition /= max(nutrition.max(), 1e-6)
        nutrition = torch.tensor(nutrition, device=device, dtype=torch.float32)

        # Multi-hot encoding: one slot per known ingredient, allocated directly on the target device.
        # NOTE(review): assumes ingredient ids lie in [0, len(all_ingredients)) — confirm.
        ingredients = recipe["RecipeIngredientParts"]
        one_hot_ingredients = torch.zeros(len(self.all_ingredients), device=device)
        one_hot_ingredients[ingredients] = 1
        return torch.cat([one_hot_ingredients, nutrition])


In [14]:
class ReviewsDataset(Dataset):
    """One sample per review: (user id, recipe id), rating and the recipe's feature vector.

    Relies on the notebook-level name ``device``.

    Args:
        reviews: DataFrame with ``AuthorId``, ``RecipeId`` and ``Rating`` columns.
        recipes_dset: object indexable by recipe id that returns the recipe's feature tensor.
    """

    def __init__(self, reviews, recipes_dset):
        self.reviews = reviews
        self.recipes_dset = recipes_dset

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(ids, rating, recipe_features)`` for the review at position ``idx``."""
        assert 0 <= idx < len(self.reviews), f"Index {idx} out of range"
        review = self.reviews.iloc[idx]
        # A row taken with iloc is upcast to float as soon as any column is float, so the ids
        # are cast back explicitly: embeddings and positional lookups need integer indices.
        user_id = int(review['AuthorId'])
        recipe_id = int(review['RecipeId'])
        rating = float(review['Rating'])
        recipe_features = self.recipes_dset[recipe_id]
        return (
            torch.tensor([user_id, recipe_id], device=device, dtype=torch.long),
            torch.tensor(rating, device=device, dtype=torch.float32),
            recipe_features
        )

In [15]:
def collate_fn(batch):
    """Stack a list of ``(user_item, rating, recipe_features)`` samples into batch tensors.

    Args:
        batch: sequence of 3-tuples of tensors, one tuple per sample.

    Returns:
        Tuple ``(user_item, rating, recipe_features)`` where each element is the samples'
        tensors stacked along a new leading dimension and moved to ``device``.
    """
    user_item, rating, recipe_features = zip(*batch)
    # .to(device) is a no-op when the samples already live on the target device.
    user_item = torch.stack(user_item).to(device)
    rating = torch.stack(rating).to(device)
    recipe_features = torch.stack(recipe_features).to(device)
    return user_item, rating, recipe_features


In [16]:
# One shared recipe-feature dataset backs both review datasets.
recipes_dset = RecipeDataset(recipes)

# Train and test loaders use the same batching configuration.
_loader_kwargs = dict(batch_size=64, shuffle=True, collate_fn=collate_fn)
train_loader = DataLoader(ReviewsDataset(reviews_train, recipes_dset), **_loader_kwargs)
test_loader = DataLoader(ReviewsDataset(reviews_test, recipes_dset), **_loader_kwargs)

In [17]:
# Smoke test: pull a single batch from the training loader and print the shapes
# of the three tensors returned by collate_fn, then stop.
for b in train_loader:
  user_item, rating, recipe_features = b
  print(user_item.shape, rating.shape, recipe_features.shape)
  break

torch.Size([64, 2]) torch.Size([64]) torch.Size([64, 6166])


# Model

In [18]:
recipe_faetures_size = 6166  # (sic) spelling kept in case other cells reference this name
class RecipeEncoder(nn.Module):
    """Single linear projection of a raw recipe feature vector to a dense embedding.

    Args:
        n_features: length of the incoming recipe feature vector (default 6166, the
            feature width printed by the loader smoke test).
        embedding_dim: length of the produced embedding (default 64, the value that
            used to be hard-coded).
    """
    def __init__(self, n_features=6166, embedding_dim=64):
        super().__init__()
        self.recipe_features = n_features
        self.embedding_dim = embedding_dim
        self.encoder = nn.Linear(n_features, embedding_dim)
    def forward(self, x):
        """Project ``x`` (last dimension ``n_features``) to ``embedding_dim`` values."""
        return self.encoder(x)

In [19]:
# Largely adapted from the NCF practical
class NeuralCollaborativeFilteringNN(nn.Module):
    """Hybrid recommender: a matrix-factorisation branch fused with an MLP branch.

    The MLP branch additionally receives an encoded recipe feature vector.

    Args:
        user_sz: ``(number_of_users, mlp_embedding_width)``.
        item_sz: ``(number_of_items, mlp_embedding_width)``.
        mlp_layer: module applied to the concatenated MLP input; must emit ``n_act`` values.
        y_range: ``(low, high)`` interval the sigmoid output is rescaled to.
        n_act: width of the MLP output.
        n_factors: width of the matrix-factorisation embeddings.
    """

    def __init__(self, user_sz: tuple, item_sz: tuple, mlp_layer, y_range=(0, 5.5), n_act=100, n_factors=50):
        super().__init__()
        n_users, user_width = user_sz[0], user_sz[1]
        n_items, item_width = item_sz[0], item_sz[1]

        # Sub-modules are created in a fixed order so parameter initialisation stays reproducible.
        self.user_factors_mf = nn.Embedding(n_users, n_factors)
        self.user_factors_mlp = nn.Embedding(n_users, user_width)
        self.item_factors_mf = nn.Embedding(n_items, n_factors)
        self.item_factors_mlp = nn.Embedding(n_items, item_width)
        self.recipe_encoder = RecipeEncoder()  # yields a 64-wide recipe embedding
        self.y_range = y_range

        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.MLPLayers = mlp_layer

        self.final_layer = nn.Linear(n_act + n_factors, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, recipe_features):
        """Predict one rating per row of ``x`` (column 0: user id, column 1: item id)."""
        users, items = x[:, 0], x[:, 1]

        # Matrix-factorisation branch: element-wise product of the two embeddings,
        # shifted by the (broadcast) user and item biases.
        mf_output = self.user_factors_mf(users) * self.item_factors_mf(items)
        mf_output = mf_output + (self.user_bias(users) + self.item_bias(items))

        # MLP branch: user embedding, item embedding and encoded recipe features side by side.
        mlp_input = torch.cat(
            (
                self.user_factors_mlp(users),
                self.item_factors_mlp(items),
                self.recipe_encoder(recipe_features),
            ),
            dim=1,
        )
        mlp_output = self.MLPLayers(mlp_input)

        # Fuse both branches and map them to a single score.
        fused = torch.cat((mlp_output, mf_output), dim=1)
        score = self.final_layer(fused)

        # Squash to (0, 1), then stretch to the configured rating interval.
        low, high = self.y_range[0], self.y_range[1]
        return self.sigmoid(score) * (high - low) + low

# Train

In [20]:
# (number of entities, width of the MLP embedding) for users and for recipes.
user_sz = (users_amount, 256)
item_sz = (recipes_amount, 256)
n_act = 50

# Layer widths of the MLP tower. Its input is the user embedding, the item embedding
# and the 64-wide recipe encoding concatenated; its output has n_act activations.
_mlp_widths = [user_sz[1] + item_sz[1] + 64, 2048, 1024, 512, 128, n_act]
mlp_layers = nn.Sequential(*[
    layer
    for fan_in, fan_out in zip(_mlp_widths[:-1], _mlp_widths[1:])
    for layer in (nn.Linear(fan_in, fan_out), nn.ReLU())
])

In [21]:
# Build the hybrid model and move its parameters to the selected device
# (Module.to works in place and returns the module itself).
ncf_model = NeuralCollaborativeFilteringNN(
    user_sz,
    item_sz,
    mlp_layers,
    n_act=n_act,
    n_factors=50,
).to(device)
print()




In [22]:
# Mean-squared error against the observed rating, optimised with Adam over all model parameters.
loss_func = torch.nn.MSELoss()
optimizer = torch.optim.Adam(params=ncf_model.parameters(), lr=1e-3)

In [23]:
@torch.no_grad()
def evaluate(model, loss_func, test_loader, max_batches=None):
    """Return the running mean of the per-batch loss of ``model`` over ``test_loader``.

    Args:
        model: module called as ``model(user_item, recipe_features)``.
        loss_func: callable ``loss_func(prediction, rating)`` returning a scalar tensor.
        test_loader: sized iterable yielding ``(user_item, rating, recipe_features)`` batches.
        max_batches: when truthy, stop after this many batches; otherwise use every batch.

    Returns:
        Mean of the per-batch losses as a float (``0`` if no batch was processed).

    The model is put in eval mode for the duration of the call and its previous
    train/eval mode is restored afterwards, so calling this in the middle of a training
    loop does not leave the model in eval mode.
    """
    was_training = model.training
    model.eval()
    # Size the progress bar by the number of batches that will actually be evaluated.
    total = len(test_loader)
    if max_batches:
        total = min(total, max_batches)
    loop = tqdm(enumerate(test_loader), total=total, leave=False)
    avg_loss = 0
    try:
        for i, (user_item, rating, recipe_features) in loop:
            if max_batches and i >= max_batches:
                break
            output = model(user_item, recipe_features)
            # squeeze(-1) drops only the trailing singleton, so a batch of size 1 keeps
            # shape (1,) and matches `rating` instead of collapsing to a scalar.
            loss = loss_func(output.squeeze(-1), rating)
            avg_loss = (avg_loss * i + loss.item()) / (i + 1)
            loop.set_description(f'loss: {loss.item():.4f}, Avg Loss: {avg_loss:.4f}')
    finally:
        # Release the progress bar after an early break and restore the caller's mode.
        loop.close()
        model.train(was_training)
    return avg_loss

In [24]:
import logging as log
# Send INFO-and-above records to a log file, one timestamped line per record.
# force=True replaces handlers already attached to the root logger, so re-running
# this cell re-applies the configuration instead of being ignored.
log.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=log.INFO,
    filename="train_basic_features.log",
    force=True
)

In [25]:
# Training: 10 epochs over train_loader. Every batch loss is recorded, a partial validation
# pass runs every 2000 iterations, a log line is written every 500 iterations and a
# checkpoint is saved after each epoch.
losses_cb_ncf = []
for epoch in range(10):
    ncf_model.train()
    loop = tqdm(enumerate(train_loader), total=len(train_loader))
    avg_loss = 0
    for i, (user_item, rating, recipe_features) in loop:
        optimizer.zero_grad()
        output = ncf_model(user_item, recipe_features)
        # squeeze(-1) keeps the batch dimension even when the last batch holds a single sample.
        loss = loss_func(output.squeeze(-1), rating)
        loss.backward()
        optimizer.step()
        losses_cb_ncf.append(loss.item())
        avg_loss = (avg_loss * i + loss.item()) / (i + 1)
        if i % 2000 == 0:
            val_loss = evaluate(ncf_model, loss_func, test_loader, max_batches=1500)
            # evaluate() switches the model to eval mode; go back to train mode before the next step.
            ncf_model.train()
        if i % 500 == 0:
            # i == 0 always triggers the validation branch above, so val_loss is defined here.
            log.info(f'epoch: {epoch}, it: {i}, loss: {loss.item():.4f}, Avg Loss: {avg_loss:.4f}, val Loss: {val_loss:.4f}')

        loop.set_description(f'epoch: {epoch}, loss: {loss.item():.4f}, Avg Loss: {avg_loss:.4f}')
    torch.save(ncf_model.state_dict(), f"ncf_basic_features_epoch_{epoch}.pt")

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/16234 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]

  0%|          | 0/5073 [00:00<?, ?it/s]