In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import ast
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

**Load the Dataset as a pandas dataframe**

In [2]:
recipes_dataset = pd.read_csv('./Data/RAW_recipes.csv')

Tags are stored as a list but because CSV files are comma separated, the list but be a **string** in order to not to conflict with other values. We want to convert that into a **list**. (Example 'Tag' = "[American, sweet, desert, chocolate]")
   
Steps, Ingredients, and Nutritional Information are stored the same as tags so we also want to convert them into **lists**

Calories are always the first element in the nutritional information list, therefore make a column that contains only caloric information.


In [3]:
recipes_dataset['tags'] = recipes_dataset['tags'].apply(ast.literal_eval)
recipes_dataset['steps'] = recipes_dataset['steps'].apply(ast.literal_eval)
recipes_dataset['ingredients'] = recipes_dataset['ingredients'].apply(ast.literal_eval)
recipes_dataset['nutrition'] = recipes_dataset['nutrition'].apply(ast.literal_eval)
recipes_dataset['calories'] = recipes_dataset['nutrition'].apply(lambda x: x[0])



Iterate through the dataframe and add each recipe to the **recipes dictionary**. This dictionary will hold all recipes and have the recipe id as the key

In [4]:
recipes = {}

for index, row in recipes_dataset.iterrows():
    recipes[row['id']] = {
        'name' : row['name'],
        'id' : row['id'],
        'minutes' : int(row['minutes']),
        'contributor_id' : row['contributor_id'],
        'submitted' : pd.to_datetime(row['submitted']),
        'tags' : set(row['tags']),
        'nutrition' : row['nutrition'],
        'n_steps' : int(row['n_steps']),
        'steps' : set(row['steps']),
        'description' : row['description'],
        'ingredients' : set(row['ingredients']),
        'n_ingredients' : int(row['n_ingredients']),
        'calories' : int(row['calories']) 
    }



In [5]:
#test the recipe that is the first element in the dataframe using its id
recipes[137739]

{'name': 'arriba   baked winter squash mexican style',
 'id': 137739,
 'minutes': 55,
 'contributor_id': 47892,
 'submitted': Timestamp('2005-09-16 00:00:00'),
 'tags': {'60-minutes-or-less',
  'christmas',
  'course',
  'cuisine',
  'dietary',
  'easy',
  'fall',
  'holiday-event',
  'main-ingredient',
  'mexican',
  'north-american',
  'occasion',
  'preparation',
  'seasonal',
  'side-dishes',
  'squash',
  'time-to-make',
  'vegetables',
  'vegetarian',
  'winter'},
 'nutrition': [51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0],
 'n_steps': 11,
 'steps': {'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin',
  'be careful not to burn the squash especially if you opt to use sugar or butter',
  'depending on size of squash , cut into half or fourths',
  'for spicy squash , drizzle olive oil or melted butter over each cut squash piece',
  'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash 

Load all the reviews into a dataframe

In [6]:
interactions = pd.read_csv('./Data/RAW_interactions.csv')

Convert the dates to usable dates using pandas

In [7]:
interactions['date'] = pd.to_datetime(interactions['date'])

In [8]:
def jaccard(s1,s2):
    numer = len(s1.intersection(s2))
    denom = len(s2.union(s1))
    return numer / denom if denom > 0 else 0

In [9]:
ratings = []

Create a list of tuples of user, item, and ratings to a list.

In [10]:
for index, row in interactions.iterrows():
    user, item, rating = row['user_id'], row['recipe_id'], row['rating']
    ratings.append((user,item,rating))

Shuffle the ratings then create two sets. One will be used to train the data, the other will be used to check its accuracy.

In [11]:
random.shuffle(ratings)
number_of_ratingss = len(ratings)
split = int(0.8 * number_of_ratingss)


In [12]:
training_data = ratings[:split]
validation_data = ratings[split:]



In [None]:
def recipeVector(recipe):
    return [recipe.get('minutes',0), recipe.get('n_steps',0),recipe.get('calories',0), recipe.get('n_ingredients',0)]

In [14]:
X_train = []
y_train = []

for u, i, r in training_data:
    if i in recipes:
        X_train.append(recipeVector(recipes[i]))
        y_train.append(r)

X_valid = []
y_valid = []

for u, i, r in validation_data:
    if i in recipes:
        X_valid.append(recipeVector(recipes[i]))
        y_valid.append(r)

# 3. Train the Linear Regression Model
# fit_intercept=True adds the bias term (theta_0/offset) automatically
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)

# 4. Evaluate
predictions = model.predict(np.array(X_valid))
mse = np.mean((np.array(y_valid) - predictions) ** 2)

print(f"Linear Regression MSE: {mse}")

# 5. Analyze Coefficients
# This tells you which features actually matter
feature_names = ['Minutes', 'N_Steps', 'Calories', 'N_Ingredients']
print("\nFeature Coefficients:")
for name, coef in zip(feature_names, model.coef_):
    print(f"  {name}: {coef:.5f}")
print(f"  Intercept (Base Rating): {model.intercept_:.5f}")

Linear Regression MSE: 1.5961509420214919

Feature Coefficients:
  Minutes: 0.00000
  N_Steps: -0.00501
  Calories: -0.00001
  N_Ingredients: 0.00205
  Intercept (Base Rating): 4.44726


Create dictionaries of each rating a user has given an item and each rating a recipe has received from each user

In [15]:
reviews_per_user = defaultdict(list)
reviews_per_item = defaultdict(list)
for user, item, rating in training_data:
    reviews_per_user[user].append((item, rating))
    reviews_per_item[item].append((user, rating))

Calculate the mean rating of all ratings in the training set and get the MSE using the global average as a baseline

In [16]:
global_avg_rating = np.mean([r for _, _, r in training_data])

In [None]:
global_avg_mse = 0.0
for u, i, r in validation_data:
    global_avg_mse += (r - global_avg_rating) ** 2
global_avg_mse /= len(validation_data)

print(f"Global Average Baseline MSE: {global_avg_mse}")

Global Average Baseline MSE: 1.596962598331192


In [18]:
def alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb):
    newAlpha = 0
    for u,b,r in ratingsTrain:
        newAlpha += r - (betaU[u] + betaI[b])
    return newAlpha / len(ratingsTrain)

In [19]:
def betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb):
    newBetaU = {}
    for u in ratingsPerUser:
        newBeta = 0
        for b,r in ratingsPerUser[u]:
            newBeta += r - (alpha + betaI[b])
        newBetaU[u] = newBeta / (lamb + len(ratingsPerUser[u]))
    return newBetaU

In [20]:
def betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb):
    newBetaI = {}
    for b in ratingsPerItem:
        newBeta = 0
        for u,r in ratingsPerItem[b]:
            newBeta += r - (alpha + betaU[u])
        newBetaI[b] = newBeta / (lamb + len(ratingsPerItem[b]))
    return newBetaI

In [21]:
def userBiasModel(ratingsTrain, ratingsPerUser, ratingsPerItem, alpha, betaU, betaI,lamb):
    for i in range(10):
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)
        betaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        betaI = betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)
    return alpha, betaU, betaI

In [22]:
def getMSE(alpha,betaI,betaU, dataset):
    sse = 0
    for user, item, rating in dataset:
        prediction = alpha + betaU.get(user, 0.0) + betaI.get(item, 0.0)
        sse += (rating - prediction) ** 2
    mse = sse / len(dataset)
    return mse

In [23]:
user_review_history = defaultdict(list)

interactions.sort_values(by=['user_id', 'date'], inplace=True)

current_history = defaultdict(list)

for index, row in interactions.iterrows():
    user_id = int(row['user_id'])
    review_text = str(row['review'])

    user_review_history[user_id] = " ".join(current_history[user_id])
    
    current_history[user_id].append(review_text)

Run a model that calculates the user's bias and use that for prediction. Find the best $\lambda$ to minimize MSE 

In [24]:
# 1. Initialize Parameters
alpha = global_avg_rating
betaU = {u: 0.0 for u in reviews_per_user}
betaI = {i: 0.0 for i in reviews_per_item}
lambs = [0.1,0.5,1.0,5.0,500.0]

bestMSE = 100.0
bestIndex = 0
for i in range(len(lambs)):
    # 2. Run Training
    lamb = lambs[i]
    alpha = global_avg_rating
    betaU = {u: 0.0 for u in reviews_per_user}
    betaI = {i: 0.0 for i in reviews_per_item}

    alpha, betaU, betaI = userBiasModel(training_data, reviews_per_user, reviews_per_item, alpha, betaU, betaI, lamb)
    # 3. Evaluate
    train_mse = getMSE(alpha, betaU, betaI, training_data)
    valid_mse = getMSE(alpha, betaU, betaI, validation_data)

    if bestMSE > valid_mse:
        bestMSE = valid_mse
        bestIndex = i

print(bestMSE)
print(lambs[bestIndex])

1.603127937867828
500.0


Create a dictionary: User ID -> Set of all ingredients they have used
We only want to keep ingredients from recipes with ratings > 4 that way we are not adding disliked ingredients, same for tags

In [25]:
# Create a dictionary: User ID -> Set of all ingredients they have liked
user_ingredient_profiles = defaultdict(set)
user_tag_profiles = defaultdict(set)

print("Building user profiles...")
for u, i, r in training_data:
    # Only consider recipes they liked (e.g., rating >= 4)
    # This reduces noise from things they hated
    if r >= 4:
        # Check if we have ingredient info for this recipe
        if i in recipes:
            recipe_ingredients = recipes[i]['ingredients']
            user_ingredient_profiles[u].update(recipe_ingredients)
            recipe_tags = recipes[i]['tags']
            user_tag_profiles[u].update(recipe_tags)

print(f"Built profiles for {len(user_ingredient_profiles)} users")

Building user profiles...
Built profiles for 153898 users


Calculate the rating based off the user's bias and the similarity of the recipe's tags and ingredients to the user's tags and ingredients

In [26]:
def similarityBiasModel(u, i, alpha, betaU, betaI, weight1,weight2):
    bias_pred = alpha + betaU.get(u, 0.0) + betaI.get(i, 0.0)
    
    jaccard_ingredients = 0.0
    if u in user_ingredient_profiles and i in recipes:
        u_ingredients = user_ingredient_profiles[u]
        i_ingredients = recipes[i]['ingredients']
         
        # Jaccard Calculation
        jaccard_ingredients = jaccard(u_ingredients,i_ingredients)

    jaccard_tag = 0.0
    if u in user_tag_profiles and i in recipes:
        u_tags = user_tag_profiles[u]
        i_tags = recipes[i]['tags']

        jaccard_tag = jaccard(u_tags,i_tags)    
    return bias_pred + (weight1 * (jaccard_tag)) +( weight2 * (jaccard_ingredients))

Find the best weights of the ingredient and tag similarity scores

In [27]:
# Use the best parameters from your previous tuning (e.g., Lambda=10 or 100)
# alpha, betaU, betaI = ... (your trained values)

weights = [0.0,0.1,0.25,0.5,0.75,1.0]
best_hybrid_mse = float('inf')
best_weight = 0
best_weight2 = 0
best_pen = 0
print("Tuning Hybrid Weight...")
for w in weights:
    for j in weights:
        sse = 0
        for u, i, r in validation_data:
            pred = similarityBiasModel(u, i, alpha, betaU, betaI, w,j)
            sse += (r - pred) ** 2
        
        mse = sse / len(validation_data)
        print(f"Weight: ({w},{j}), MSE: {mse}")
        
        if mse < best_hybrid_mse:
            best_hybrid_mse = mse
            best_weight = w
            best_weight2 = j

print(f"\nBest Hybrid MSE: {best_hybrid_mse} (at weight {best_weight},{best_weight2})")

Tuning Hybrid Weight...
Weight: (0.0,0.0), MSE: 1.5369389034777867
Weight: (0.0,0.1), MSE: 1.5365632615387113
Weight: (0.0,0.25), MSE: 1.536059928793883
Weight: (0.0,0.5), MSE: 1.5353813879891642
Weight: (0.0,0.75), MSE: 1.5349032810636798
Weight: (0.0,1.0), MSE: 1.5346256080173282
Weight: (0.1,0.0), MSE: 1.5353024960042652
Weight: (0.1,0.1), MSE: 1.5349992586720598
Weight: (0.1,0.25), MSE: 1.5346045328374274
Weight: (0.1,0.5), MSE: 1.5341070035496294
Weight: (0.1,0.75), MSE: 1.533809908141143
Weight: (0.1,1.0), MSE: 1.5337132466118415
Weight: (0.25,0.0), MSE: 1.5334795843496263
Weight: (0.25,0.1), MSE: 1.5332849539275557
Weight: (0.25,0.25), MSE: 1.5330531384582529
Weight: (0.25,0.5), MSE: 1.5328271264460116
Weight: (0.25,0.75), MSE: 1.5328015483130213
Weight: (0.25,1.0), MSE: 1.5329764040591463
Weight: (0.5,0.0), MSE: 1.5321259304066923
Weight: (0.5,0.1), MSE: 1.5321123115016597
Weight: (0.5,0.25), MSE: 1.5321520133078355
Weight: (0.5,0.5), MSE: 1.5323785300881076
Weight: (0.5,0.75),

In [28]:
# Find a case where Hybrid beat the Bias model
print("analyzing improvements...")
for u, i, r in validation_data:
    # 1. Calculate Bias Prediction
    bias_pred = alpha + betaU.get(u, 0.0) + betaI.get(i, 0.0)
    
    # 2. Calculate Hybrid Prediction
    bias_sim_pred = similarityBiasModel(u, i, alpha, betaU, betaI, 0.5,0.25)
    
    # 3. Check error
    err_bias = abs(r - bias_pred)
    err_hybrid = abs(r - bias_sim_pred)
    
    # Did Hybrid win by a lot? (e.g., improved by 0.5 stars)
    if err_hybrid < err_bias - 0.5:
        print(f"User {u} rated Item {i} as {r}")
        print(f"  Bias Prediction: {bias_pred:.2f} (Error: {err_bias:.2f})")
        print(f"  Hybrid Prediction: {bias_sim_pred:.2f} (Error: {err_hybrid:.2f})")
        
        # Print ingredients to explain WHY
        if u in user_ingredient_profiles and i in recipes:
            common = user_ingredient_profiles[u].intersection(recipes[i]['ingredients'])
            print(f"  Common Ingredients: {list(common)[:5]}")
        break

analyzing improvements...
User 774494 rated Item 257276 as 5
  Bias Prediction: 4.27 (Error: 0.73)
  Hybrid Prediction: 4.98 (Error: 0.02)
  Common Ingredients: ['sour cream', 'elbow macaroni', 'eggs', 'black pepper', 'sharp cheddar cheese']


In [29]:
# 1. Load the Pre-Processed Data
pp_recipes = pd.read_csv('./Data/PP_recipes.csv')
pp_users = pd.read_csv('./Data/PP_users.csv')

# 2. Parse the list columns (Strings -> Lists)
# This converts "[1, 2, 3]" to the list [1, 2, 3]
pp_recipes['ingredient_ids'] = pp_recipes['ingredient_ids'].apply(ast.literal_eval)
pp_users['items'] = pp_users['items'].apply(ast.literal_eval)
pp_users['ratings'] = pp_users['ratings'].apply(ast.literal_eval)


# 3. Create Training Samples (Flattening the user history)
# The PP_users file has one row per user. We need to "explode" this 
# so we have one row per interaction (User, Item, Rating) for training.

interaction_list = []

for index, row in pp_users.iterrows():
    user_internal_id = row['u']
    item_list = row['items']
    rating_list = row['ratings']
    
    # Zip them together to get individual interactions
    for item_id, rating in zip(item_list, rating_list):
        interaction_list.append([user_internal_id, item_id, rating])

# Convert to DataFrame
df_train = pd.DataFrame(interaction_list, columns=['u', 'i', 'rating'])

In [30]:
train_df, valid_df = train_test_split(df_train, test_size=0.2, random_state=42)

class FoodDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['u'].values, dtype=torch.long)
        self.items = torch.tensor(df['i'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

train_dataset = FoodDataset(train_df)
valid_dataset = FoodDataset(valid_df)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)

In [31]:
class MatrixFactorizationNN(nn.Module):
    def __init__(self, num_users, num_items, n_factors=8):
        super(MatrixFactorizationNN, self).__init__()
        
        # User Embeddings (Gamma_u) & Bias (Beta_u)
        self.user_factors = nn.Embedding(num_users, n_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        
        # Item Embeddings (Gamma_i) & Bias (Beta_i)
        self.item_factors = nn.Embedding(num_items, n_factors)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Global Bias (Alpha) - a single learnable number
        self.global_bias = nn.Parameter(torch.tensor(0.0))

    def forward(self, user, item):
        # 1. Get Embeddings
        u_factor = self.user_factors(user)
        i_factor = self.item_factors(item)
        
        # 2. Dot Product (Similarity)
        # (Batch_size, Factors) * (Batch_size, Factors) -> Sum -> (Batch_size, 1)
        dot_product = (u_factor * i_factor).sum(dim=1, keepdim=True)
        
        # 3. Add Biases
        u_b = self.user_bias(user)
        i_b = self.item_bias(item)
        
        # Prediction = Alpha + Beta_u + Beta_i + (Gamma_u * Gamma_i)
        output = self.global_bias + u_b + i_b + dot_product
        
        return output.squeeze() # Remove extra dimension

In [32]:
# 1. Initialize Model
# We need the max ID to know how many embeddings to create
num_users = df_train['u'].max() + 1
num_items = df_train['i'].max() + 1

model = MatrixFactorizationNN(num_users, num_items, n_factors=4) # 8 latent factors
criterion = nn.MSELoss() # We want to minimize Mean Squared Error
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
# 2. Training Loop
device = torch.device("cpu")
model.to(device)

print("Starting Neural Training...")
for epoch in range(5): # Run for 5 epochs
    model.train()
    total_loss = 0
    
    for users, items, ratings in train_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        
        # Forward pass
        predictions = model(users, items)
        loss = criterion(predictions, ratings)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    # Validation Step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for users, items, ratings in valid_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            preds = model(users, items)
            v_loss = criterion(preds, ratings)
            val_loss += v_loss.item()
            
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(valid_loader)
    
    print(f"Epoch {epoch+1}: Train MSE = {avg_train_loss:.4f}, Valid MSE = {avg_val_loss:.4f}")

Starting Neural Training...
Epoch 1: Train MSE = 1.8243, Valid MSE = 0.8462
Epoch 2: Train MSE = 0.8460, Valid MSE = 0.8469
Epoch 3: Train MSE = 0.8463, Valid MSE = 0.8485
Epoch 4: Train MSE = 0.8452, Valid MSE = 0.8469
Epoch 5: Train MSE = 0.8466, Valid MSE = 0.8476
