In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import ast
import random

In [2]:
recipes_dataset = pd.read_csv('./Data/RAW_recipes.csv')

In [3]:
recipes_dataset['tags'] = recipes_dataset['tags'].apply(ast.literal_eval)
recipes_dataset['steps'] = recipes_dataset['steps'].apply(ast.literal_eval)
recipes_dataset['ingredients'] = recipes_dataset['ingredients'].apply(ast.literal_eval)
recipes_dataset['nutrition'] = recipes_dataset['nutrition'].apply(ast.literal_eval)
recipes_dataset['calories'] = recipes_dataset['nutrition'].apply(lambda x: x[0])



In [4]:
recipes = {}

for index, row in recipes_dataset.iterrows():
    recipes[row['id']] = {
        'name' : row['name'],
        'id' : row['id'],
        'minutes' : int(row['minutes']),
        'contributor_id' : row['contributor_id'],
        'submitted' : pd.to_datetime(row['submitted']),
        'tags' : set(row['tags']),
        'nutrition' : row['nutrition'],
        'n_steps' : int(row['n_steps']),
        'steps' : set(row['steps']),
        'description' : row['description'],
        'ingredients' : set(row['ingredients']),
        'n_ingredients' : int(row['n_ingredients']),
        'calories' : int(row['calories']) 
    }



In [5]:
recipes[137739]

{'name': 'arriba   baked winter squash mexican style',
 'id': 137739,
 'minutes': 55,
 'contributor_id': 47892,
 'submitted': Timestamp('2005-09-16 00:00:00'),
 'tags': {'60-minutes-or-less',
  'christmas',
  'course',
  'cuisine',
  'dietary',
  'easy',
  'fall',
  'holiday-event',
  'main-ingredient',
  'mexican',
  'north-american',
  'occasion',
  'preparation',
  'seasonal',
  'side-dishes',
  'squash',
  'time-to-make',
  'vegetables',
  'vegetarian',
  'winter'},
 'nutrition': [51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0],
 'n_steps': 11,
 'steps': {'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin',
  'be careful not to burn the squash especially if you opt to use sugar or butter',
  'depending on size of squash , cut into half or fourths',
  'for spicy squash , drizzle olive oil or melted butter over each cut squash piece',
  'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash 

In [6]:
interactions = pd.read_csv('./Data/RAW_interactions.csv')

In [7]:
interactions['date'] = pd.to_datetime(interactions['date'])

In [8]:
def jaccard(s1,s2):
    numer = len(s1.intersection(s2))
    denom = len(s2.union(s1))
    return numer / denom if denom > 0 else 0

In [9]:
reviews_per_user = defaultdict(list)
reviews_per_item = defaultdict(list)
ratings = []

In [10]:
for index, row in interactions.iterrows():
    user, item, rating = row['user_id'], row['recipe_id'], row['rating']
    reviews_per_user[user].append((item,rating))
    reviews_per_item[item].append((user,rating))
    ratings.append((user,item,rating))

In [11]:
random.shuffle(ratings)
number_of_ratingss = len(ratings)
split = int(0.8 * number_of_ratingss)


In [12]:
training_data = ratings[:split]
validation_data = ratings[split:]
reviews_per_user = defaultdict(list)
reviews_per_item = defaultdict(list)


In [13]:
for user, item, rating in training_data:
    reviews_per_user[user].append((item, rating))
    reviews_per_item[item].append((user, rating))

In [14]:
global_avg_rating = np.mean([r for _, _, r in training_data])

In [15]:
def alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb):
    newAlpha = 0
    for u,b,r in ratingsTrain:
        newAlpha += r - (betaU[u] + betaI[b])
    return newAlpha / len(ratingsTrain)

In [16]:
def betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb):
    newBetaU = {}
    for u in ratingsPerUser:
        newBeta = 0
        for b,r in ratingsPerUser[u]:
            newBeta += r - (alpha + betaI[b])
        newBetaU[u] = newBeta / (lamb + len(ratingsPerUser[u]))
    return newBetaU

In [17]:
def betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb):
    newBetaI = {}
    for b in ratingsPerItem:
        newBeta = 0
        for u,r in ratingsPerItem[b]:
            newBeta += r - (alpha + betaU[u])
        newBetaI[b] = newBeta / (lamb + len(ratingsPerItem[b]))
    return newBetaI

In [18]:
def goodModel(ratingsTrain, ratingsPerUser, ratingsPerItem, alpha, betaU, betaI,lamb):
    for i in range(20):
        alpha = alphaUpdate(ratingsTrain, alpha, betaU, betaI, lamb)
        betaU = betaUUpdate(ratingsPerUser, alpha, betaU, betaI, lamb)
        betaI = betaIUpdate(ratingsPerItem, alpha, betaU, betaI, lamb)
    return alpha, betaU, betaI

In [19]:
def getMSE(alpha,betaI,betaU, dataset):
    sse = 0
    for user, item, rating in dataset:
        prediction = alpha + betaU.get(user, 0.0) + betaI.get(item, 0.0)
        sse += (rating - prediction) ** 2
    mse = sse / len(dataset)
    return mse

In [20]:
user_review_history = defaultdict(list)

interactions.sort_values(by=['user_id', 'date'], inplace=True)

current_history = defaultdict(list)

for index, row in interactions.iterrows():
    user_id = int(row['user_id'])
    review_text = str(row['review'])

    user_review_history[user_id] = " ".join(current_history[user_id])
    
    current_history[user_id].append(review_text)

In [25]:
# 1. Initialize Parameters
alpha = global_avg_rating
betaU = {u: 0.0 for u in reviews_per_user}
betaI = {i: 0.0 for i in reviews_per_item}
lambs = [100.0,500.0,200.0,150.0]

bestMSE = 100.0
bestIndex = 0
for i in range(len(lambs)):
    # 2. Run Training
    lamb = lambs[i]
    alpha = global_avg_rating
    betaU = {u: 0.0 for u in reviews_per_user}
    betaI = {i: 0.0 for i in reviews_per_item}

    alpha, betaU, betaI = goodModel(training_data, reviews_per_user, reviews_per_item, alpha, betaU, betaI, lamb)
    # 3. Evaluate
    train_mse = getMSE(alpha, betaU, betaI, training_data)
    valid_mse = getMSE(alpha, betaU, betaI, validation_data)

    if bestMSE > valid_mse:
        bestMSE = valid_mse
        bestIndex = i

print(bestMSE)
print(lambs[bestIndex])

1.6099008844793672
500.0


In [26]:
# Calculate the MSE of just guessing the global average for everyone
global_avg_mse = 0
for u, i, r in validation_data:
    global_avg_mse += (r - global_avg_rating) ** 2
global_avg_mse /= len(validation_data)

print(f"Global Average Baseline MSE: {global_avg_mse}")

Global Average Baseline MSE: 1.6040004469551954


In [27]:
def predictHybrid(u, i, alpha, betaU, betaI, weight=1.0):
    # 1. Base Prediction (from your existing model)
    bias_pred = alpha + betaU.get(u, 0.0) + betaI.get(i, 0.0)
    
    # 2. Content Bonus (Jaccard)
    jaccard_score = 0.0
    if u in user_ingredient_profiles and i in recipes:
        u_ingredients = user_ingredient_profiles[u]
        i_ingredients = recipes[i]['ingredients']
        
        # Jaccard Calculation
        intersection = len(u_ingredients.intersection(i_ingredients))
        union = len(u_ingredients.union(i_ingredients))
        if union > 0:
            jaccard_score = intersection / union
            
    # Combine them
    return bias_pred + (weight * jaccard_score)

In [29]:
# Create a dictionary: User ID -> Set of all ingredients they have liked
user_ingredient_profiles = defaultdict(set)

print("Building user profiles...")
for u, i, r in training_data:
    # Only consider recipes they liked (e.g., rating >= 4)
    # This reduces noise from things they hated
    if r >= 4:
        # Check if we have ingredient info for this recipe
        if i in recipes:
            recipe_ingredients = recipes[i]['ingredients']
            user_ingredient_profiles[u].update(recipe_ingredients)

print(f"Built profiles for {len(user_ingredient_profiles)} users")

Building user profiles...
Built profiles for 153939 users


In [33]:
# Use the best parameters from your previous tuning (e.g., Lambda=10 or 100)
# alpha, betaU, betaI = ... (your trained values)

weights = [1.25, 1.5, 1.75]
best_hybrid_mse = float('inf')
best_weight = 0

print("Tuning Hybrid Weight...")
for w in weights:
    sse = 0
    for u, i, r in validation_data:
        pred = predictHybrid(u, i, alpha, betaU, betaI, weight=w)
        sse += (r - pred) ** 2
    
    mse = sse / len(validation_data)
    print(f"Weight: {w}, MSE: {mse}")
    
    if mse < best_hybrid_mse:
        best_hybrid_mse = mse
        best_weight = w

print(f"\nBest Hybrid MSE: {best_hybrid_mse} (at weight {best_weight})")

Tuning Hybrid Weight...
Weight: 1.25, MSE: 1.5043767105734847
Weight: 1.5, MSE: 1.504235601373332
Weight: 1.75, MSE: 1.5042930981866565

Best Hybrid MSE: 1.504235601373332 (at weight 1.5)


In [32]:
# Find a case where Hybrid beat the Bias model
print("analyzing improvements...")
for u, i, r in validation_data:
    # 1. Calculate Bias Prediction
    bias_pred = alpha + betaU.get(u, 0.0) + betaI.get(i, 0.0)
    
    # 2. Calculate Hybrid Prediction
    hybrid_pred = predictHybrid(u, i, alpha, betaU, betaI, weight=1.5)
    
    # 3. Check error
    err_bias = abs(r - bias_pred)
    err_hybrid = abs(r - hybrid_pred)
    
    # Did Hybrid win by a lot? (e.g., improved by 0.5 stars)
    if err_hybrid < err_bias - 0.5:
        print(f"User {u} rated Item {i} as {r}")
        print(f"  Bias Prediction: {bias_pred:.2f} (Error: {err_bias:.2f})")
        print(f"  Hybrid Prediction: {hybrid_pred:.2f} (Error: {err_hybrid:.2f})")
        
        # Print ingredients to explain WHY
        if u in user_ingredient_profiles and i in recipes:
            common = user_ingredient_profiles[u].intersection(recipes[i]['ingredients'])
            print(f"  Common Ingredients: {list(common)[:5]}")
        break

analyzing improvements...
User 1805530 rated Item 384181 as 5
  Bias Prediction: 4.23 (Error: 0.77)
  Hybrid Prediction: 4.83 (Error: 0.17)
  Common Ingredients: ['salt', 'vegetable oil', 'flour', 'sugar', 'vanilla extract']


In [36]:


# 1. Load the Pre-Processed Data
pp_recipes = pd.read_csv('./Data/PP_recipes.csv')
pp_users = pd.read_csv('./Data/PP_users.csv')

# 2. Parse the list columns (Strings -> Lists)
# This converts "[1, 2, 3]" to the list [1, 2, 3]
pp_recipes['ingredient_ids'] = pp_recipes['ingredient_ids'].apply(ast.literal_eval)
pp_users['items'] = pp_users['items'].apply(ast.literal_eval)
pp_users['ratings'] = pp_users['ratings'].apply(ast.literal_eval)

# 3. Create Training Samples (Flattening the user history)
# The PP_users file has one row per user. We need to "explode" this 
# so we have one row per interaction (User, Item, Rating) for training.

interaction_list = []

for index, row in pp_users.iterrows():
    user_internal_id = row['u']
    item_list = row['items']
    rating_list = row['ratings']
    
    # Zip them together to get individual interactions
    for item_id, rating in zip(item_list, rating_list):
        interaction_list.append([user_internal_id, item_id, rating])

# Convert to DataFrame
df_train = pd.DataFrame(interaction_list, columns=['u', 'i', 'rating'])

print(df_train.head())
# Output will look like:
#    u      i     rating
#    0    1118     5.0
#    0    27680    5.0

   u       i  rating
0  0    1118     5.0
1  0   27680     5.0
2  0   32541     5.0
3  0  137353     5.0
4  0   16428     5.0


In [37]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 1. Split the data (80% train, 20% validation)
train_df, valid_df = train_test_split(df_train, test_size=0.2, random_state=42)

# 2. Define the PyTorch Dataset
class FoodDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['u'].values, dtype=torch.long)
        self.items = torch.tensor(df['i'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

# 3. Create DataLoaders (Batching)
train_dataset = FoodDataset(train_df)
valid_dataset = FoodDataset(valid_df)

# Batch size of 64 or 128 is standard
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=128, shuffle=False)

In [38]:
class MatrixFactorizationNN(nn.Module):
    def __init__(self, num_users, num_items, n_factors=8):
        super(MatrixFactorizationNN, self).__init__()
        
        # User Embeddings (Gamma_u) & Bias (Beta_u)
        self.user_factors = nn.Embedding(num_users, n_factors)
        self.user_bias = nn.Embedding(num_users, 1)
        
        # Item Embeddings (Gamma_i) & Bias (Beta_i)
        self.item_factors = nn.Embedding(num_items, n_factors)
        self.item_bias = nn.Embedding(num_items, 1)
        
        # Global Bias (Alpha) - a single learnable number
        self.global_bias = nn.Parameter(torch.tensor(0.0))

    def forward(self, user, item):
        # 1. Get Embeddings
        u_factor = self.user_factors(user)
        i_factor = self.item_factors(item)
        
        # 2. Dot Product (Similarity)
        # (Batch_size, Factors) * (Batch_size, Factors) -> Sum -> (Batch_size, 1)
        dot_product = (u_factor * i_factor).sum(dim=1, keepdim=True)
        
        # 3. Add Biases
        u_b = self.user_bias(user)
        i_b = self.item_bias(item)
        
        # Prediction = Alpha + Beta_u + Beta_i + (Gamma_u * Gamma_i)
        output = self.global_bias + u_b + i_b + dot_product
        
        return output.squeeze() # Remove extra dimension

In [41]:
# 1. Initialize Model
# We need the max ID to know how many embeddings to create
num_users = df_train['u'].max() + 1
num_items = df_train['i'].max() + 1

model = MatrixFactorizationNN(num_users, num_items, n_factors=8) # 8 latent factors
criterion = nn.MSELoss() # We want to minimize Mean Squared Error
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
# 2. Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Starting Neural Training...")
for epoch in range(5): # Run for 5 epochs
    model.train()
    total_loss = 0
    
    for users, items, ratings in train_loader:
        users, items, ratings = users.to(device), items.to(device), ratings.to(device)
        
        # Forward pass
        predictions = model(users, items)
        loss = criterion(predictions, ratings)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    # Validation Step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for users, items, ratings in valid_loader:
            users, items, ratings = users.to(device), items.to(device), ratings.to(device)
            preds = model(users, items)
            v_loss = criterion(preds, ratings)
            val_loss += v_loss.item()
            
    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = val_loss / len(valid_loader)
    
    print(f"Epoch {epoch+1}: Train MSE = {avg_train_loss:.4f}, Valid MSE = {avg_val_loss:.4f}")

Starting Neural Training...
Epoch 1: Train MSE = 1.8997, Valid MSE = 0.8472
Epoch 2: Train MSE = 0.8457, Valid MSE = 0.8468
Epoch 3: Train MSE = 0.8462, Valid MSE = 0.8458
Epoch 4: Train MSE = 0.8460, Valid MSE = 0.8472
Epoch 5: Train MSE = 0.8460, Valid MSE = 0.8488
