In [None]:
#Part 1: Data Exploration and Analysis
import ast
import pickle

import numpy as np
import pandas as pd

# Load the two raw CSV exports and the pickled ingredient map.
recipes = pd.read_csv('RAW_recipes.csv')
interactions = pd.read_csv('RAW_interactions.csv')
# NOTE(review): unpickling can execute arbitrary code; only load ingr_map.pkl
# from a source you trust.
df = pd.read_pickle('ingr_map.pkl')

# Preview the first rows of each dataset under its own label.
print("Recipes Dataset Sample:\n", recipes.head())
print("\n\nInteractions Dataset Sample:\n", interactions.head())
print("\n\nIngr Map Sample:\n", df.head())

In [None]:
#Further Data Inspection
# DataFrame.info() writes its report straight to stdout and returns None.
# Passing it as an argument to print() therefore shows the report *before* the
# label and then prints a stray "None", so the label is printed first and
# info() is called on its own line.

print("\nRecipes Dataset info:\n")
recipes.info()
print("\n\nInteractions Dataset info:\n")
interactions.info()
print("\n\nMapping Key  info:\n")
df.info()

In [None]:
#Handling Missing values
# Tally the null cells in every column of each raw table before any cleaning.
for dataset_label, dataset in (("Recipes", recipes), ("Interactions", interactions)):
    print(f"\nMissing Data in {dataset_label}:\n")
    print(dataset.isnull().sum())

In [None]:
#Filling missing rows with placeholder strings
# Free-text columns receive a placeholder instead of having their rows dropped.
recipes['description'] = recipes['description'].fillna(value='No description available')
interactions['review'] = interactions['review'].fillna(value='No reviews available')

# Repeat the null tally so the effect of the fill is visible.
remaining_nulls = {
    "Recipes": recipes.isnull().sum(),
    "Interactions": interactions.isnull().sum(),
}
for dataset_label, null_counts in remaining_nulls.items():
    print(f"\nMissing Data in {dataset_label}:\n")
    print(null_counts)

In [None]:
#Dropping Rows with missing values in critical columns
# A recipe without a name, ingredients or steps, and an interaction without a
# rating, are removed from their tables.
recipes = recipes.dropna(subset=['name', 'ingredients', 'steps'])
interactions = interactions.dropna(subset=['rating'])

# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after each report.
print("\nCleaned Recipes Dataset Info:")
recipes.info()


print("\nCleaned Interactions Dataset Info:")
interactions.info()


In [None]:
#Normalize Ingredients

def normalize_ingredients(ingredients_list, ingr_map):
    """Return a new list with every ingredient replaced by its mapped form.

    Each entry of ``ingredients_list`` is looked up with ``ingr_map.get``; an
    entry that has no mapping is kept unchanged. Order and length of the input
    are preserved and the input list itself is not modified.
    """
    canonical_names = []
    for raw_name in ingredients_list:
        canonical_names.append(ingr_map.get(raw_name, raw_name))
    return canonical_names


# The CSV stores each ingredient list as its Python repr. ast.literal_eval only
# accepts literals, so a malformed or hostile cell raises instead of executing
# code the way eval() would.
recipes['ingredients'] = recipes['ingredients'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

# normalize_ingredients() looks names up with .get(key, default). On a
# DataFrame, .get() selects a *column* by label, so passing the ingredient-map
# frame directly leaves every ingredient unchanged. Build a plain dict first.
# NOTE(review): assumes the map has 'raw_ingr' and 'replaced' columns — confirm
# against df.columns. Any other object is passed through untouched.
if isinstance(df, pd.DataFrame) and {'raw_ingr', 'replaced'}.issubset(df.columns):
    ingredient_lookup = dict(zip(df['raw_ingr'], df['replaced']))
else:
    ingredient_lookup = df

recipes['normalized_ingredients'] = recipes['ingredients'].apply(
    lambda x: normalize_ingredients(x, ingredient_lookup)
)
print("Done normalizing!")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


#Feature Engineering

recipes['num_ingredients'] = recipes['ingredients'].apply(len)
# 'steps' holds the repr of a list; literal_eval parses it without running code.
recipes['num_steps'] = recipes['steps'].apply(lambda x: len(ast.literal_eval(x)))

#Parsing Nutritional Information
nutrition_columns = ['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']


def _parse_nutrition(raw_value):
    """Return one row of nutrition values as a list.

    A string is parsed as a Python literal; any other value (e.g. NaN for a
    missing cell) yields one NaN per nutrition column.
    """
    if isinstance(raw_value, str):
        return list(ast.literal_eval(raw_value))
    return [np.nan] * len(nutrition_columns)


# Build the seven columns in one DataFrame construction instead of creating a
# pd.Series per row. Reusing recipes.index keeps rows aligned after the
# earlier dropna().
nutrition_frame = pd.DataFrame(
    recipes['nutrition'].apply(_parse_nutrition).tolist(),
    index=recipes.index,
    columns=nutrition_columns,
)
recipes[nutrition_columns] = nutrition_frame

#Visualization
def _show_histogram(values, bins, color, title, xlabel):
    """Render one 10x6 count histogram (no KDE curve) with labels and display it."""
    fig, ax = plt.subplots(figsize=(10,6))
    sns.histplot(values, kde=False, bins=bins, color=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    plt.show()


#Distribution of # of ingredients
_show_histogram(
    recipes['num_ingredients'],
    bins=20,
    color='blue',
    title="Distribution of Number of Ingredients per Recipe",
    xlabel="Number of Ingredients",
)

#Distribution of Ratings
_show_histogram(
    interactions['rating'],
    bins=10,
    color='green',
    title="Distribution of Ratings",
    xlabel="Rating",
)

# Write both cleaned tables to CSV (row index omitted) for Part 2 to reload.
for cleaned_frame, csv_path in (
    (recipes, 'cleaned_recipes.csv'),
    (interactions, 'cleaned_interactions.csv'),
):
    cleaned_frame.to_csv(csv_path, index=False)

print("\nData Cleaning and Feature Engineering Complete!!!")

In [None]:
#Part 2: Building the Recommendation System

# Part 2 starts from the CSV snapshots written at the end of Part 1.
recipes = pd.read_csv('cleaned_recipes.csv')
interactions = pd.read_csv('cleaned_interactions.csv')

# Show the first rows of each reloaded table.
for reloaded_frame in (recipes, interactions):
    print(reloaded_frame.head())

In [None]:
#Merging and Preparing Data

# Inner join: keep only interactions whose recipe_id matches a recipe id.
merged_data = interactions.merge(
    recipes,
    how='inner',
    left_on='recipe_id',
    right_on='id',
)

#Confirm Structure
print(merged_data.head())

In [None]:
#(Optional) Finding 'common' ingredients

from collections import Counter

# After the CSV round-trip the ingredient lists are strings again. Parse them
# with ast.literal_eval, which accepts only literals and cannot execute code
# the way eval() would.
recipes['ingredients'] = recipes['ingredients'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

# Flatten every recipe's ingredient list into one long list.
all_ingredients = [ingredient for recipe in recipes['ingredients'] for ingredient in recipe]

# Count the frequency of each ingredient
ingredient_counts = Counter(all_ingredients)

# Keep the N most common ingredients, most frequent first
N = 20
common_ingredients = [ingredient for ingredient, _ in ingredient_counts.most_common(N)]

print(f"Top {N} common ingredients: {common_ingredients}")


In [None]:
#Collaborative Filtering

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

from scipy.sparse import csr_matrix

interactions['rating'] = interactions['rating'].astype(float)

# Create a sparse interaction matrix.
# Raw user/recipe ids are sparse identifiers; used directly as coordinates they
# size the matrix by the largest id rather than by the number of distinct
# users and recipes. pd.factorize() gives every distinct id a dense 0-based
# code (first-seen order) and returns the ids in code order, so row i is
# all_user_ids[i] and column j is all_recipe_ids[j].
all_user_codes, all_user_ids = pd.factorize(interactions['user_id'])
all_recipe_codes, all_recipe_ids = pd.factorize(interactions['recipe_id'])

# Repeated (user, recipe) pairs are summed by the COO -> CSR conversion.
interaction_matrix_sparse = csr_matrix(
    (interactions['rating'].to_numpy(), (all_user_codes, all_recipe_codes)),
    shape=(len(all_user_ids), len(all_recipe_ids)),
)

In [None]:
# Filter to include only users who have rated more than a threshold
user_threshold = 10
active_users = interactions['user_id'].value_counts()
active_users = active_users[active_users > user_threshold].index
interactions_filtered = interactions[interactions['user_id'].isin(active_users)]

# Filter to include only recipes with many ratings
recipe_threshold = 10
popular_recipes = interactions['recipe_id'].value_counts()
popular_recipes = popular_recipes[popular_recipes > recipe_threshold].index
interactions_filtered = interactions_filtered[interactions_filtered['recipe_id'].isin(popular_recipes)]


interaction_matrix_sparse = csr_matrix(
    (interactions_filtered['rating'], (interactions_filtered['user_id'], interactions_filtered['recipe_id']))
)


In [None]:
# Subsampling interaction matrix to avoid memory crashes
interaction_matrix_sparse = interaction_matrix_sparse[:5000, :500]
#Batch Processing for SVD

from scipy.sparse.linalg import svds

#perform SVD

U, sigma, Vt = svds(interaction_matrix_sparse[:5000, :500], k=20)
sigma = np.diag(sigma)

In [None]:
#Reccomendation Phase

#Reconstructing the approximated interaction matrix
reconstructed_matrix = np.dot(np.dot(U, sigma), Vt)

#Ensure values are between the expected range
reconstructed_matrix = np.clip(reconstructed_matrix, 1, 5)

In [None]:

sample_users = interactions['user_id'].drop_duplicates().iloc[:reconstructed_matrix.shape[0]]
sample_users = sample_users.iloc[:reconstructed_matrix.shape[0]]
sample_recipes = interactions['recipe_id'].drop_duplicates().iloc[:reconstructed_matrix.shape[1]]

sample_recipes = sample_recipes.iloc[:reconstructed_matrix.shape[1]]
recipe_id_mapping = {recipe: idx for idx, recipe in enumerate(sample_recipes)}
index_to_recipe_id = {idx: recipe for recipe, idx in recipe_id_mapping.items()}
user_to_index = {user_id: index for index, user_id in enumerate(interactions['user_id'].unique())}

# # Check if the problematic User ID is included
# if user_id_to_check not in sample_users.values:
#     print(f"Adding User ID {user_id_to_check} to the dataset.")
#     #sample_users = sample_users.append(pd.Series([user_id_to_check])).drop_duplicates()
#     sample_users = pd.concat([sample_users, pd.Series([user_id_to_check])]).drop_duplicates()
    
user_id_mapping = {user_id: idx for idx, user_id in enumerate(sample_users)}
#print(f"User ID {user_id_to_check} added to user_id_mapping.")

new_user_row = np.zeros(reconstructed_matrix.shape[1])  # A row of zeros for all recipes
reconstructed_matrix = np.vstack([reconstructed_matrix, new_user_row])
print(f"Reconstructed matrix extended. New shape: {reconstructed_matrix.shape}")

if 'reconstructed_matrix' not in globals():
    print("Error: 'reconstructed_matrix' is not defined. Please ensure it is calculated.")
else:
    # Function to recommend recipes for a given user
    def recommended_recipes(user_id, reconstructed_matrix, recipe_mapping, top_n=5):
    # Check if user exists in the mapping
        if user_id not in user_id_mapping:
            print(f"User ID {user_id} not found. Returning default recommendations.")
            return [recipe_mapping[idx] for idx in range(top_n)]
    
        user_index = user_id_mapping[user_id]
        if user_index >= reconstructed_matrix.shape[0]:
            print(f"User index {user_index} is out of bounds for reconstructed_matrix.")
            return []
    
        # Get predicted ratings for the user
        predicted_ratings = reconstructed_matrix[user_index]
    
        # Check bounds for recipe indices
        if len(predicted_ratings) != len(recipe_mapping):
            print(f"Mismatch: {len(predicted_ratings)} predictions vs {len(recipe_mapping)} recipes.")
            return []
    
        # Get top N recipes
        recommended_indices = np.argsort(predicted_ratings)[::-1][:top_n]
        recommended_recipes = [recipe_mapping[idx] for idx in recommended_indices if idx in recipe_mapping]
        return recommended_recipes

    # Sample user and get recommendations
    sample_users = interactions['user_id'].drop_duplicates().sample(n=10000)
    user_id = sample_users.iloc[0]  # Choose the first sample user

    top_recipes = recommended_recipes(user_id, reconstructed_matrix, index_to_recipe_id)

    if top_recipes:
        print(f"Top recommended recipes for user {user_id}: {top_recipes}")


In [None]:
# Add the focus user to the row mapping and extend reconstructed_matrix
user_id_to_check = 1858424
focus_user_id = user_id_to_check

# user_id_mapping is the lookup recommended_recipes() consults, so membership
# is tested there and the new row is registered there. The mapping is extended
# in place rather than rebuilt from sample_users: rebuilding would discard the
# existing row alignment, and trimming sample_users to the matrix height could
# cut off the user that was just appended. Re-running this cell adds no
# further rows once the user is mapped.
if focus_user_id not in user_id_mapping:
    print(f"Adding User ID {user_id_to_check} to sample_users and extending reconstructed_matrix.")
    if focus_user_id not in sample_users.values:
        sample_users = pd.concat([sample_users, pd.Series([focus_user_id])]).reset_index(drop=True)

    # Extend reconstructed_matrix
    # NOTE(review): an all-zero row carries no preference signal, so the
    # resulting ranking for this user is arbitrary.
    new_user_row = np.zeros(reconstructed_matrix.shape[1])  # Add a row for the new user
    reconstructed_matrix = np.vstack([reconstructed_matrix, new_user_row])
    user_id_mapping[focus_user_id] = reconstructed_matrix.shape[0] - 1
    print(f"Reconstructed matrix extended. New shape: {reconstructed_matrix.shape}")

# Validate focus user
focus_user_index = user_id_mapping.get(focus_user_id, -1)

if focus_user_index < 0 or focus_user_index >= reconstructed_matrix.shape[0]:
    print(f"Error: User index {focus_user_index} is out of bounds for reconstructed_matrix.")
else:
    print(f"Focus user {focus_user_id} is valid and mapped to index {focus_user_index}.")

# Generate recommendations for the focus user
top_recipes = recommended_recipes(focus_user_id, reconstructed_matrix, index_to_recipe_id, top_n=5)

if top_recipes:
    print(f"Top recommended recipes for user {focus_user_id}: {top_recipes}")
else:
    print(f"No recommendations available for user {focus_user_id}.")


In [None]:
# Filter the recommended recipes from the dataset.
# Use the ids just produced for the focus user (top_recipes) instead of a
# hard-coded list copied from an earlier run, so the table always matches the
# recommendations printed above. isin() keeps the dataset's row order, not the
# recommendation ranking.
recommended_recipes_df = recipes[recipes['id'].isin(top_recipes)]
print(recommended_recipes_df[['id','name', 'description']])