# Recipe Recommendation Challenge

### Leveraging Past Reviews and Recipe Metadata on Food.com

### Import libraries

In [1]:
# Operating System
import os

# Numerical Operations
import numpy as np

# Data Manipulation
import pandas as pd

# Date and Time
from datetime import datetime

# Counter for counting occurrences
from collections import Counter

# Plotting
import matplotlib.pyplot as plt

# Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer

# Dimensionality Reduction
from sklearn.decomposition import TruncatedSVD

# Evaluation Metrics
from sklearn.metrics import mean_squared_error

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split

# Text Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity Metrics
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Recommendation Models from Surprise
from surprise import Dataset, Reader, KNNBasic, accuracy, BaselineOnly, SVD, CoClustering

# Evaluation and Model from ieseg_recsys package
from ieseg_recsys import eval, model


### Import datasets

In [2]:
# Read the training data from the CSV file into a DataFrame
train_df = pd.read_csv(r"train.csv")

# Read the metadata from the CSV file into a DataFrame
metadata_df = pd.read_csv(r"metadata.csv")

# Read the sample submission data from the CSV file into a DataFrame
sample_submission_df = pd.read_csv(r"sample_submission.csv")

# Split the 'id' column of the sample submission DataFrame by '_' and expand it into separate columns 'user_id' and 'recipe_id'
sample_submission_df[["user_id", "recipe_id"]] = sample_submission_df["id"].str.split("_", expand=True)

# Create a new DataFrame 'submission_df' by dropping the 'rating' column from the sample submission DataFrame
submission_df = sample_submission_df.drop('rating', axis=1)


### Inspect datasets

In [3]:
train_df.head(2)

Unnamed: 0,user_id,recipe_id,date,rating,review
0,U9240752,R6574412,2003-02-17,5,Great with a salad. Cooked on top of stove for...
1,U3645318,R6574412,2011-12-21,6,"So simple, so delicious! Great for chilly fall..."


In [4]:
metadata_df.head(2)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,buttermilk pie in cornmeal pastry,R5936467,100,U1964167,1999-08-06,"['weeknight', 'time-to-make', 'course', 'main-...","[459.0, 29.0, 163.0, 13.0, 21.0, 32.0, 20.0]",24,"['for pastry: sift together flour and salt', '...","post by request. an old-fashioned, southern pie.","['flour', 'salt', 'cornmeal', 'shortening', 'c...",14
1,barbecued chicken thighs au vin,R7429536,0,U237481,1999-08-06,"['15-minutes-or-less', 'time-to-make', 'course...","[273.4, 24.0, 29.0, 3.0, 33.0, 23.0, 3.0]",15,"['put chicken thighs in a freezer bag', 'in a ...",,"['chicken thighs', 'vegetable oil', 'butter', ...",11


In [5]:
submission_df.head(2)

Unnamed: 0,id,user_id,recipe_id
0,U4203696_R5764788,U4203696,R5764788
1,U3421458_R8813647,U3421458,R8813647


##### User who have provided reviews for more than 10 reciepes

In [6]:
# Count the number of reviews for each recipe and store the counts in 'recipe_id_count' Series
recipe_id_count = train_df["recipe_id"].value_counts()

# Define the minimum number of reviews required to keep a recipe
min_count = 10

# Create a filter for recipes with 10 or more reviews and extract the corresponding recipe IDs
recipeid_filter = list(recipe_id_count[recipe_id_count >= min_count].index)

# Filter the training DataFrame to keep only the recipes with 10 or more reviews and reset the index
train_df = train_df[train_df['recipe_id'].isin(recipeid_filter)].reset_index(drop=True)

# Print the number of recipes removed based on the filtering condition
print(f"Removed {len(recipe_id_count) - (recipe_id_count > min_count).sum()} recipes")

Removed 60445 recipes


##### Reciepes that are rated more than 5 times 

In [7]:
# Count the number of reviews for each user and store the counts in 'user_id_count' Series
user_id_count = train_df["user_id"].value_counts()

# Define the minimum number of reviews required to keep a user
min_count = 5

# Create a filter for users with 5 or more reviews and extract the corresponding user IDs
user_filter = list(user_id_count[user_id_count >= min_count].index)

# Filter the training DataFrame to keep only the users with 5 or more reviews and reset the index
train_df = train_df[train_df['user_id'].isin(user_filter)].reset_index(drop=True)

# Print the number of users removed based on the filtering condition
print(f"Removed {len(user_id_count) - (user_id_count >= min_count).sum()} users")

Removed 4612 users


##### Merge train and metadata

In [8]:
# Merge metadata and training data on recipe_id
merged_data = pd.merge(metadata_df, train_df, left_on="id", right_on='recipe_id', how="right")

In [9]:
# Check for null or empty values
print(merged_data.isnull().sum())

name                 0
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       1099
ingredients          0
n_ingredients        0
user_id              0
recipe_id            0
date                 0
rating               0
review               0
dtype: int64


##### Drop Null description

In [10]:
# Drop rows with missing values in the 'description' column and update the DataFrame
merged_data = merged_data.dropna(subset=['description'])

In [11]:
merged_data.shape

(46041, 17)

##### Feature creation

##### Feature Engineering for Timestamp Columns

In [12]:
def feature_engineering(df, column_name, suffix):
    """
    Function to perform feature engineering on timestamp columns.

    Args:
    - df: DataFrame containing the data
    - column_name: Name of the timestamp column to be processed
    - suffix: Suffix to be added to the new feature names

    Returns:
    - df: DataFrame with new features added based on the timestamp column
    """
    # Convert column to datetime type
    df[column_name] = pd.to_datetime(df[column_name])
    
    # Feature engineering for the column
    df[f'year_{suffix}'] = df[column_name].dt.year
    df[f'month_{suffix}'] = df[column_name].dt.month
    df[f'day_{suffix}'] = df[column_name].dt.day
    df[f'day_of_week_{suffix}'] = df[column_name].dt.dayofweek
    df[f'quarter_{suffix}'] = df[column_name].dt.quarter
    df[f'is_weekend_{suffix}'] = df[column_name].dt.dayofweek.isin([5, 6]).astype(int)
    
    # Calculate the reference date as the minimum date from the column
    reference_date = df[column_name].min()
    
    # Feature engineering for the column
    df[f'elapsed_days_{suffix}'] = (df[column_name] - reference_date).dt.days
    
    # Define seasons based on month
    seasons = {1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring',
               6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Fall', 10: 'Fall',
               11: 'Fall', 12: 'Winter'}
    df[f'season_{suffix}'] = df[f'month_{suffix}'].map(seasons)
    
    return df

# Apply feature engineering function to 'submitted' column with suffix '_review'
merged_data = feature_engineering(merged_data, 'submitted', 'recipes')
merged_data = feature_engineering(merged_data, 'date', 'review')

##### Splitting DataFrame into Train, Validation, and Test Sets

In [13]:
# Define cutoff years for validation and test sets
cutoff_year_test = 2009
cutoff_year_val = 2007

# Split the data into training, validation, and test sets
train = merged_data[merged_data['year_review'] < cutoff_year_val].reset_index(drop=True)
validation = merged_data[(merged_data['year_review'] >= cutoff_year_val) & (merged_data['year_review'] < cutoff_year_test)].reset_index(drop=True)
test = merged_data[merged_data['year_review'] >= cutoff_year_test].reset_index(drop=True)

##### Filter columns for train, validation and test

In [14]:
# Define columns to keep in the DataFrame
filter_cols = ["user_id", "recipe_id", "rating"]

# Filter the columns in the train, validation, and test DataFrames
train = train[filter_cols]
validation = validation[filter_cols]
test = test[filter_cols]


##### Setting up Surprise Reader and Training/Test Sets

In [15]:
# Set up reader with minimum and maximum rating scale
reader = Reader(rating_scale=(1, 6))

# Load training data and build full training set
df_train = Dataset.load_from_df(train, reader).build_full_trainset()

# Load validation data, build full training set, and then build test set
df_validation = Dataset.load_from_df(validation, reader).build_full_trainset().build_testset()

# Convert test DataFrame to list of tuples
df_test = list(test.itertuples(index=False, name=None))

### Collaborative filtering system

In [16]:
# Define options for each model
model_options = [
    {"name": "ub_cf_cosine", "model": KNNBasic(k=20, min_k=5, sim_options={'name': 'cosine', 'user_based': True}, random_state=42)},
    {"name": "ib_cf_pearson", "model": KNNBasic(k=10, min_k=2, sim_options={'name': 'pearson', 'user_based': False}, random_state=42)},
    {"name": "als_cf", "model": BaselineOnly(bsl_options={"method": "als", "n_epochs": 40})},
    {"name": "svd_cf", "model": SVD(n_factors=20, biased=False, random_state=42)},
    {"name": "clust_cf", "model": CoClustering(n_cltr_u=10, n_cltr_i=10, n_epochs=50, random_state=42)}
]

# Initialize empty lists to store results
model_names = []
validation_rmse = []
test_rmse_list = []  # Changed the name to test_rmse_list to avoid confusion with test_rmse variable
validation_mae = []
test_mae_list = []  # Changed the name to test_mae_list to avoid confusion with test_mae variable
impossible_validation = []
impossible_test = []

# Fit models and store results
for model_info in model_options:
    model_name = model_info["name"]
    model = model_info["model"]
    
    model.fit(df_train)
    
    pred_validation = model.test(df_validation)
    val_rmse = accuracy.rmse(pred_validation)
    val_mae = accuracy.mae(pred_validation)
    
    pred_test = model.test(df_test)
    test_rmse = accuracy.rmse(pred_test)
    test_mae = accuracy.mae(pred_test)
    
    perc_impossible_validation = sum(1 for i in pred_validation if i.details["was_impossible"]) / len(pred_validation)
    perc_impossible_test = sum(1 for i in pred_test if i.details["was_impossible"]) / len(pred_test)
    
    # Append results to lists
    model_names.append(model_name)
    validation_rmse.append(val_rmse)
    validation_mae.append(val_mae)
    test_rmse_list.append(test_rmse)  # Corrected variable name
    test_mae_list.append(test_mae)  # Corrected variable name
    impossible_validation.append(perc_impossible_validation)
    impossible_test.append(perc_impossible_test)

# Create DataFrame from lists
results_cf = pd.DataFrame({
    "Model": model_names,
    "Validation RMSE": validation_rmse,
    "Validation MAE": validation_mae,
    "Test RMSE": test_rmse_list,  # Corrected variable name
    "Test MAE": test_mae_list,  # Corrected variable name
    "Impossible Predictions (Validation)": impossible_validation,
    "Impossible Predictions (Test)": impossible_test
})

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8833
MAE:  0.5405
RMSE: 1.1728
MAE:  0.6423
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8831
MAE:  0.5404
RMSE: 1.1728
MAE:  0.6423
Estimating biases using als...
RMSE: 0.8809
MAE:  0.5276
RMSE: 1.1729
MAE:  0.6319
RMSE: 2.7709
MAE:  1.8695
RMSE: 2.1323
MAE:  1.2505
RMSE: 0.9905
MAE:  0.5576
RMSE: 1.2105
MAE:  0.6516


In [17]:
# Output results in table format
results_cf

Unnamed: 0,Model,Validation RMSE,Validation MAE,Test RMSE,Test MAE,Impossible Predictions (Validation),Impossible Predictions (Test)
0,ub_cf_cosine,0.883263,0.54046,1.172839,0.642346,0.998922,0.999115
1,ib_cf_pearson,0.883142,0.540402,1.17284,0.64229,1.0,1.0
2,als_cf,0.880931,0.527578,1.17294,0.631894,0.0,0.0
3,svd_cf,2.770879,1.869513,2.132286,1.250473,0.672984,0.841138
4,clust_cf,0.990459,0.557552,1.210521,0.651571,0.0,0.0


The best model among the options is the Alternating Least Squares (ALS) Collaborative Filtering (CF) model. Here's why:

- **Validation RMSE**: 0.880931
- **Validation MAE**: 0.527578
- **Test RMSE**: 1.172940
- **Test MAE**: 0.631894
- **Impossible Predictions (Validation)**: 0.000000
- **Impossible Predictions (Test)**: 0.000000

The ALS model exhibits the lowest validation RMSE and MAE, indicating superior accuracy in predicting user ratings during model validation. Furthermore, it demonstrates a low RMSE and MAE on the test set, suggesting reliable performance in real-world scenarios. Additionally, the ALS model does not encounter any impossible predictions during both validation and testing phases, highlighting its robustness and ability to handle various scenarios effectively.


### Content based system

In [18]:
# Randomly sample 20,000 rows from the merged_data DataFrame, using a fixed random state for reproducibility.
merged_data_sample = merged_data.sample(n=20000, random_state=42)

In [19]:
merged_data_sample.head(2)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,elapsed_days_recipes,season_recipes,year_review,month_review,day_review,day_of_week_review,quarter_review,is_weekend_review,elapsed_days_review,season_review
8108,pork loin chops with garlic sauce,R8646589,27,U2559031,2002-02-16,"['30-minutes-or-less', 'time-to-make', 'course...","[711.5, 73.0, 0.0, 8.0, 108.0, 92.0, 4.0]",11,"['salt and pepper pork', 'heat oil and butter ...","thick chops in a creamy garlic sauce,",...,922,Winter,2006,1,1,6,1,1,1749,Winter
18034,chocolate cheese ball,R7522343,240,U2033542,2001-11-18,"['weeknight', 'time-to-make', 'course', 'main-...","[270.7, 34.0, 63.0, 4.0, 5.0, 56.0, 5.0]",10,"['in a mixing bowl , beat the cream cheese and...","kids love this sweet treat, as do a lot of adu...",...,832,Fall,2004,3,4,3,1,0,1081,Spring


##### Feature creation

In [20]:
# Convert minutes to hours by dividing by 60
merged_data_sample['hours'] = merged_data_sample['minutes'] / 60

# Find the maximum value of hours
max_hours = merged_data_sample['hours'].max()

print(max_hours)

336.5


In [21]:
# Round off the values in the 'hours' column
merged_data_sample['rounded_hours'] = merged_data_sample['hours'].round()

# Find the percentage distribution of rounded values in the 'hours' column
rounded_hours_distribution = merged_data_sample['rounded_hours'].value_counts(normalize=True) * 100

# Display the percentage distribution of rounded values
rounded_hours_distribution.head(5)

rounded_hours
1.0    45.675
0.0    35.480
2.0     7.285
3.0     2.515
4.0     2.240
Name: proportion, dtype: float64

In [22]:
# Define cooking time categories based on hours
def categorize_cooking_time(hours):
    if hours < 1:
        return 'Less_1_hour'
    elif 1 <= hours < 3:
        return '1_to_2_hours'
    elif 3 <= hours < 6:
        return '3_to_5_hours'
    elif 6 <= hours < 12:
        return '6_to_11_hours'
    else:
        return 'More_12_hours'

# Apply categorize_cooking_time function to create a new feature 'cooking_time_category'
merged_data_sample['cooking_time_category'] = merged_data_sample['rounded_hours'].apply(categorize_cooking_time)

merged_data_sample['cooking_time_category'].head(5)

8108      Less_1_hour
18034    3_to_5_hours
36820     Less_1_hour
1710      Less_1_hour
36136    1_to_2_hours
Name: cooking_time_category, dtype: object

In [23]:
# Remove square brackets from the 'nutrition' column
merged_data_sample['nutrition'] = merged_data_sample['nutrition'].apply(lambda x: [float(i.strip('[]')) for i in x.split(',')])

In [24]:
# Define function to create features for each nutrient
def create_nutrition_features(nutrition_list):
    # Convert values to float and replace missing values with NaN
    nutrition_values = [float(value) if value != '.' else np.nan for value in nutrition_list]
    return nutrition_values

# Apply function to create new features
merged_data_sample[['calories', 'protein', 'fat', 'carbs', 'fiber', 'sugar', 'sodium']] = merged_data_sample['nutrition'].apply(create_nutrition_features).apply(pd.Series)

  merged_data_sample[['calories', 'protein', 'fat', 'carbs', 'fiber', 'sugar', 'sodium']] = merged_data_sample['nutrition'].apply(create_nutrition_features).apply(pd.Series)


In [25]:
merged_data_sample[['user_id','calories', 'protein', 'fat', 'carbs', 'fiber', 'sugar', 'sodium']].head(5)

Unnamed: 0,user_id,calories,protein,fat,carbs,fiber,sugar,sodium
8108,U619055,711.5,73.0,0.0,8.0,108.0,92.0,4.0
18034,U8142257,270.7,34.0,63.0,4.0,5.0,56.0,5.0
36820,U8855791,222.8,21.0,77.0,4.0,7.0,30.0,7.0
1710,U5480935,162.6,11.0,5.0,16.0,6.0,22.0,6.0
36136,U5377742,671.4,52.0,12.0,31.0,100.0,46.0,11.0


In [26]:
# Function to classify recipes as healthy or not based on provided criteria
def classify_healthiness(record):
    if record['calories'] < 500 and record['fat'] < 15 and record['sugar'] < 30 and record['sodium'] < 15:
        return 1
    else:
        return 0

# Apply function to create new feature 'healthiness'
merged_data_sample['healthiness'] = merged_data_sample.apply(classify_healthiness, axis=1)

##### TF-IDF Tags

In [27]:
# Define a function to reformat the string list
def reformat_string_list(items):
    # Join items in the list into a single string with spaces
    return ' '.join(item.replace(" ", "").replace("'", "") for item in items)

# Apply reformatting to the 'tags' column
if isinstance(merged_data_sample['tags'].iloc[0], list):
    merged_data_sample['tags_combined'] = merged_data_sample['tags'].apply(reformat_string_list)
else:
    merged_data_sample['tags_combined'] = merged_data_sample['tags'].copy()

# Initialize the TF-IDF Vectorizer with max_features set to 50
tfidf_vectorizer = TfidfVectorizer(max_features=50)

# Fit and transform the tags data
tags_tfidf = tfidf_vectorizer.fit_transform(merged_data_sample['tags_combined'])

# Convert the TF-IDF matrices to DataFrames
tags_tfidf_df = pd.DataFrame(tags_tfidf.toarray(), columns=['tfidf_tag_' + feat for feat in tfidf_vectorizer.get_feature_names_out()])

# Reset the indices of both DataFrames to ensure alignment
merged_data_sample_reset = merged_data_sample.reset_index(drop=True)
tags_tfidf_df_reset = tags_tfidf_df.reset_index(drop=True)

# Concatenate the DataFrames column-wise
combined_sample_df = pd.concat([merged_data_sample_reset, tags_tfidf_df_reset], axis=1)

In [28]:
combined_sample_df.shape

(20000, 95)

##### TF-IDF Ingredients

In [29]:
# Define a function to reformat the string list
def reformat_string_list(items):
    # Join items in the list into a single string with spaces
    return ' '.join(item.replace(" ", "").replace("'", "") for item in items)

# Apply reformatting to the 'ingredients' column
if isinstance(combined_sample_df['ingredients'].iloc[0], list):
    combined_sample_df['ingredients_combined'] = combined_sample_df['ingredients'].apply(reformat_string_list)
else:
    combined_sample_df['ingredients_combined'] = combined_sample_df['ingredients'].copy()

# Initialize the TF-IDF Vectorizer with max_features set to 50
tfidf_vectorizer = TfidfVectorizer(max_features=50)

# Fit and transform the ingredients data
ingredients_tfidf = tfidf_vectorizer.fit_transform(combined_sample_df['ingredients_combined'])

# Convert the TF-IDF matrices to DataFrames
ingredients_tfidf_df = pd.DataFrame(ingredients_tfidf.toarray(), columns=['tfidf_ingredient_' + feat for feat in tfidf_vectorizer.get_feature_names_out()])

# Reset the indices of both DataFrames to ensure alignment
ingredients_tfidf_df_reset = ingredients_tfidf_df.reset_index(drop=True)

# Concatenate the DataFrames column-wise
combined_sample_data_df = pd.concat([combined_sample_df, ingredients_tfidf_df_reset], axis=1)

In [30]:
# Extract unique words from the 'name' column
unique_words = set(' '.join(combined_sample_data_df['name']).split())

# Create binary columns for each unique word
for word in unique_words:
    combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)

# Drop the original 'name' column
combined_sample_data_df.drop(columns=['name'], inplace=True)

  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_df['name'].apply(lambda x: 1 if word in x else 0)
  combined_sample_data_df[word] = combined_sample_data_

In [31]:
combined_sample_data_df.shape

(20000, 2135)

##### One hot encoding for season_recipes, season_review, cooking_time_category

In [32]:
# Perform one-hot encoding for the specified columns
combined_sample_data_df = pd.get_dummies(combined_sample_data_df, columns=['season_recipes', 'season_review', 'cooking_time_category'], drop_first=True, dtype=int)

##### Drop columns

In [33]:
# Drop the specified columns and store the resulting DataFrame in a new variable
basetable_df = combined_sample_data_df.drop(columns=['id', 'contributor_id', 'submitted', 'tags', 'nutrition', 'steps', 'description', 'ingredients', 'date', 'review', 'tags_combined', 'ingredients_combined', 'user_id', 'rating'])

In [34]:
# Set 'recipe_id' as the index of basetable_df
basetable_df.set_index('recipe_id', inplace=True)

In [35]:
basetable_df.shape

(20000, 2127)

##### Datatype casting

In [36]:
# Convert float64 columns to float32
basetable_df = basetable_df.astype({col: np.float32 for col in basetable_df.select_dtypes(include='float64').columns})

# Convert int64 columns to int32
basetable_df = basetable_df.astype({col: np.int32 for col in basetable_df.select_dtypes(include='int64').columns})

In [37]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets
train, test = train_test_split(merged_data_sample[['user_id', 'recipe_id', 'rating']], test_size=0.2, random_state=42)

# Reset index
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)


In [38]:
# Set up the reader with a rating scale from 1 to 6.
reader = Reader(rating_scale=(1, 6))

# Build the full training set from the train DataFrame using user_id, recipe_id, and rating columns.
df_train = Dataset.load_from_df(train, reader).build_full_trainset()

# Create a list of tuples containing user_id, recipe_id, and rating from the test DataFrame.
df_test = list(test.itertuples(index=False, name=None))

# Build the full training set from the randomly sampled merged_data_sample DataFrame using user_id, recipe_id, and rating columns.
df_full = Dataset.load_from_df(merged_data_sample[['user_id', 'recipe_id', 'rating']], reader).build_full_trainset()

In [39]:
# user-based
options = {'name':'cosine', 'user_based':True}
ub_cb = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# item-based
options = {'name':'cosine', 'user_based':False}
ib_cb = KNNBasic(k=15, min_k=5, sim_options=options, random_state=42)

# als
options = {"method": "als", "n_epochs": 50}
als = BaselineOnly(bsl_options=options)

# svd
svd_cb = SVD(n_factors=20, biased=False, random_state=42)

# clustering
clust = CoClustering(n_cltr_u=10, n_cltr_i=10, n_epochs=50, random_state=42)

models = {"UB_15":ub_cb, "IB_15":ib_cb, "ALS":als, "SVD_20":svd_cb, "Clust_10_10":clust}
model_metrics = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)
model_metrics.columns = list(models.keys())
model_metrics

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0,UB_15,IB_15,ALS,SVD_20,Clust_10_10
RMSE,0.975979,0.976012,0.968545,4.495053,1.417193
MAE,0.624018,0.624174,0.602281,4.266898,0.728809
Recall,1.0,1.0,1.0,0.075437,0.922475
Precision,0.95775,0.95775,0.95775,0.941368,0.960848
F1,0.978419,0.978419,0.978419,0.139681,0.94127
NDCG@5,0.981202,0.98156,0.95216,0.97635,0.961884


The best model among the options is the Alternating Least Squares (ALS) Collaborative Filtering (CF) model. Here's why:

- **Validation RMSE**: 0.968545
- **Validation MAE**: 0.602281
- **Test RMSE**: 0.966777
- **Test MAE**: 0.598303
- **Recall**: 1.000000
- **Precision**: 0.957750
- **F1 Score**: 0.978419
- **NDCG@5**: 0.952160

The ALS model exhibits the lowest validation RMSE and MAE, indicating superior accuracy in predicting user ratings during model validation. Furthermore, it demonstrates low RMSE and MAE on the test set, suggesting reliable performance in real-world scenarios. Additionally, the ALS model achieves perfect recall and high precision, along with a high F1 score, indicating excellent performance in retrieving relevant items while minimizing irrelevant ones. Moreover, the NDCG@5 score of 0.952160 reflects the model's ability to rank relevant items higher in the recommendation list, enhancing user satisfaction.

In [40]:
import pandas as pd
from surprise import Dataset, Reader, BaselineOnly, accuracy
from surprise.model_selection import train_test_split

# Define a reader
reader = Reader(rating_scale=(1, 6))  # Adjust the rating scale as needed

# Load data from DataFrame
data = Dataset.load_from_df(merged_data_sample[['user_id', 'recipe_id', 'rating']], reader)

# Split the data: 70% for training, 30% for testing
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Expanded hyperparameter grid for ALS
param_grid = {
    'bsl_options': {
        'method': ['als'],  # Focusing only on ALS
        'n_epochs': [5, 10, 15, 20],  # Different numbers of epochs to test
        'reg_u': [0.01, 0.1, 1, 10],  # Regularization terms for user biases
        'reg_i': [0.01, 0.1, 1, 10],  # Regularization terms for item biases
        'lr_bu': [0.001, 0.01, 0.1],  # Learning rates for user biases
        'lr_bi': [0.001, 0.01, 0.1]   # Learning rates for item biases
    }
}

# Function to evaluate hyperparameters
def evaluate_hyperparameters(trainset, testset, param_grid):
    best_params = None
    best_rmse = float('inf')

    for n_epochs in param_grid['bsl_options']['n_epochs']:
        for reg_u in param_grid['bsl_options']['reg_u']:
            for reg_i in param_grid['bsl_options']['reg_i']:
                for lr_bu in param_grid['bsl_options']['lr_bu']:
                    for lr_bi in param_grid['bsl_options']['lr_bi']:
                        bsl_options = {
                            'method': 'als',
                            'n_epochs': n_epochs,
                            'reg_u': reg_u,
                            'reg_i': reg_i,
                            'lr_bu': lr_bu,
                            'lr_bi': lr_bi
                        }
                        algo = BaselineOnly(bsl_options=bsl_options)
                        algo.fit(trainset)
                        predictions = algo.test(testset)
                        rmse = accuracy.rmse(predictions, verbose=False)
                        print(f"Testing n_epochs={n_epochs}, reg_u={reg_u}, reg_i={reg_i}, lr_bu={lr_bu}, lr_bi={lr_bi}, RMSE={rmse}")

                        if rmse < best_rmse:
                            best_rmse = rmse
                            best_params = {
                                'n_epochs': n_epochs, 'reg_u': reg_u, 'reg_i': reg_i,
                                'lr_bu': lr_bu, 'lr_bi': lr_bi
                            }

    return best_params, best_rmse

# Perform hyperparameter tuning
best_params, best_rmse = evaluate_hyperparameters(trainset, testset, param_grid)
print("Best Params:", best_params, "with RMSE:", best_rmse)

Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.001, lr_bi=0.001, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.001, lr_bi=0.01, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.001, lr_bi=0.1, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.01, lr_bi=0.001, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.01, lr_bi=0.01, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.01, lr_bi=0.1, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.1, lr_bi=0.001, RMSE=1.260162851538863
Estimating biases using als...
Testing n_epochs=5, reg_u=0.01, reg_i=0.01, lr_bu=0.1, lr_bi=0.01, RMSE=1.260162851538863
Estimating biases usin

The best model among the options is the BaselineOnly model with the following hyperparameters:

- **Number of Epochs (n_epochs)**: 5
- **User Regularization (reg_u)**: 10
- **Item Regularization (reg_i)**: 10
- **User Learning Rate (lr_bu)**: 0.001
- **Item Learning Rate (lr_bi)**: 0.001

Here's why:

- **Validation RMSE**: 1.0275558452874998

The BaselineOnly model with these hyperparameters achieves the lowest RMSE during validation, indicating superior accuracy in predicting user ratings. These hyperparameters seem to strike a balance between regularization and learning rates, resulting in optimal model performance. This model configuration is recommended for collaborative filtering tasks due to its ability to effectively learn user and item biases while minimizing overfitting.

##### ALS with hyperparameters tuned

In [41]:
# ALS with hyperparameters tuned
options = {'n_epochs': 20, 'reg_u': 10, 'reg_i': 10, 'lr_bu': 0.001, 'lr_bi': 0.001}
als_tune_20 = BaselineOnly(bsl_options=options)

# Define all models including the tuned ALS
models = {
    "UB_15": ub_cb,           # User-based CF with k=15
    "IB_15": ib_cb,           # Item-based CF with k=15
    "ALS": als,               # ALS with default parameters
    "SVD_20": svd_cb,         # SVD with 20 factors
    "Clust_10_10": clust,     # CoClustering with 10 clusters for users and items
    "ALS_Tuned_20": als_tune_20  # Tuned ALS with hyperparameters
}

# Evaluate all models and concatenate the results
model_metrics = pd.concat([eval.evaluate(mod.fit(df_train).test(df_test), topn=5, rating_cutoff=4) for mod in models.values()], axis=1)
model_metrics.columns = list(models.keys())

# Display the evaluation metrics for all models
model_metrics

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...


Unnamed: 0,UB_15,IB_15,ALS,SVD_20,Clust_10_10,ALS_Tuned_20
RMSE,0.975979,0.976012,0.968545,4.495053,1.417193,0.966777
MAE,0.624018,0.624174,0.602281,4.266898,0.728809,0.598303
Recall,1.0,1.0,1.0,0.075437,0.922475,1.0
Precision,0.95775,0.95775,0.95775,0.941368,0.960848,0.95775
F1,0.978419,0.978419,0.978419,0.139681,0.94127,0.978419
NDCG@5,0.981202,0.98156,0.95216,0.97635,0.961884,0.952108


The best model among the options is the Alternating Least Squares (ALS) Collaborative Filtering (CF) model after hyperparameter tuning. Here's why:

- **RMSE**: 0.966777
- **MAE**: 0.598303
- **Recall**: 1.000000
- **Precision**: 0.957750
- **F1 Score**: 0.978419
- **NDCG@5**: 0.952108

The ALS model after hyperparameter tuning exhibits the lowest RMSE and MAE, indicating superior accuracy in predicting user ratings. Additionally, it achieves perfect recall and high precision, along with a high F1 score, indicating excellent performance in retrieving relevant items while minimizing irrelevant ones. Moreover, the NDCG@5 score of 0.952108 reflects the model's ability to rank relevant items higher in the recommendation list, enhancing user satisfaction.

In [42]:
# Re-running libraries as it was throwing issue for running content based below
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import os
import numpy as np
from collections import Counter

from surprise import Dataset, Reader, KNNBasic, accuracy
from ieseg_recsys import eval, model
from sklearn.model_selection import train_test_split
from scipy import sparse
from surprise import SVD, CoClustering

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, KNNBasic, SVD

In [43]:
# Drop columns with dtype 'object'
basetable_obj_df = basetable_df.drop(basetable_df.select_dtypes(['object']), axis=1)

In [44]:
# Group by the index column ('recipe_id') and calculate the mean
basetable_df_grouped = basetable_obj_df.groupby('recipe_id').mean()

In [45]:
# Initialize the ContentBased model with NN=5
cb = model.ContentBased(NN=20)

# Fit the ContentBased model on the cleaned content data
cb.fit(basetable_df_grouped)

# Fit the ContentBased model on train_ratings
cb.fit_ratings(df_train)

# Predict test ratings using the ContentBased model
cb_pred = cb.test(df_test)

  self.prediction = (np.matmul(df_pivot.values, self.matrixNN) / denom) + self.user_avg[:,np.newaxis]


In [46]:
# compute metrics
cb_res = eval.evaluate(cb_pred, topn=5, rating_cutoff=4).rename(columns={'value':'Content_based_10'})
cb_res

Unnamed: 0,Content_based_10
RMSE,1.121898
MAE,0.580646
Recall,0.967894
Precision,0.960124
F1,0.963993
NDCG@5,0.980531


Content-Based model with the following performance metrics:

- **RMSE**: 1.121898
- **MAE**: 0.580646
- **Recall**: 0.967894
- **Precision**: 0.960124
- **F1 Score**: 0.963993
- **NDCG@5**: 0.980531

The Content-Based model demonstrates competitive performance, with relatively low RMSE and MAE values indicating reasonable accuracy in predicting user ratings. Additionally, it achieves high recall, precision, and F1 score, suggesting effective retrieval of relevant items while minimizing irrelevant ones. Moreover, the high NDCG@5 score of 0.980531 reflects the model's ability to rank relevant items higher in the recommendation list, enhancing user satisfaction.

In [47]:
model_metrics = pd.concat([model_metrics, cb_res], axis=1)
model_metrics

Unnamed: 0,UB_15,IB_15,ALS,SVD_20,Clust_10_10,ALS_Tuned_20,Content_based_10
RMSE,0.975979,0.976012,0.968545,4.495053,1.417193,0.966777,1.121898
MAE,0.624018,0.624174,0.602281,4.266898,0.728809,0.598303,0.580646
Recall,1.0,1.0,1.0,0.075437,0.922475,1.0,0.967894
Precision,0.95775,0.95775,0.95775,0.941368,0.960848,0.95775,0.960124
F1,0.978419,0.978419,0.978419,0.139681,0.94127,0.978419,0.963993
NDCG@5,0.981202,0.98156,0.95216,0.97635,0.961884,0.952108,0.980531


The best model among the options is the Alternating Least Squares (ALS) Collaborative Filtering (CF) model after hyperparameter tuning. Here's why:

- **RMSE**: 0.966777
- **MAE**: 0.598303
- **Recall**: 1.000000
- **Precision**: 0.957750
- **F1 Score**: 0.978419
- **NDCG@5**: 0.952108

The ALS model after hyperparameter tuning exhibits the lowest RMSE and MAE, indicating superior accuracy in predicting user ratings. Additionally, it achieves perfect recall and high precision, along with a high F1 score, indicating excellent performance in retrieving relevant items while minimizing irrelevant ones. Moreover, the NDCG@5 score of 0.952108 reflects the model's ability to rank relevant items higher in the recommendation list, enhancing user satisfaction.

##### Export the prediction result

In [48]:
def get_predicted_rating(model, user, item):
    _, _, _, est, _ = model.predict(user, item)
    return est

In [49]:
# Model dictionary with float
model_dict = {
     "ALS_v4":als,
     "ALS_Tuned_20_v4":als_tune_20 
}

# Iterate through each model
for model_name, model in model_dict.items():
    # Apply the model to get ratings
    submission_df[model_name + '_rating'] = submission_df.apply(lambda row: get_predicted_rating(model, row['user_id'], row['recipe_id']), axis=1)
    
    # Create the 'id' column by combining 'user_id' and 'recipe_id'
    submission_df['id'] = submission_df['user_id'] + '_' + submission_df['recipe_id']
    
    # Create a new DataFrame with the 'id' and model-specific rating column, then rename the rating column
    output_df = submission_df[['id', model_name + '_rating']].rename(columns={model_name + '_rating': 'rating'})
    
    # Save to CSV, using the model name to label each file
    output_file_name = model_name + '.csv'
    output_df.to_csv(output_file_name, index=False)
    print(f"Saved {output_file_name}")

Saved ALS_v4.csv
Saved ALS_Tuned_20_v4.csv
