In [1]:

import pandas as pd
import pickle

# Load the datasets
ingr_map = pd.read_pickle("ingr_map.pkl")
raw_recipes = pd.read_csv("RAW_recipes.csv")
raw_interactions = pd.read_csv("RAW_interactions.csv")
pp_users = pd.read_csv("PP_users.csv")
pp_recipes = pd.read_csv("PP_recipes.csv")
interactions_validation = pd.read_csv("interactions_validation.csv")
interactions_train = pd.read_csv("interactions_train.csv")
interactions_test = pd.read_csv("interactions_test.csv")


# Show some basic information about each dataset
datasets = {
    'ingr_map': ingr_map,
    'raw_recipes': raw_recipes,
    'raw_interactions': raw_interactions,
    'pp_users': pp_users,
    'pp_recipes': pp_recipes,
    'interactions_validation': interactions_validation,
    'interactions_train': interactions_train,
    'interactions_test': interactions_test
}

info_dict = {}

for name, dataset in datasets.items():
    info_dict[name] = {
        'Number of Rows': dataset.shape[0],
        'Number of Columns': dataset.shape[1],
        'Columns': ', '.join(dataset.columns)
    }

info_df = pd.DataFrame(info_dict).T
info_df


Unnamed: 0,Number of Rows,Number of Columns,Columns
ingr_map,11659,7,"raw_ingr, raw_words, processed, len_proc, repl..."
raw_recipes,231637,12,"name, id, minutes, contributor_id, submitted, ..."
raw_interactions,1132367,5,"user_id, recipe_id, date, rating, review"
pp_users,25076,6,"u, techniques, items, n_items, ratings, n_ratings"
pp_recipes,178265,8,"id, i, name_tokens, ingredient_tokens, steps_t..."
interactions_validation,7023,6,"user_id, recipe_id, date, rating, u, i"
interactions_train,698901,6,"user_id, recipe_id, date, rating, u, i"
interactions_test,12455,6,"user_id, recipe_id, date, rating, u, i"


In [2]:
raw_recipes_df = pd.read_csv('RAW_recipes.csv')
raw_interactions_df = pd.read_csv("RAW_interactions.csv")

# SVD collaborative filtering

In [3]:
data = raw_interactions_df[['user_id', 'recipe_id', 'rating']]

In [4]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data, reader)
model = SVD()

cross_validation_results = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print(cross_validation_results)



Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2223  1.2182  1.2155  1.2279  1.2180  1.2204  0.0043  
MAE (testset)     0.7406  0.7403  0.7360  0.7428  0.7393  0.7398  0.0022  
Fit time          12.46   12.83   12.60   13.03   13.45   12.87   0.35    
Test time         1.70    1.35    1.30    1.44    1.87    1.53    0.22    
{'test_rmse': array([1.22231115, 1.21816206, 1.21553553, 1.22786698, 1.21796324]), 'test_mae': array([0.74062302, 0.74025291, 0.73601485, 0.7428266 , 0.73925335]), 'fit_time': (12.46197509765625, 12.825676918029785, 12.602699995040894, 13.030407190322876, 13.449766159057617), 'test_time': (1.7019448280334473, 1.3513116836547852, 1.3037898540496826, 1.4356706142425537, 1.8686282634735107)}


In [5]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# Split the data into training and test set (e.g., 75% training, 25% testing)
trainset, testset = train_test_split(data, test_size=0.25)

# Train the model on the training set
model = SVD()
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Compute and print the accuracy metrics
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)


RMSE: 1.2227
MAE:  0.7408


# Hyperparameter tuning for SVD collaborative filtering

In [7]:
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise import Dataset, Reader

# Define the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.02, 0.1]
}

# Setup grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)

# Load your dataset
# Define the reader with the rating scale
reader = Reader(rating_scale=(1, 5))
# Load the dataset
data = Dataset.load_from_df(raw_interactions_df[['user_id', 'recipe_id', 'rating']], reader)

# Run grid search
gs.fit(data)

# Best RMSE score
print("Best RMSE score: ", gs.best_score['rmse'])

# Combination of parameters that gave the best RMSE score
print("Best parameters: ", gs.best_params['rmse'])


Best RMSE score:  1.2161208333623452
Best parameters:  {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.1}


# Using the Best Parameters to Train the SVD collaborative filtering Model

In [9]:
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Setup the SVD model with the best parameters
model = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.1)

# Split your dataset into train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# Train the model on the trainset
model.fit(trainset)

# Make predictions on the testset
predictions = model.test(testset)

# Calculate and print the RMSE on the test set
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.2182
MAE:  0.7430


# Content-based Reccomendation Model

In [35]:
# Get Tags for Content-based features
import ast

recipe_data = pd.read_csv("RAW_recipes.csv")

# Parsing the tags from string representation of list to actual list
recipe_data['tags'] = recipe_data['tags'].apply(ast.literal_eval)

# Exploring the unique tags and their frequencies
all_tags = [tag for sublist in recipe_data['tags'] for tag in sublist]
unique_tags = set(all_tags)
tag_frequency = pd.Series(all_tags).value_counts()

num_unique_tags = len(unique_tags)
selected_indices = [4,5,8, 11] + list(range(13, 61))
selected_tags = tag_frequency.iloc[selected_indices]


num_unique_tags, selected_tags


(552,
 dietary                  165091
 easy                     126062
 low-in-something          85776
 60-minutes-or-less        69990
 meat                      56042
 30-minutes-or-less        55077
 vegetables                53814
 taste-mood                52143
 4-hours-or-less           49497
 north-american            48479
 3-steps-or-less           44933
 15-minutes-or-less        43934
 low-sodium                43349
 desserts                  43203
 low-carb                  42189
 healthy                   40340
 dinner-party              37561
 low-cholesterol           36743
 low-calorie               36429
 vegetarian                35651
 beginner-cook             35561
 5-ingredients-or-less     35466
 holiday-event             34920
 inexpensive               32619
 low-protein               32522
 low-saturated-fat         31378
 fruit                     31324
 oven                      31180
 american                  31179
 eggs-dairy                30142
 pas

In [36]:
# One-hot encoding tags to speed up computation
top_tags = selected_tags.index.tolist()

# Initializing columns for top tags with default value 0
for tag in top_tags:
    recipe_data[f'tag_{tag}'] = 0

# Setting the value to 1 if the recipe contains the tag
for index, row in recipe_data.iterrows():
    for tag in top_tags:
        if tag in row['tags']:
            recipe_data.at[index, f'tag_{tag}'] = 1


Unnamed: 0,name,id,tag_dietary,tag_easy,tag_low-in-something,tag_60-minutes-or-less,tag_meat,tag_30-minutes-or-less,tag_vegetables,tag_taste-mood,...,tag_seasonal,tag_weeknight,tag_chicken,tag_appetizers,tag_brunch,tag_to-go,tag_for-large-groups,tag_beef,tag_one-dish-meal,tag_cheese
0,arriba baked winter squash mexican style,137739,1,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,a bit different breakfast pizza,31490,1,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,all in the kitchen chili,112140,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,alouette potatoes,59389,1,1,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
4,amish tomato ketchup for canning,44061,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [31]:
# Merging the user ratings data (interactions_data) with the one-hot encoded tags from recipe_data
# The merging key will be 'recipe_id'

interactions_data = pd.read_csv("RAW_interactions.csv")

# Selecting relevant columns from recipe_data (recipe_id and one-hot encoded tags)
recipe_tags_data = recipe_data[['id'] + [col for col in recipe_data.columns if col.startswith('tag_')]]

# Renaming 'id' column to 'recipe_id' for consistency
recipe_tags_data.rename(columns={'id': 'recipe_id'}, inplace=True)

# Merging the datasets
merged_data = interactions_data.merge(recipe_tags_data, how='left', on='recipe_id')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recipe_tags_data.rename(columns={'id': 'recipe_id'}, inplace=True)


Unnamed: 0,user_id,recipe_id,date,rating,review,tag_low-in-something,tag_60-minutes-or-less,tag_meat,tag_30-minutes-or-less,tag_vegetables,...,tag_seasonal,tag_weeknight,tag_chicken,tag_appetizers,tag_brunch,tag_to-go,tag_for-large-groups,tag_beef,tag_one-dish-meal,tag_cheese
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,0,0,0,1,0,...,1,0,0,0,0,1,1,0,0,0
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Filling missing values in tag columns with zeros
merged_data.fillna({col: 0 for col in merged_data.columns if col.startswith('tag_')}, inplace=True)

# Checking the first few rows of the updated merged dataset
merged_data.head()


Unnamed: 0,user_id,recipe_id,date,rating,review,tag_low-in-something,tag_60-minutes-or-less,tag_meat,tag_30-minutes-or-less,tag_vegetables,...,tag_seasonal,tag_weeknight,tag_chicken,tag_appetizers,tag_brunch,tag_to-go,tag_for-large-groups,tag_beef,tag_one-dish-meal,tag_cheese
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall...",0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...,0,0,0,1,0,...,1,0,0,0,0,1,1,0,0,0
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin...",0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# Hyperparameter Tuning for Content-based Model

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Prepare the dataset for content-based model
X = merged_data.drop(columns=['user_id', 'recipe_id', 'rating', 'review', 'date'])
y = merged_data['rating']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a simple model like RandomForest
content_model = RandomForestRegressor()
content_model.fit(X_train, y_train)

# Make predictions on the test set
predictions_content = content_model.predict(X_test)


In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)


Fitting 3 folds for each of 36 candidates, totalling 108 fits




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 7.1min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=10.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=51.4min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=118.4min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=10.9min




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 7.1min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=10.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=51.4min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=84.3min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=46.0min




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.5min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 3.6min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 3.4min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=58.0min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=84.3min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=46.0min




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.5min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 7.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 6.7min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=152.6min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=19.3min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=10.8min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 3.5min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 3.6min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=10.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total ti



[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=10.5min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 6.6min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=101.9min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=72.2min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=16.5min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.4min




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 3.5min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 7.0min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 6.6min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=101.9min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=77.6min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 5.3min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=16.4min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=12.2min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=12.1min




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=10.5min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time= 3.3min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 3.3min
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=101.8min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=66.8min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time= 5.4min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=16.5min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time= 5.4min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=15.9min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total ti

In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Create a new model with the best parameters
optimized_rf = RandomForestRegressor(max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300)

# Retrain the model on the entire training set
optimized_rf.fit(X_train, y_train)

# Predict on the test set
y_pred = optimized_rf.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)


MSE: 1.6006633320865211
RMSE: 1.2651732419263857
MAE: 0.8456339830113258
