# Recommender Model

In [167]:
# install the faiss-cpu package into the kernel's environment; `faiss` is imported in the next cell
%pip install faiss-cpu

Note: you may need to restart the kernel to use updated packages.


In [168]:
import pandas as pd
import numpy as np
import faiss
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\annie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [169]:
# initialize the WordNet lemmatizer once; lemmatize_string and lemmatize_list below both reuse this instance
lemmatizer = WordNetLemmatizer()

In [170]:
# lemmatize a free-text string word by word
def lemmatize_string(string):
    """Return ``string`` lower-cased, tokenized and lemmatized.

    The text is lower-cased, split into tokens with ``word_tokenize``, each
    token is passed through the shared ``lemmatizer``, and the results are
    joined back together with single spaces.
    """
    words = word_tokenize(string.lower())
    return ' '.join(map(lemmatizer.lemmatize, words))

# lemmatize every entry of a list of strings
def lemmatize_list(list):
    """Return a new list with each entry of ``list`` lower-cased and lemmatized.

    Each entry is handed to the shared ``lemmatizer`` as a whole (it is not
    tokenized first), so the output has exactly one item per input item.
    """
    lemmas = []
    for entry in list:
        lemmas.append(lemmatizer.lemmatize(entry.lower()))
    return lemmas

In [171]:
# load precomputed tfidf matrix and vectorizer
tfidf_matrix = scipy.sparse.load_npz('models/tfidf_matrix.npz')
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# artifacts that were produced by this project
with open('models/tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
df = pd.read_csv('Data/recipes_food_com_combinedinfo.csv')

# the recommenders translate neighbor positions in tfidf_matrix back to recipes
# with df.iloc, so the matrix and the dataframe must have one row per recipe
assert tfidf_matrix.shape[0] == len(df), (
    f"tfidf_matrix has {tfidf_matrix.shape[0]} rows but df has {len(df)} rows"
)

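## Ingredients-based recommender

Ranks recipes by cosine similarity between the TF-IDF vector of the user's ingredients and the precomputed TF-IDF matrix.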

In [172]:
# Initialize NearestNeighbors model and fit on the tfidf_matrix
# (brute-force search with cosine distance; similarity is later recovered as 1 - distance)
nearest_neighbors = NearestNeighbors(metric='cosine', algorithm='brute')
nearest_neighbors.fit(tfidf_matrix)

In [237]:
# ingredients based recommender
def recommend_by_ingredients(ingredients_list, excluded_ingredients=None):
    """Rank every recipe in ``df`` by TF-IDF cosine similarity to the given ingredients.

    Parameters
    ----------
    ingredients_list : str
        Free-text ingredients, e.g. ``'tomato basil pasta'``. The text is
        lemmatized and projected into the fitted TF-IDF space.
    excluded_ingredients : str or iterable of str, optional
        Ingredients that must not appear in a recommended recipe. A string is
        split on commas; surrounding whitespace and blank entries are ignored.
        A recipe is dropped when any lemmatized excluded ingredient occurs as a
        substring of its ``NLP_Ingredients`` value.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Name``, ``IngredientSimilarity``,
        ``IngredientsExtracted`` and ``NLP_Ingredients``, in the order returned
        by the nearest-neighbors search (closest first), with a fresh 0..n-1 index.
    """
    if excluded_ingredients is None:
        excluded_ingredients = []
    # if excluded_ingredients is a string, convert to list
    elif isinstance(excluded_ingredients, str):
        excluded_ingredients = excluded_ingredients.split(',')

    # Drop blank entries: '' is a substring of every string, so a blank
    # exclusion (e.g. from excluded_ingredients='') would filter out every recipe.
    excluded_ingredients = [item.strip() for item in excluded_ingredients if item and item.strip()]

    # preprocess input by lemmatizing
    lemmatized_ingredients = lemmatize_string(ingredients_list)

    # Transform the user's input ingredients into the vector space
    user_vector = vectorizer.transform([lemmatized_ingredients])

    # Rank all recipes, not just a top N, so callers can filter or merge the full list
    distances, indices = nearest_neighbors.kneighbors(user_vector, n_neighbors=df.shape[0])

    # Retrieve recommended recipes and their similarity scores
    recommendations = df.iloc[indices[0]].copy()
    recommendations['IngredientSimilarity'] = 1 - distances[0]  # Similarity = 1 - distance (cosine)

    # filter out recipes containing excluded ingredients
    lemmatized_excluded_ingredients = lemmatize_list(excluded_ingredients)
    if lemmatized_excluded_ingredients:
        def contains_excluded(ingredients):
            try:
                return any(excluded in ingredients for excluded in lemmatized_excluded_ingredients)
            except TypeError:
                # missing values (NaN) have no ingredients to match against
                return False

        is_excluded = recommendations['NLP_Ingredients'].apply(contains_excluded).astype(bool)
        recommendations = recommendations[~is_excluded]

    recommendations = recommendations.reset_index(drop=True)

    return recommendations[['ID','Name', 'IngredientSimilarity', 'IngredientsExtracted', 'NLP_Ingredients']]


In [238]:
# example: rank all recipes against a free-text ingredient query (no exclusions)
ingredients = 'tomato basil pasta'

recommend_by_ingredients(ingredients)

Unnamed: 0,ID,Name,IngredientSimilarity,IngredientsExtracted,NLP_Ingredients
0,52849,La Pasta Di Casamicciola (Donkey House Pasta),0.716905,"('tomatoes', 'lemon juice', 'olive oil', 'salt...","('fresh basil', 'lemon juice', 'olive oil', 'p..."
1,218905,Mom's Skillet Goulash,0.712875,"('pasta', 'diced tomatoes', 'hamburger', 'toma...","('basil', 'hamburger', 'pasta', 'tomato', 'tom..."
2,350058,Bruschetta Pasta,0.689191,"('olive oil', 'tomatoes', 'onion', 'fresh basi...","('black pepper', 'fresh basil', 'garlic clove'..."
3,14360,Alla Checca,0.676579,"('tomatoes', 'garlic cloves', 'fresh basil', '...","('fresh basil', 'garlic clove', 'olive oil', '..."
4,182256,Cheese Tostados,0.675240,"('tortilla', 'tomato and basil pasta sauce', '...","('cheese', 'chili', 'salt', 'tomato basil past..."
...,...,...,...,...,...
494944,91562,Herbed Pasta Salad,0.000000,"('of fresh mint', 'fresh dill', 'parsley', 'sc...","('buttermilk', 'corn', 'fresh dill', 'mayonnai..."
494945,85714,Favorite Apple Pie,0.000000,"('unbaked pie shells', 'granulated sugar', 'br...","('allpurpose flour', 'brown sugar', 'butter', ..."
494946,336664,Cheesy Salsa Potatoes,0.000000,"('water', 'milk', 'margarine', 'scalloped pota...","('cheddar cheese', 'fresh cilantro', 'green ch..."
494947,326589,Vegan Fruit and Dark Chocolate Bars,0.000000,"('raisins', 'dates', 'prune', 'nuts', 'dark ch...","('dark chocolate', 'date', 'nut', 'prune', 'ra..."


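## Description-based recommender

Ranks recipes by cosine similarity between a sentence embedding of the user's query and the precomputed recipe embeddings.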

In [174]:
# initialize sentence transformer model (used to embed user queries at recommendation time)
model = SentenceTransformer('all-MiniLM-L6-v2')

# load precomputed embeddings
# NOTE(review): presumably one row per recipe, in the same order as df, produced by the same model — confirm
embeddings = np.load('Data/recipe_embeddings.npy')

In [175]:
# build FAISS index
dimension = embeddings.shape[1]
# inner product = cosine similarity for normalized vectors
index = faiss.IndexFlatIP(dimension)
# normalize embeddings for cosine similarity
# (the return value is not used: the cell relies on normalize_L2 modifying `embeddings` in place)
faiss.normalize_L2(embeddings)
# add recipe embeddings to index
index.add(embeddings)

In [239]:
def recommend_by_description(user_query):
    """Rank every recipe in ``df`` by embedding similarity to a free-text query.

    Parameters
    ----------
    user_query : str
        Natural-language description of the desired recipe,
        e.g. ``'easy chicken pasta'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``Name``, ``DescriptionSimilarity`` and ``CombinedInfo``,
        in the order returned by the index search, with a fresh 0..n-1 index.
        ``DescriptionSimilarity`` is the inner product of the L2-normalized
        query and recipe embeddings, i.e. their cosine similarity.
    """
    # encode the query
    query_embedding = model.encode(user_query, convert_to_tensor=False)

    # Build the single-row float32 query matrix once. normalize_L2 works in
    # place, so the array that is normalized must be the same array that is
    # searched; normalizing a temporary copy would leave the query unnormalized.
    query_matrix = np.ascontiguousarray([query_embedding], dtype=np.float32)
    faiss.normalize_L2(query_matrix)

    # query the faiss index for the full ranking
    similarities, indices = index.search(query_matrix, df.shape[0])

    # retrieve recommended rows
    recommendations = df.iloc[indices[0]].copy().reset_index(drop=True)

    # add similarity scores from faiss to the dataframe
    recommendations['DescriptionSimilarity'] = similarities[0]

    return recommendations[['ID','Name','DescriptionSimilarity','CombinedInfo']]

In [240]:
# example: rank all recipes against a free-text description query
query = 'easy chicken pasta'
recommend_by_description(query)

Unnamed: 0,ID,Name,DescriptionSimilarity,CombinedInfo
0,445385,Easy Chicken & Pasta,0.840905,"Easy Chicken & Pasta | Tags: occasion, main-di..."
1,453416,Easy Chicken and Broccoli Pasta,0.818596,Easy Chicken and Broccoli Pasta | Tags: dietar...
2,70352,Easy Chicken and Pepper Pasta,0.807572,Easy Chicken and Pepper Pasta | Tags: dinner-p...
3,170912,Easy Cheesy Chicken With Pasta,0.803549,Easy Cheesy Chicken With Pasta | Tags: dinner-...
4,111447,Easy Chicken Spaghetti,0.802921,"Easy Chicken Spaghetti | Tags: spaghetti, past..."
...,...,...,...,...
494944,487294,Ruth's Caramels,-0.048094,"Ruth's Caramels | Tags: snacks, sweet, occasio..."
494945,474833,Sugar Diamonds,-0.054166,"Sugar Diamonds | Tags: equipment, occasion, su..."
494946,396902,Cherries in the Snow,-0.055827,"Cherries in the Snow | Tags: dietary, occasion..."
494947,201124,Chocolate-Covered Bourbon Cherries,-0.057735,Chocolate-Covered Bourbon Cherries | Tags: che...


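## Combined recommender

Blends the ingredient and description similarity scores into a single weighted score; if only one input is given, it falls back to the matching single recommender.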

In [249]:
def recommend_combined(ingredients_list=None, user_query=None, ingredients_weight=0.5, description_weight=0.5, excluded_ingredients=None, top_n=5):
    """Recommend recipes from ingredients, a description, or a weighted blend of both.

    Parameters
    ----------
    ingredients_list : str, optional
        Free-text ingredients passed to ``recommend_by_ingredients``.
    user_query : str, optional
        Free-text description passed to ``recommend_by_description``.
    ingredients_weight, description_weight : float
        Weights of the two min-max scaled similarity scores when both inputs
        are given. They must add up to 1 (within floating-point tolerance).
    excluded_ingredients : str or list of str, optional
        Forwarded to ``recommend_by_ingredients``. It has no effect when only
        ``user_query`` is given.
    top_n : int
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or None
        The ``top_n`` rows of the single recommender when only one input is
        given; otherwise the ``top_n`` rows by ``WeightedScore``. ``None`` (after
        printing a message) when no input is given or the weights do not add
        up to 1.
    """
    if ingredients_list is None and user_query is None:
        print("Please provide ingredients_list, user_query, or both.")
        return None

    if user_query is None:
        return recommend_by_ingredients(ingredients_list=ingredients_list, excluded_ingredients=excluded_ingredients).head(top_n)

    if ingredients_list is None:
        return recommend_by_description(user_query=user_query).head(top_n)

    # Compare with a tolerance: an exact test rejects valid inputs such as
    # 0.7 + 0.3, which evaluates to 0.9999999999999999 in floating point.
    if not np.isclose(ingredients_weight + description_weight, 1.0):
        print("Please make sure ingredients_weight and description_weight add up to 1.")
        return None

    output_columns = ['ID', 'Name', 'WeightedScore', 'IngredientSimilarity', 'DescriptionSimilarity','NLP_Ingredients','CombinedInfo']

    # get recommendations from both models
    ingredient_recs = recommend_by_ingredients(ingredients_list=ingredients_list, excluded_ingredients=excluded_ingredients)
    description_recs = recommend_by_description(user_query)

    # Inner merge: a recipe removed by the exclusion filter is absent from
    # ingredient_recs, and an outer merge would bring it back with NaN scores.
    combined_recs = pd.merge(ingredient_recs, description_recs, on=['ID','Name'], how='inner')

    # nothing left to rank (e.g. every recipe was excluded); the scaler cannot fit on zero rows
    if combined_recs.empty:
        return combined_recs.reindex(columns=output_columns)

    # normalize scores to make sure they are on the same scale
    score_columns = ['IngredientSimilarity', 'DescriptionSimilarity']
    scaler = MinMaxScaler()
    combined_recs[score_columns] = scaler.fit_transform(combined_recs[score_columns])

    # combine the scores
    combined_recs['WeightedScore'] = (combined_recs['IngredientSimilarity'] * ingredients_weight) + (combined_recs['DescriptionSimilarity'] * description_weight)

    # sort by combined score and return top recommendations
    combined_recs = combined_recs.sort_values(by='WeightedScore', ascending=False).head(top_n).reset_index(drop=True)

    return combined_recs[output_columns]

In [252]:
# ingredients only: falls back to the ingredients-based recommender (default top_n=5)
ingredients_list = 'pasta basil tomato cheese'

recommend_combined(ingredients_list=ingredients_list)


Unnamed: 0,ID,Name,IngredientSimilarity,IngredientsExtracted,NLP_Ingredients
0,14360,Alla Checca,0.716976,"('tomatoes', 'garlic cloves', 'fresh basil', '...","('fresh basil', 'garlic clove', 'olive oil', '..."
1,182256,Cheese Tostados,0.715557,"('tortilla', 'tomato and basil pasta sauce', '...","('cheese', 'chili', 'salt', 'tomato basil past..."
2,432546,Eggplant Baked With Tomato,0.691293,"('eggplants', 'olive oil', 'mozzarella cheese'...","('eggplant', 'fresh basil', 'mozzarella cheese..."
3,441472,Italian Red Sauce,0.680544,"('olive oil', 'onion', 'kosher salt', 'black p...","('black pepper', 'cheese', 'fresh basil', 'kos..."
4,52849,La Pasta Di Casamicciola (Donkey House Pasta),0.676511,"('tomatoes', 'lemon juice', 'olive oil', 'salt...","('fresh basil', 'lemon juice', 'olive oil', 'p..."


In [254]:
# description only: falls back to the description-based recommender (default top_n=5)
user_query = 'easy dinner italian'

recommend_combined(user_query=user_query)

Unnamed: 0,ID,Name,DescriptionSimilarity,CombinedInfo
0,165594,Easy Italian Supper,0.79656,"Easy Italian Supper | Tags: main-dish, course"
1,251206,Easy Italian Stew,0.736756,"Easy Italian Stew | Tags: occasion, 30-minutes..."
2,250677,Italian Rollups,0.728983,"Italian Rollups | Tags: dinner-party, to-go, o..."
3,330423,Easy Italian Casserole,0.725602,"Easy Italian Casserole | Tags: beef, pasta-ric..."
4,305079,Easy Italian Pasta Casserole,0.718995,"Easy Italian Pasta Casserole | Tags: beef, pas..."


In [256]:
# both ingredients and description: scores are blended with the default 0.5 / 0.5 weights
ingredients_list = 'pasta basil tomato cheese'
user_query = 'easy dinner italian'

recommend_combined(ingredients_list=ingredients_list, user_query=user_query)

Unnamed: 0,ID,Name,WeightedScore,IngredientSimilarity,DescriptionSimilarity,NLP_Ingredients,CombinedInfo
0,441472,Italian Red Sauce,0.821185,0.949186,0.693184,"('black pepper', 'cheese', 'fresh basil', 'kos...",Italian Red Sauce | Tags: pasta-rice-and-grain...
1,14360,Alla Checca,0.813382,1.0,0.626763,"('fresh basil', 'garlic clove', 'olive oil', '...","Alla Checca | Tags: equipment, pasta-rice-and-..."
2,52849,La Pasta Di Casamicciola (Donkey House Pasta),0.809075,0.943562,0.674589,"('fresh basil', 'lemon juice', 'olive oil', 'p...",La Pasta Di Casamicciola (Donkey House Pasta) ...
3,51517,Pasta with Oil and Garlic Sauce,0.807977,0.899758,0.716196,"('basil', 'garlic', 'olive oil', 'parmesan che...",Pasta with Oil and Garlic Sauce | Tags: dinner...
4,182256,Cheese Tostados,0.799673,0.998021,0.601325,"('cheese', 'chili', 'salt', 'tomato basil past...","Cheese Tostados | Tags: dinner-party, brunch, ..."


In [None]:
# ingredients_weight and description_weight do not add up to 1:
# recommend_combined prints a message instead of returning recommendations
ingredients_list = 'pasta basil tomato cheese'
user_query = 'easy dinner italian'
ingredients_weight = 0.7
description_weight = 0.5

recommend_combined(ingredients_list=ingredients_list, user_query=user_query, ingredients_weight=ingredients_weight, description_weight=description_weight)

Please make sure ingredients_weight and description_weight add up to 1.


## Save models

Persist the fitted models to the `models/` directory.

In [180]:
# Save the fitted nearest neighbors model (the ingredients-based recommender's search structure)
with open('models/nearest_neighbors_model.pkl', 'wb') as f:
    pickle.dump(nearest_neighbors, f)

In [None]:
# NOTE(review): despite the file name, this pickles `model`, which is the
# SentenceTransformer created earlier in this notebook; the FAISS index is the
# separate variable `index`. Confirm which object the consumer of
# 'models/faiss_model.pkl' expects before relying on this artifact.
with open('models/faiss_model.pkl', 'wb') as f:
    pickle.dump(model, f)