# Recommender Model

In [9]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import scipy.sparse
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\annie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [11]:
# define a function to lemmatize an input string
def lemmatize_string(string):
    string_lower = string.lower()
    # tokenize the string into individual words
    tokens = word_tokenize(string_lower)
    # lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmatized_tokens)

# define a function to lemmatize an input list
def lemmatize_list(list):
    return [lemmatizer.lemmatize(item.lower()) for item in list]

In [12]:
# load precomputed tfidf matrix and vectorizer
tfidf_matrix = scipy.sparse.load_npz('models/tfidf_matrix.npz')
with open('models/tfidf_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)
data = pd.read_csv('Data/recipes_food_com_embeddedtexts.csv')

ParserError: Error tokenizing data. C error: out of memory

Test combined cleaned data

In [None]:
# ingredients based recommender
# Initialize NearestNeighbors model and fit on the tfidf_matrix
nearest_neighbors = NearestNeighbors(metric='cosine', algorithm='brute')
nearest_neighbors.fit(tfidf_matrix)

def recommend_by_ingredients(preferred_ingredients, top_n=5, excluded_ingredients=None):
    if excluded_ingredients is None:
        excluded_ingredients = []
    # if excluded_ingredients is a string, convert to list
    elif isinstance(excluded_ingredients, str):
        excluded_ingredients = excluded_ingredients.lower().split(', ')

    # preprocess input by lemmatizing
    lemmatized_ingredients = lemmatize_string(preferred_ingredients)

    # Transform the user's input ingredients into the vector space
    user_vector = vectorizer.transform([lemmatized_ingredients])
    
    # Find the top N nearest neighbors
    distances, indices = nearest_neighbors.kneighbors(user_vector, n_neighbors=top_n)
    
    # Retrieve recommended recipes and their similarity scores
    recommendations = data.iloc[indices[0]].copy()
    recommendations['IngredientSimilarity'] = 1 - distances[0]  # Similarity = 1 - distance (cosine)

    # filter out recipes containing excluded ingredients
    lemmatized_excluded_ingredients = lemmatize_list(excluded_ingredients)
    def contains_excluded(ingredients):
        for excluded in lemmatized_excluded_ingredients:
            if excluded in ingredients:
                return True
        return False
    
    recommendations = recommendations[~recommendations['NLP_Ingredients'].apply(contains_excluded)]
    
    return recommendations[['Name', 'Ingredient Similarity', 'IngredientsExtracted', 'NLP_Ingredients']]


In [None]:
def recommend_by_description(user_input, top_n=5):
    user_input_embedding = model.encode(user_input, convert_to_tensor=True)
    data['SentenceSimilarity'] = data['DescriptionEmbedding'].apply(
        lambda x: cosine_similarity([user_input_embedding], [x])[0][0]
    )
    recommendations = data.sort_values(by='SentenceSimilarity', ascending=False).head(top_n)
    return recommendations[['Name', 'SentenceSimilarity']]

In [None]:
# sentence similarity based recommender
# load the sentence-transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute embeddings for recipe descriptions
data['DescriptionEmbedding'] = list(model.encode(data['Name_Description'], convert_to_tensor=True))

In [None]:
def recommend_by_description(user_input, top_n=5):
    user_input_embedding = model.encode(user_input, convert_to_tensor=True)
    data['SentenceSimilarity'] = data['DescriptionEmbedding'].apply(
        lambda x: cosine_similarity([user_input_embedding], [x])[0][0]
    )
    recommendations = data.sort_values(by='SentenceSimilarity', ascending=False).head(top_n)
    return recommendations[['Name', 'SentenceSimilarity']]

In [None]:
ingredients_list = 'tomato bread beef carrot'

recommend(ingredients_list)

In [None]:
ingredients_list = 'tomato bread beef carrot'
excluded_ingredients = 'cheese'

recommend(ingredients_list, excluded_ingredients=excluded_ingredients)

In [None]:
ingredients_list = 'egg'
recommend(ingredients_list)

In [None]:
ingredients_list = 'Eggs'
recommend(ingredients_list)

In [None]:
ingredients_list = 'egg vanilla'
recommend(ingredients_list)

In [None]:
ingredients_list = 'eggs vanilla'
recommend(ingredients_list)

In [None]:
ingredients_list = 'Beef Tomato Egg Rice'
recommend(ingredients_list)

In [None]:
ingredients_list = 'beef tomato egg rice'
recommend(ingredients_list)

In [None]:
ingredients_list = 'chicken noodles chives onion'
excluded_ingredients = 'eggplant, cheese'
recommend(ingredients_list, excluded_ingredients=excluded_ingredients)

Save Model

In [None]:
# Save the nearest neighbors model
with open('models/nearest_neighbors_model.pkl', 'wb') as f:
    pickle.dump(nearest_neighbors, f)