In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Load the data set
recipes = pd.read_csv('RAW_recipes.csv')
row_data = recipes[recipes['id'] == 110548]

# Display the row
print(row_data)

                                         name      id  minutes  \
188361  skylike chili   skyline chili copycat  110548      105   

        contributor_id   submitted  \
188361          166475  2005-02-07   

                                                     tags  \
188361  ['time-to-make', 'course', 'main-ingredient', ...   

                                         nutrition  n_steps  \
188361  [241.5, 23.0, 14.0, 33.0, 38.0, 29.0, 2.0]        7   

                                                    steps  \
188361  ['brown ground beef and onion', 'drain', 'add ...   

                                              description  \
188361  a copycat of skyline chili, true cincinnati ch...   

                                              ingredients  n_ingredients  
188361  ['ground beef', 'onions', 'beef stock', 'tomat...             19  


In [3]:
# Check for null values
recipes.isnull().sum()

name                 1
id                   0
minutes              0
contributor_id       0
submitted            0
tags                 0
nutrition            0
n_steps              0
steps                0
description       4979
ingredients          0
n_ingredients        0
dtype: int64

In [4]:
# Describe the data set
recipes.describe()

Unnamed: 0,id,minutes,contributor_id,n_steps,n_ingredients
count,231637.0,231637.0,231637.0,231637.0,231637.0
mean,222014.708984,9398.546,5534885.0,9.765499,9.051153
std,141206.635626,4461963.0,99791410.0,5.995128,3.734796
min,38.0,0.0,27.0,0.0,1.0
25%,99944.0,20.0,56905.0,6.0,6.0
50%,207249.0,40.0,173614.0,9.0,9.0
75%,333816.0,65.0,398275.0,12.0,11.0
max,537716.0,2147484000.0,2002290000.0,145.0,43.0


In [5]:
# Check movies info
recipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [6]:
# check columns
recipes.columns

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')

In [7]:
# select features needed for content-based filtering, based on the movie title.
recipes = recipes[['name','id', 'minutes','tags', 'nutrition','steps','description','ingredients','n_ingredients']]
recipes

Unnamed: 0,name,id,minutes,tags,nutrition,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]","['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]","['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]","['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8
...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]","['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22
231633,zydeco spice mix,493372,5,"['15-minutes-or-less', 'time-to-make', 'course...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13
231634,zydeco ya ya deviled eggs,308080,40,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]","['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8
231635,cookies by design cookies on a stick,298512,29,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10


In [8]:
recipes['tags_cleaned'] = recipes['tags'].apply(lambda x: " ".join(eval(x)))
recipes['text_data'] = (recipes['tags_cleaned'] + " " + recipes['description'] + " " + recipes['ingredients'].apply(lambda x: " ".join(eval(x))))
recipes['text_data'] = recipes['text_data'].fillna("")
recipes

Unnamed: 0,name,id,minutes,tags,nutrition,steps,description,ingredients,n_ingredients,tags_cleaned,text_data
0,arriba baked winter squash mexican style,137739,55,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]","['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,60-minutes-or-less time-to-make course main-in...,60-minutes-or-less time-to-make course main-in...
1,a bit different breakfast pizza,31490,30,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]","['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,30-minutes-or-less time-to-make course main-in...,30-minutes-or-less time-to-make course main-in...
2,all in the kitchen chili,112140,130,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]","['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13,time-to-make course preparation main-dish chil...,time-to-make course preparation main-dish chil...
3,alouette potatoes,59389,45,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11,60-minutes-or-less time-to-make course main-in...,60-minutes-or-less time-to-make course main-in...
4,amish tomato ketchup for canning,44061,190,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8,weeknight time-to-make course main-ingredient ...,weeknight time-to-make course main-ingredient ...
...,...,...,...,...,...,...,...,...,...,...,...
231632,zydeco soup,486161,60,"['ham', '60-minutes-or-less', 'time-to-make', ...","[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]","['heat oil in a 4-quart dutch oven', 'add cele...",this is a delicious soup that i originally fou...,"['celery', 'onion', 'green sweet pepper', 'gar...",22,ham 60-minutes-or-less time-to-make course mai...,ham 60-minutes-or-less time-to-make course mai...
231633,zydeco spice mix,493372,5,"['15-minutes-or-less', 'time-to-make', 'course...","[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]",['mix all ingredients together thoroughly'],this spice mix will make your taste buds dance!,"['paprika', 'salt', 'garlic powder', 'onion po...",13,15-minutes-or-less time-to-make course prepara...,15-minutes-or-less time-to-make course prepara...
231634,zydeco ya ya deviled eggs,308080,40,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]","['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style","['hard-cooked eggs', 'mayonnaise', 'dijon must...",8,60-minutes-or-less time-to-make course main-in...,60-minutes-or-less time-to-make course main-in...
231635,cookies by design cookies on a stick,298512,29,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...","['butter', 'eagle brand condensed milk', 'ligh...",10,30-minutes-or-less time-to-make course prepara...,30-minutes-or-less time-to-make course prepara...


In [9]:
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer

In [10]:
tfidf_matrix = vectorizer.fit_transform(recipes['text_data'])
tfidf_matrix.shape

(231637, 69376)

In [11]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20)  # Reduce dimensions further
reduced_matrix = svd.fit_transform(tfidf_matrix)

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import heapq

def compute_top_n_similarities(matrix, top_n=5):
    n = matrix.shape[0]
    top_similarities = {}

    for i in range(n):
        # Compute similarity for the current document
        row_sim = cosine_similarity(matrix[i:i+1], matrix).flatten()
        # Get top-N similar indices and scores (excluding itself)
        top_indices = heapq.nlargest(top_n + 1, range(len(row_sim)), key=row_sim.__getitem__)
        top_similarities[i] = [(idx, row_sim[idx]) for idx in top_indices if idx != i]

    return top_similarities

# Compute top-N similarities
top_n_similarities = compute_top_n_similarities(reduced_matrix, top_n=5)


KeyboardInterrupt: 

In [None]:
top_n_similarities

NameError: name 'top_n_similarities' is not defined

In [14]:
# Save the cosine similarity matrix
with open('cosine_similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim_matrix, f)

NameError: name 'cosine_sim_matrix' is not defined

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# User preferences as a query
preferences = ["vegetarian", "Italian", "spicy"] 
user_query = " ".join(preferences)  
# user_query = "vegetarian Italian spicy"  
user_vector = vectorizer.transform([user_query])

# Compute similarity incrementally
def compute_similarity_incrementally(user_vector, tfidf_matrix, batch_size=1000):
    similarities = []
    for i in range(0, tfidf_matrix.shape[0], batch_size):
        batch = tfidf_matrix[i:i + batch_size]
        batch_similarity = cosine_similarity(user_vector, batch).flatten()
        similarities.extend(batch_similarity)
    return similarities

# Incremental computation of similarity
similarity_scores = compute_similarity_incrementally(user_vector, tfidf_matrix)

# Rank recipes by similarity
top_indices = sorted(range(len(similarity_scores)), key=lambda i: similarity_scores[i], reverse=True)[:10]

# Retrieve top recommendations
recommended_recipes = recipes.iloc[top_indices]

# Display recommendations
for _, row in recommended_recipes.iterrows():
    print(f"RECIPE NAME: {row['name'].upper()}")
    print(f"Description: {row['description']}")
    print(f"Ingredients: {row['ingredients']}")
    print(f"Steps: {row['steps']}")
    print("-" * 50)


RECIPE NAME: RIGATONI WITH CHEESE AND ITALIAN SAUSAGE
Description: delicious! if you like spicy food, try it with spicy italian sausage and spicy marinara sauce!
Ingredients: ['rigatoni pasta', 'italian sausage', 'garlic cloves', 'marinara sauce', 'crushed red pepper flakes', 'mozzarella cheese', 'parmesan cheese', 'fresh italian parsley', 'extra virgin olive oil']
Steps: ['cook rigatoni until tender in large pot , drain pasta', 'meanwhile , preheat broiler', 'cook sausage in large heavy pot over medium high heat until no longer pink , stirring frequently and breaking up with back of a wooden spoon', 'add garlic and saute until soft , about 2 minutes', 'drain off excess oil and return pot to medium high heat', 'add marinara sauce and crushed red pepper , then pasta', 'season to taste with salt and pepper', 'divide pasta among four 1 1 / 4-cup custard cups , or double recipe and put in one large baking dish', 'sprinkle parmesan and mozzarella over', 'place in broiler until cheese melts 

In [17]:
# Save the vectorizer and TF-IDF matrix as pickle files
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Save the preprocessed recipes dataset
recipes.to_csv('preprocessed_recipes.csv', index=False)


In [18]:
row_data = recipes[recipes['id'] == 110548]

row_data

Unnamed: 0,name,id,minutes,tags,nutrition,steps,description,ingredients,n_ingredients,tags_cleaned,text_data
188361,skylike chili skyline chili copycat,110548,105,"['time-to-make', 'course', 'main-ingredient', ...","[241.5, 23.0, 14.0, 33.0, 38.0, 29.0, 2.0]","['brown ground beef and onion', 'drain', 'add ...","a copycat of skyline chili, true cincinnati ch...","['ground beef', 'onions', 'beef stock', 'tomat...",19,time-to-make course main-ingredient preparatio...,time-to-make course main-ingredient preparatio...
