In [15]:
import ast
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Read the raw recipes table from disk
recipes = pd.read_csv('RAW_recipes.csv')

# Pull out the single recipe whose id is 110548 and print it as plain text
row_data = recipes.loc[recipes['id'].eq(110548)]
print(row_data)

In [None]:
# Count the missing values in each column
recipes.isna().sum()

In [None]:
# Describe the data set: summary statistics of its columns
# (by pandas' default only the numeric columns of a mixed-dtype frame are summarised)
recipes.describe()

In [None]:
# Check recipes info (column dtypes, non-null counts, memory usage)
recipes.info()

In [None]:
# Check columns: list the column labels available in the data set
recipes.columns

In [None]:
# Keep only the columns used for content-based filtering of recipes.
# .copy() turns the column subset into an independent frame, so the columns
# added to `recipes` in the following cell do not raise pandas'
# SettingWithCopyWarning (which fires when assigning into a slice of another frame).
recipes = recipes[
    ['name', 'id', 'minutes', 'tags', 'nutrition', 'steps', 'description', 'ingredients', 'n_ingredients']
].copy()
recipes

In [None]:
# Build the free-text field the TF-IDF model is fitted on:
# tags + description + ingredients, separated by single spaces.
def _join_list_literal(cell):
    """Parse a stringified list such as "['a', 'b']" and join its items with spaces.

    ast.literal_eval only accepts Python literals, so (unlike eval) a crafted
    CSV cell cannot execute arbitrary code.
    """
    return " ".join(ast.literal_eval(cell))


tags_text = recipes['tags'].apply(_join_list_literal)
ingredients_text = recipes['ingredients'].apply(_join_list_literal)

# A missing description must be blanked BEFORE concatenating: str + NaN is NaN,
# which would otherwise discard the tags and ingredients of that recipe as well.
description_text = recipes['description'].fillna("")

# assign() returns a new frame instead of writing into a possibly sliced one,
# and makes this cell safe to re-run.
recipes = recipes.assign(
    tags_cleaned=tags_text,
    text_data=tags_text + " " + description_text + " " + ingredients_text,
)
recipes

In [None]:
# Create the TF-IDF vectorizer; stop_words='english' makes it ignore
# scikit-learn's built-in list of English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer

In [None]:
# Fit the vectorizer on the combined recipe text and encode every recipe
# as one TF-IDF row.
tfidf_matrix = vectorizer.fit_transform(recipes['text_data'])
# Display the matrix dimensions: (number of rows in recipes, number of learned terms)
tfidf_matrix.shape

In [57]:
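# # Disabled: compute cosine similarity for the entire TF-IDF matrix (an n_recipes x n_recipes result); the per-query batched similarity further down is used instead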
# cosine_sim_matrix = cosine_similarity(tfidf_matrix)
# cosine_sim_matrix

In [None]:
# # Disabled: save the full cosine similarity matrix (only meaningful if the computation in the previous cell is re-enabled)
# with open('cosine_similarity.pkl', 'wb') as f:
#     pickle.dump(cosine_sim_matrix, f)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Describe what the user wants as a list of keywords, then collapse the
# keywords into one space-separated query string ("vegetarian Italian spicy").
preferences = ["vegetarian", "Italian", "spicy"]
user_query = " ".join(preferences)

# Encode the query with the already fitted vectorizer so it lives in the
# same TF-IDF space as the recipes.
user_vector = vectorizer.transform([user_query])

# Compute similarity incrementally
def compute_similarity_incrementally(user_vector, tfidf_matrix, batch_size=1000):
    """Cosine similarity between a query vector and every row of a matrix.

    The rows of ``tfidf_matrix`` are scored in consecutive slices of
    ``batch_size`` rows, so only one slice at a time is handed to
    ``cosine_similarity``.

    Args:
        user_vector: Query matrix, expected to hold a single row (e.g. the
            result of ``vectorizer.transform([query])``).
        tfidf_matrix: Row-sliceable matrix exposing ``.shape``; one row per
            document.
        batch_size: Number of rows scored per call. Must be at least 1.

    Returns:
        list: One similarity score per row of ``tfidf_matrix``, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. (A negative value
            would otherwise make the loop run zero times and silently
            return an empty list.)
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    n_rows = tfidf_matrix.shape[0]
    similarities = []
    for start in range(0, n_rows, batch_size):
        batch = tfidf_matrix[start:start + batch_size]
        # cosine_similarity returns a 2-D (queries x batch rows) array;
        # flatten it so the scores can be appended in row order.
        similarities.extend(cosine_similarity(user_vector, batch).ravel())
    return similarities

# Score every recipe against the user query, one batch of rows at a time.
similarity_scores = compute_similarity_incrementally(user_vector, tfidf_matrix)

# Number of recommendations to show.
TOP_N = 10

# Rank recipes by similarity, highest first. Recipes with a score of 0 are
# dropped: without this, a query that matches nothing would "recommend" the
# first TOP_N rows of the data set. sorted() is stable, so equally scored
# recipes keep their original relative order.
ranked_indices = sorted(
    (i for i, score in enumerate(similarity_scores) if score > 0),
    key=lambda i: similarity_scores[i],
    reverse=True,
)
top_indices = ranked_indices[:TOP_N]

# Retrieve top recommendations. The lookup is positional: tfidf_matrix was
# built from recipes['text_data'], so score i belongs to the i-th row of recipes.
recommended_recipes = recipes.iloc[top_indices]

# Display recommendations
if recommended_recipes.empty:
    print("No recipes matched the given preferences.")

for _, row in recommended_recipes.iterrows():
    recipe_name = row['name']
    # Guard against a missing name (NaN is a float and has no .upper()).
    if not isinstance(recipe_name, str):
        recipe_name = "(unnamed recipe)"
    print(f"RECIPE NAME: {recipe_name.upper()}")
    print(f"Description: {row['description']}")
    print(f"Ingredients: {row['ingredients']}")
    print(f"Steps: {row['steps']}")
    print("-" * 50)


In [26]:
# Persist the fitted vectorizer and the TF-IDF matrix, each to its own pickle file
for pickle_path, artifact in (
    ('vectorizer.pkl', vectorizer),
    ('tfidf_matrix.pkl', tfidf_matrix),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(artifact, f)

# Write the preprocessed recipes table next to them (row index not included)
recipes.to_csv('preprocessed_recipes.csv', index=False)


In [None]:
# Look up the recipe with id 110548 again, now in the current `recipes` frame
row_data = recipes.loc[recipes['id'].eq(110548)]

row_data