In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('drive/MyDrive/CapstoneML/RAW_recipes.csv')

In [4]:
data.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [5]:
all_ingredients = data['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split()
all_ingredients_flat = [word for sublist in all_ingredients for ingredient in sublist for word in ingredient.split()]
ingredient_counts = pd.Series(all_ingredients_flat).value_counts()

In [6]:
# Filter ingredients that occur more than 2000 times
common_keywords = ingredient_counts[ingredient_counts > 2000].index.tolist()

In [7]:
common_keywords

["'salt',",
 "'fresh",
 "pepper',",
 "oil',",
 "cheese',",
 "'garlic",
 "powder',",
 "sauce',",
 "'ground",
 "'butter',",
 "'sugar',",
 "sugar',",
 "'onion',",
 "juice',",
 "'baking",
 "flour',",
 "cloves',",
 "'eggs',",
 "'red",
 "'dried",
 "'green",
 "'water',",
 "'olive",
 "cream',",
 "'salt",
 "'milk',",
 "'lemon",
 'and',
 "'white",
 "'flour',",
 "'brown",
 'ground',
 "'black",
 'black',
 "pepper']",
 "onion',",
 "vinegar',",
 "'pepper',",
 "'chicken",
 "'garlic',",
 'chicken',
 "salt',",
 "'vegetable",
 "'egg',",
 "'parmesan",
 "cheese']",
 "soda',",
 "broth',",
 "milk',",
 "'vanilla",
 "butter',",
 "tomatoes',",
 "beans',",
 "extract',",
 "'cream",
 "'all-purpose",
 "mustard',",
 "mix',",
 "'dry",
 "'sour",
 "['butter',",
 "'vanilla',",
 "leaves',",
 "'cinnamon',",
 'bell',
 "'tomato",
 "wine',",
 "beef',",
 "'frozen",
 "'tomatoes',",
 "clove',",
 "'chili",
 'pepper',
 "'egg",
 'red',
 "'orange",
 "'whole",
 "onions',",
 "'celery',",
 "'cheddar",
 "parsley',",
 "flakes',",
 "wat

In [8]:
def filter_common_ingredients(data, common_keywords):
    # Extract the ingredients from the dataset and prepare for filtering
    ingredients_series = data['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split(', ')

    # Initialize a dictionary to hold the 2D matrix
    ingredient_matrix = {key: [] for key in common_keywords}

    # Loop through each recipe's ingredients
    for recipe_ingredients in ingredients_series:
        flat_list = [item for item in recipe_ingredients]
        for key in common_keywords:
            ingredient_matrix[key].append(1 if any(key in ingredient for ingredient in flat_list) else 0)

    # Convert the dictionary to a DataFrame
    ingredient = pd.DataFrame(ingredient_matrix)
    return ingredient

In [9]:
# Generate the ingredient matrix
ingredient = filter_common_ingredients(data, common_keywords)

# Display the first few rows of the resulting DataFrame to verify the results
ingredient.head()

Unnamed: 0,"'salt',",'fresh,"pepper',","oil',","cheese',",'garlic,"powder',","sauce',",'ground,"'butter',",...,"nuts',","'scallions',","chops',","'shallots',","'salsa',",'ice,'cold,'pork,fresh,'bell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Recommend Recipe based on Ingredient

In [10]:
def find_recipes_by_ingredients(data, ingredients_input):
    # Convert ingredients input to a set for faster operation
    ingredients_set = set(ingredients_input)

    # Define a function to count how many input ingredients are in each recipe
    def ingredient_match_count(recipe_ingredients):
        recipe_ingredients_set = set(recipe_ingredients.lower().replace('[^\w\s]', '').split(', '))
        return len(ingredients_set.intersection(recipe_ingredients_set))

    # Apply the function to count matches and add a new column 'match_count'
    data['match_count'] = data['ingredients'].apply(ingredient_match_count)

    # Sort by 'match_count' and return the top 10 recipe names
    top_recipes = data.sort_values(by='match_count', ascending=False).head(10)
    return top_recipes['name']

In [11]:
# Example ingredients input
ingredients_input = ['chicken', 'salt']  # Modify this list with your desired ingredients
top_recipe_names = find_recipes_by_ingredients(data, ingredients_input)
print(top_recipe_names)

0                arriba   baked winter squash mexican style
154444    patates fourno riganates  baked potatoes with ...
154418                                  pat s pumpkin bread
154419                              pat s rice a roni salad
154420                     pat s scalloped potatoes supreme
154421                   pat s secret technique chili no  4
154422                         pat s southern fried chicken
154423                                      pat s spaghetti
154424                              pat s spicy fried wings
154425                              pat s tomato bread soup
Name: name, dtype: object


Recommend similar recipes

In [None]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import euclidean_distances
import random

In [None]:
svd = TruncatedSVD(n_components=10, random_state=42)
reduced_matrix = svd.fit_transform(ingredient)

In [None]:
distance_matrix = euclidean_distances(reduced_matrix)

In [None]:
# def get_similar_recipes(recipe_index, num_recipes=5):
#     sim_scores = list(enumerate(cosine_sim[recipe_index]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:num_recipes+1]  # Exclude the recipe itself from its results
#     recipe_indices = [i[0] for i in sim_scores]
#     return data.iloc[recipe_indices]

In [None]:
# cosine_sim = cosine_similarity(ingredient)

In [None]:
# Function to get recipes similar to a given recipe
# def get_similar_recipes(recipe_index, num_recipes=5):
#     sim_scores = list(enumerate(cosine_sim[recipe_index]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     sim_scores = sim_scores[1:num_recipes+1]  # Exclude the recipe itself from its results
#     recipe_indices = [i[0] for i in sim_scores]
#     return data.iloc[recipe_indices]

In [None]:
# similar_recipes = get_similar_recipes(0, 5)
# print(similar_recipes)