In [21]:
#libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, euclidean, hamming
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import normalize



In [22]:
#read the data & have randome sample for reproducability
recipes = pd.read_csv('../data/full_clean_data.csv')
recipes.sample(3, random_state=123)

Unnamed: 0,name,url,category,rating,rating_count,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g,ingredients_parsed
7358,Maple Dill Carrots,https://www.allrecipes.com/recipe/87694/maple-...,side-dish,4.57,729,"['3 cups peeled and sliced carrots ', ' 2 tabl...",20,4,4 servings,117.3,16.1,6.0,1.0,"['carrot', 'butter', 'sugar']"
2656,Dill Pickle Soup,https://www.allrecipes.com/recipe/54804/dill-p...,soups-stews-and-chili,4.31,205,"['2 tablespoons butter ', ' ½ cup all-purpose ...",30,8,8 servings,104.3,13.7,4.2,3.1,"['butter', 'flour', 'chicken broth', 'pickle',..."
12521,White Bean Tabbouleh,https://www.allrecipes.com/recipe/205611/white...,salad,4.74,46,"['1 cup bulgur wheat, uncooked ', ' 1\u2009½ c...",110,6,6 servings,197.2,24.8,9.7,5.6,"['bulgur', 'spinach', 'cannellini']"


In [23]:
##rename the dataframe id
recipes.index.names = ['recipes_id']
recipes.head(2)

Unnamed: 0_level_0,name,url,category,rating,rating_count,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g,ingredients_parsed
recipes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Dessert Crepes,https://www.allrecipes.com/recipe/19037/desser...,breakfast-and-brunch,4.8,1156,"['4 eggs, lightly beaten ', ' 1\u2009⅓ cups m...",20,8,8 crepes,163.8,17.2,7.7,6.4,"['milk', 'flour', 'sugar']"
1,Chicken Parmesan,https://www.allrecipes.com/recipe/223042/chick...,world-cuisine,4.83,4245,"['4 skinless, boneless chicken breast halves ...",60,4,4 servings,470.8,24.8,24.9,42.1,"['chicken breast', 'egg', 'panko bread', 'parm..."


In [24]:
##remove punctuation from parsed_ingredients?
recipes['ingredients_parsed'] = recipes['ingredients_parsed'].str.replace('[^\w\s]','')
recipes.head(2)

  recipes['ingredients_parsed'] = recipes['ingredients_parsed'].str.replace('[^\w\s]','')


Unnamed: 0_level_0,name,url,category,rating,rating_count,ingredients,total,servings,yield,calories,carbohydrates_g,fat_g,protein_g,ingredients_parsed
recipes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,Dessert Crepes,https://www.allrecipes.com/recipe/19037/desser...,breakfast-and-brunch,4.8,1156,"['4 eggs, lightly beaten ', ' 1\u2009⅓ cups m...",20,8,8 crepes,163.8,17.2,7.7,6.4,milk flour sugar
1,Chicken Parmesan,https://www.allrecipes.com/recipe/223042/chick...,world-cuisine,4.83,4245,"['4 skinless, boneless chicken breast halves ...",60,4,4 servings,470.8,24.8,24.9,42.1,chicken breast egg panko bread parmesan cheese...


In [25]:
## Recomend a recipe based on the ingredient of another recipe < guider reference: https://www.datacamp.com/tutorial/recommender-systems-python 
##drop the features i will not use
#recipes.drop(['category', 'rating', 'rating_count', 'ingredients', 'ingredients_parsed'], axis=1, inplace=True)

In [26]:
## Normalize numerical values(total, calories, carbs, protein, fat)
normal_recipes = recipes[['total', 'calories', 'carbohydrates_g', 'fat_g', 'protein_g']]
normal_recipes = pd.DataFrame(normalize(normal_recipes, axis=0))
normal_recipes.columns = normal_recipes.columns
normal_recipes.index = normal_recipes.index
normal_recipes.head(2)

Unnamed: 0,0,1,2,3,4
0,0.000837,0.003888,0.003539,0.003818,0.002726
1,0.002512,0.011174,0.005103,0.012346,0.017931


In [30]:
#compute the similarity score
cosine_sim = linear_kernel(normal_recipes, normal_recipes)
cosine_sim.shape

(19224, 19224)

In [28]:
#Reverse mapping to identify the index of a recipe in the dataset, given its name.
indices = pd.Series(recipes.index, index=recipes['name'])
indices[3:6]

name
Two-Ingredient Pizza Dough    3
Basic Mashed Potatoes         4
Classic Waffles               5
Name: recipes_id, dtype: int64

In [31]:
#Defined a function
def recommend_recipe(name, cosine_sim=cosine_sim):
    #match the recipe name with the index
    indice = indices[name]
    #get the similarity score of other recipes with it
    similarity_score = list(enumerate(cosine_sim[indice]))
    #sort them based on the score
    similarity_score = sorted(similarity_score, key = lambda x: x[1], reverse=True)
    #get the score of top 10 most similar
    similarity_score = similarity_score[1:4]
    #get the recipe indices
    recipe_indices = [i[0] for i in similarity_score]
    #return the top 3
    return recipes['name'].iloc[recipe_indices]


In [35]:
recommend_recipe('Slow Cooker Broccoli Beef')

recipes_id
16119                          Slow Cooker Beef Bone Broth
6481     Mamita's Mojito Scallop Kabobs with Stuffed To...
10860                                          Brown Beans
Name: name, dtype: object