In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import altair as alt
import random
import ast
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the raw interactions and recipes tables (paths are relative to the working directory).
interactions, recipes = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/MyDrive/CapstoneML/RAW_interactions.csv',
        'drive/MyDrive/CapstoneML/RAW_recipes.csv',
    )
)

In [None]:
# Preview the first five interaction rows.
interactions.iloc[:5]

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [None]:
# Preview the first five recipe rows.
recipes.iloc[:5]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [None]:
#Only recipes with 2 or more interactions.
#A vectorised group-size mask replaces groupby().filter(lambda ...), which calls Python
#once per recipe; the rows kept and their order are the same.
reviews_per_recipe = interactions.groupby('recipe_id')['recipe_id'].transform('size')
interactions_filtered = interactions[reviews_per_recipe >= 2].sort_values(by = 'recipe_id')
recipes_filtered_reviews = interactions_filtered.recipe_id.unique()

#Drop preparation-time outliers: keep recipes whose 'minutes' lies within the
#1.5*IQR fences, i.e. in [q1 - 1.5*iqr, q3 + 1.5*iqr]
q1 = recipes['minutes'].quantile(0.25)
q3 = recipes['minutes'].quantile(0.75)
iqr = q3-q1

within_fences = recipes['minutes'].between(q1 - 1.5*iqr, q3 + 1.5*iqr)
recipes_filtered_time = recipes.loc[within_fences, 'id'].to_list()

#Intersect both restrictions
recipes_filtered = list(set(recipes_filtered_reviews).intersection(set(recipes_filtered_time)))

#Take 10k recipes at random (fewer if not enough recipes pass both filters,
#because random.sample raises ValueError when asked for more than the population)
sample_size = 10000
random.seed(1)
recipe_id_idx = random.sample(recipes_filtered, min(sample_size, len(recipes_filtered)))

#Merge the data sets: one row per (recipe, interaction) pair
merged_df = pd.merge(recipes, interactions, how = 'left', left_on ='id', right_on = 'recipe_id')

#Keep only the recipes and reviews for recipes in the recipe_id_idx.
#reset_index returns a new frame, so the column assignments below do not write into a slice.
merged_df = merged_df[merged_df.recipe_id.isin(recipe_id_idx)].reset_index(drop=True)

#Parse the list-valued columns, which are stored in the raw CSV as Python-literal strings
merged_df['ingredients'] = merged_df['ingredients'].apply(ast.literal_eval)
merged_df['nutrition'] = merged_df['nutrition'].apply(ast.literal_eval)
merged_df['steps'] = merged_df['steps'].apply(ast.literal_eval)
merged_df['tags'] = merged_df['tags'].apply(ast.literal_eval)

#NOTE: to_csv writes the lists back out as text, so they must be parsed again after reloading
merged_df.to_csv('clean_data.csv', index = False)

In [None]:
# Trigger a browser download of the exported clean_data.csv from the Colab runtime.
from google.colab import files
files.download('clean_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the exported data set from disk.
merged_df_clean = pd.read_csv(filepath_or_buffer='clean_data.csv')

In [None]:
# (number of rows, number of columns) of the reloaded data.
(len(merged_df_clean.index), len(merged_df_clean.columns))

(71255, 17)

In [None]:
#read_csv returns the list-valued columns as strings; parse them back into Python lists.
#Only strings are parsed, so re-running this cell on already-converted data leaves it
#unchanged instead of raising ValueError from ast.literal_eval.
def _parse_literal(value):
  """Return ast.literal_eval(value) for strings; pass any other value through unchanged."""
  return ast.literal_eval(value) if isinstance(value, str) else value

merged_df_clean['ingredients'] = merged_df_clean['ingredients'].apply(_parse_literal)
merged_df_clean['nutrition'] = merged_df_clean['nutrition'].apply(_parse_literal)
merged_df_clean['steps'] = merged_df_clean['steps'].apply(_parse_literal)
merged_df_clean['tags'] = merged_df_clean['tags'].apply(_parse_literal)

## Recommendation System

In [None]:
# Keep one row per recipe: the first occurrence of each recipe_id.
repeated_recipe = merged_df_clean.duplicated(subset='recipe_id', keep='first')
merged_df_clean = merged_df_clean.loc[~repeated_recipe]

In [None]:
#Prepare the data: keep the recipe name plus the four features used for similarity
df = merged_df_clean[['name', 'ingredients','minutes','n_steps','n_ingredients']].copy()

#Create a list of unique ingredients.
#Sorted so the column order is the same on every run; plain set iteration order for
#strings can change between kernel sessions.
list_ingredients = sorted({ing for ingredients in df['ingredients'] for ing in ingredients})

#Create matrix (recipes,ingredients) with zeros
ingredient_matrix = pd.DataFrame(0, index=df.index, columns=list_ingredients)

In [None]:
#Populate the matrix: entry (recipe, ingredient) is 1 when the recipe uses the ingredient.
#A dict gives constant-time column lookup (list.index scans the whole list each time),
#and filling a NumPy buffer avoids one slow DataFrame .iloc assignment per recipe.
ingredient_position = {ing: pos for pos, ing in enumerate(list_ingredients)}
indicator = np.zeros((len(df), len(list_ingredients)), dtype='int64')
for row, ing_list in enumerate(df['ingredients']):
  indicator[row, [ingredient_position[ing] for ing in ing_list]] = 1
ingredient_matrix = pd.DataFrame(indicator, index=df.index, columns=list_ingredients)
ingredient_matrix

Unnamed: 0,fresh spinach,maraschino cherry juice,dry red wine,gala apples,fresh peaches,durkee onions,vanilla beans,boneless lean pork,dried ginger,blueberry pie filling,...,pineapple chunks in syrup,salmon,dried red lentils,all-bran cereal,fruit spread,sugar-free strawberry jam,tri-color spiral pasta,quick-cooking oats,white balsamic vinegar,lemon thyme
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
55,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71221,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71238,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Number of recipes that use each ingredient (column-wise totals).
ingredient_matrix.sum(axis='index')

fresh spinach                51
maraschino cherry juice       4
dry red wine                 42
gala apples                   3
fresh peaches                12
                             ..
sugar-free strawberry jam     2
tri-color spiral pasta        2
quick-cooking oats           33
white balsamic vinegar        3
lemon thyme                   1
Length: 5318, dtype: int64

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise each numeric feature on its own (the scaler is re-fitted per column);
# every result is an (n_recipes, 1) array.
scaler = StandardScaler()
normalized_minutes, normalized_n_steps, normalized_n_ingredients = (
    scaler.fit_transform(df[feature].to_numpy().reshape(-1, 1))
    for feature in ('minutes', 'n_steps', 'n_ingredients')
)

# Copy of df with the standardised features and a fresh 0..n-1 index.
df_normalized = df.assign(
    minutes=normalized_minutes,
    n_steps=normalized_n_steps,
    n_ingredients=normalized_n_ingredients,
).reset_index(drop=True)

In [None]:
# Pairwise cosine similarity between the recipes' ingredient indicator rows.
ingredient_similarities = cosine_similarity(X=ingredient_matrix)

In [None]:
# Dimensions of the similarity matrix.
np.shape(ingredient_similarities)

(10000, 10000)

In [None]:
#Calculate the distance for prep times
#For one-dimensional points the Euclidean distance is |a - b|, so the full pairwise
#matrix comes from one broadcasted subtraction instead of len(df_normalized)**2
#Python-level euclidean() calls with .loc scalar lookups.
#The result is a 2-D array; distances_minutes[i][j] and len(distances_minutes)
#behave as they did for the nested list.
from tqdm import tqdm  #no longer used in this cell; kept in case later cells rely on it
minutes_values = df_normalized['minutes'].to_numpy(dtype=float)
distances_minutes = np.abs(minutes_values[:, None] - minutes_values[None, :])

 22%|██▏       | 2194/10000 [10:41<40:21,  3.22it/s]

In [None]:
#Calculate the distance for n_steps
#For one-dimensional points the Euclidean distance is |a - b|, so the full pairwise
#matrix comes from one broadcasted subtraction instead of len(df_normalized)**2
#Python-level euclidean() calls with .loc scalar lookups.
#The result is a 2-D array; distances_n_steps[i][j] and len(distances_n_steps)
#behave as they did for the nested list.
from tqdm import tqdm  #no longer used in this cell; kept in case later cells rely on it
n_steps_values = df_normalized['n_steps'].to_numpy(dtype=float)
distances_n_steps = np.abs(n_steps_values[:, None] - n_steps_values[None, :])

In [None]:
#Calculate the distance for n_ingredients
#For one-dimensional points the Euclidean distance is |a - b|, so the full pairwise
#matrix comes from one broadcasted subtraction instead of len(df_normalized)**2
#Python-level euclidean() calls with .loc scalar lookups.
#The result is a 2-D array; distances_n_ingredients[i][j] and len(distances_n_ingredients)
#behave as they did for the nested list.
from tqdm import tqdm  #no longer used in this cell; kept in case later cells rely on it
n_ingredients_values = df_normalized['n_ingredients'].to_numpy(dtype=float)
distances_n_ingredients = np.abs(n_ingredients_values[:, None] - n_ingredients_values[None, :])

In [None]:
# Row counts of the three pairwise-distance tables.
tuple(len(table) for table in (distances_minutes, distances_n_steps, distances_n_ingredients))

In [None]:
#Combine the similarity and distance scores; weight ingredients more
#All weights must add up to 1
weight_ingredients = 0.55
weight_minutes = 0.15
weight_n_steps = 0.15
weight_n_ingredients = 0.15
assert np.isclose(weight_ingredients + weight_minutes + weight_n_steps + weight_n_ingredients, 1.0), \
  'weights must add up to 1'

#Score = weighted ingredient similarity plus, for each numeric feature, a weighted
#closeness 1/(1 + distance), which is 1 for identical values and decreases towards 0.
#The ingredient term is a product: the previous `^` is bitwise XOR and raises TypeError on floats.
#Whole-matrix arithmetic replaces the nested per-pair comprehension; np.asarray accepts
#the distance tables whether they are nested lists or arrays.
combined_scores = weight_ingredients * np.asarray(ingredient_similarities, dtype=float)
for feature_weight, feature_distances in ((weight_minutes, distances_minutes),
                                          (weight_n_ingredients, distances_n_ingredients),
                                          (weight_n_steps, distances_n_steps)):
  combined_scores += feature_weight / (1 + np.asarray(feature_distances, dtype=float))

#Rows and columns are labelled by recipe name, in the same order as df_normalized
df_scores = pd.DataFrame(combined_scores, index=df_normalized['name'],columns=df_normalized['name'])
#Export the scores (gzip stream opened in text mode so to_csv can write strings to it)
import gzip
with gzip.open('df_scores.csv.gz', 'wt', newline='') as f:
  df_scores.to_csv(f,index=True)

df_scores

In [None]:
#Select a recipe at random.
#Work with its row position: recipe names are not guaranteed to be unique, and
#df_scores / df_normalized / df all hold the recipes in the same order.
recipe_pos = random.sample(range(len(df_normalized)), 1)[0]
recipe_name = df_normalized['name'].iloc[recipe_pos]

#Recommend 5 similar recipes
n_recommendations = 5
#Scores of every recipe against the selected one, indexed by row position
scores_vs_selected = pd.Series(df_scores.iloc[:, recipe_pos].to_numpy())
#Exclude the selected recipe itself explicitly rather than assuming it sorts first
ranked_positions = scores_vs_selected.drop(index=recipe_pos).sort_values(ascending=False).index
similar_positions = list(ranked_positions[:n_recommendations])
similar_recipes_2 = df_normalized['name'].iloc[similar_positions].tolist()
atts_recipe = df[df.name == recipe_name]

def _recipe_summary(pos):
  """Return the name and un-normalised minutes / n_steps / n_ingredients of the recipe at row position pos of df."""
  return {'recipe': df['name'].iloc[pos],
          'minutes': df['minutes'].iloc[pos],
          'n_steps': df['n_steps'].iloc[pos],
          'n_ingredients': df['n_ingredients'].iloc[pos]}

recommendations = [_recipe_summary(pos) for pos in similar_positions]
recipe = _recipe_summary(recipe_pos)

In [None]:
# Show the randomly selected recipe with its minutes, n_steps and n_ingredients.
recipe

In [None]:
# Show the recommended recipes, each with its minutes, n_steps and n_ingredients.
recommendations