In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import altair as alt
import random
import ast
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
interactions = pd.read_csv('drive/MyDrive/CapstoneML/RAW_interactions.csv')
recipes = pd.read_csv('drive/MyDrive/CapstoneML/RAW_recipes.csv')

In [4]:
interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [5]:
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [21]:
#Only recipes with 2 or more interactions
interactions_filtered = interactions.groupby('recipe_id').filter(lambda x : len(x) >=2).sort_values(by = 'recipe_id')
recipes_filtered_reviews = interactions_filtered.recipe_id.unique()


# Extract and filterall ingredients and count their occurrences
ingredient_filter = recipes['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split()
ingredient_flat = [word for sublist in ingredient_filter for ingredient in sublist for word in ingredient.split()]
ingredient_counts = pd.Series(ingredient_flat).value_counts()
common_keywords = ingredient_counts[ingredient_counts > 2000].index.tolist()

def filter_common_ingredients(data, common_keywords):
    ingredients_series = recipes['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split(', ')
    ingredient_matrix = {key: [] for key in common_keywords}
    for recipe_ingredients in ingredients_series:
        flat_list = [item for item in recipe_ingredients]
        for key in common_keywords:
            ingredient_matrix[key].append(1 if any(key in ingredient for ingredient in flat_list) else 0)
    return pd.DataFrame(ingredient_matrix)

# Apply the filter function to get a DataFrame with binary indicators for common ingredients
ingredient_matrix = filter_common_ingredients(recipes, common_keywords)

# Now proceed with filtering recipes based on interactions and cooking times
interactions_filtered = interactions.groupby('recipe_id').filter(lambda x : len(x) >=2).sort_values(by = 'recipe_id')
recipes_filtered_reviews = interactions_filtered.recipe_id.unique()

# Keep only those with times in the interquantile range
q1 = recipes['minutes'].quantile(0.25)
q3 = recipes['minutes'].quantile(0.75)
iqr = q3 - q1

recipes_filtered_time = recipes[(recipes['minutes'] >= q1 - 1.5 * iqr) & (recipes['minutes'] <= q3 + 1.5 * iqr)].id.to_list()

# Intersect both restrictions
recipes_filtered = list(set(recipes_filtered_reviews).intersection(set(recipes_filtered_time)))

# Take 10k recipes at random
sample_size = 2000 # 2k to start for now
random.seed(1)
recipe_id_idx = random.sample(recipes_filtered, sample_size)

# Merge the data sets
merged_df = pd.merge(recipes, interactions, how = 'left', left_on ='id', right_on = 'recipe_id')

# Keep only the recipes and reviews for recipes in the recipe_id_idx
merged_df = merged_df[merged_df.recipe_id.isin(recipe_id_idx)]
merged_df.reset_index(drop=True, inplace=True)
merged_df['ingredients'] = merged_df['ingredients'].apply(lambda x: ast.literal_eval(x))
merged_df['nutrition'] = merged_df['nutrition'].apply(lambda x: ast.literal_eval(x))
merged_df['steps'] = merged_df['steps'].apply(lambda x: ast.literal_eval(x))
merged_df['tags'] = merged_df['tags'].apply(lambda x: ast.literal_eval(x))

merged_df.to_csv('clean_data.csv', index=False)

In [22]:
# from google.colab import files
# files.download('clean_data.csv')

In [23]:
merged_df_clean = pd.read_csv('clean_data.csv')

In [24]:
merged_df_clean.shape

(14788, 17)

In [25]:
merged_df_clean['ingredients'] = merged_df_clean['ingredients'].apply(lambda x: ast.literal_eval(x))
merged_df_clean['nutrition'] = merged_df_clean['nutrition'].apply(lambda x : ast.literal_eval(x))
merged_df_clean['steps'] = merged_df_clean['steps'].apply(lambda x : ast.literal_eval(x))
merged_df_clean['tags'] = merged_df_clean['tags'].apply(lambda x : ast.literal_eval(x))

## Recommend Recipe based on Ingredients

In [26]:
def recipes_by_ingredients(data, ingredients_input):
    # Convert ingredients input to a set for faster operation
    ingredients_set = set(ingredients_input)

    # Define a function to count how many input ingredients are in each recipe
    def ingredient_match_count(recipe_ingredients):
        recipe_ingredients_set = set(recipe_ingredients)
        return len(ingredients_set.intersection(recipe_ingredients_set))

    # Apply the function to count matches and add a new column 'match_count'
    data['match_count'] = data['ingredients'].apply(ingredient_match_count)

    # Aggregate data to avoid duplicates and sort by 'match_count'
    aggregated_data = data.groupby('name').agg({'match_count': 'max'}).reset_index()
    top_recipes = aggregated_data.sort_values(by='match_count', ascending=False).head(10)
    return top_recipes

In [27]:
ingredients_input = ['chicken', 'salt', 'pepper', 'beef', 'cheese', 'carrot', 'potato', 'pepper']
top_recipe_names = recipes_by_ingredients(merged_df_clean, ingredients_input)
print(top_recipe_names)

                                                   name  match_count
322                            cheese   potato pancakes            4
652                                      easy beefaroni            3
1446                                  pureed watercress            3
372                   chicken in new orleans wine sauce            3
1576  seafood casserole  for those who don t like se...            3
624                                   donna s cole slaw            3
1488                                      rice tortitas            3
452                                 chunky italian soup            3
568              cube steak with gravy and potatoes  ww            3
1524                            root soup w blue cheese            3


## Recomendation System

In [28]:
merged_df_clean = merged_df_clean.drop_duplicates(subset = 'recipe_id', keep = 'first')

In [29]:
#Prepare the data, keep only three cols
df = merged_df_clean[['name', 'ingredients','minutes','n_steps','n_ingredients']].copy()

#Create a list of unique ingredients
list_ingredients = list(set([ing for ingredients in df['ingredients'] for ing in ingredients]))

#Create matrix (recipes,ingredients) with zeros
ingredient_matrix = pd.DataFrame(0, index=df.index, columns=list_ingredients)

In [30]:
#populate the matrix
for i, ing_list in enumerate(df['ingredients']):
  ingredient_matrix.iloc[i, [list_ingredients.index(ing) for ing in ing_list]] = 1
ingredient_matrix

Unnamed: 0,unbleached cane sugar,whipped cream,sugar-free vanilla pudding mix,toasted sliced almonds,egg,amaretto,chocolate-covered coffee beans,cream cheese with chives,caramel-flavored bits,kraft caramels,...,dried rubbed sage,radish,wagon wheel macaroni,bay leaves,oyster mushroom,sea salt,roll,skim milk,pure vanilla extract,pure chile powder
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14692,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
ingredient_matrix.sum(axis = 0)

unbleached cane sugar               1
whipped cream                      11
sugar-free vanilla pudding mix      1
toasted sliced almonds              1
egg                               138
                                 ... 
sea salt                           27
roll                                1
skim milk                          28
pure vanilla extract               13
pure chile powder                   1
Length: 2638, dtype: int64

In [32]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_minutes = scaler.fit_transform(df['minutes'].values.reshape(-1,1))
normalized_n_steps = scaler.fit_transform(df['n_steps'].values.reshape(-1,1))
normalized_n_ingredients = scaler.fit_transform(df['n_ingredients'].values.reshape(-1,1))

df_normalized = df.copy()
df_normalized['minutes'] = normalized_minutes
df_normalized['n_steps'] = normalized_n_steps
df_normalized['n_ingredients'] = normalized_n_ingredients
df_normalized.reset_index(drop=True, inplace=True)

In [33]:
#Calculate the similarity for ingredients
ingredient_similarities = cosine_similarity(ingredient_matrix)

In [34]:
ingredient_similarities.shape

(2000, 2000)

In [35]:
#Calculate the distance for prep times

distances_minutes = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'minutes']], [df_normalized.loc[j,'minutes']])
    row_distances.append(distance)
  distances_minutes.append(row_distances)

100%|██████████| 2000/2000 [01:49<00:00, 18.21it/s]


In [36]:
#Calculate the distance for n_steps
from tqdm import tqdm
distances_n_steps = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'n_steps']], [df_normalized.loc[j,'n_steps']])
    row_distances.append(distance)
  distances_n_steps.append(row_distances)

100%|██████████| 2000/2000 [02:04<00:00, 16.06it/s]


In [37]:
#Calculate the distance for n_ingredients
from tqdm import tqdm
distances_n_ingredients = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'n_ingredients']], [df_normalized.loc[j,'n_ingredients']])
    row_distances.append(distance)
  distances_n_ingredients.append(row_distances)

100%|██████████| 2000/2000 [01:52<00:00, 17.75it/s]


In [39]:
# # Same function as the top, just to save lines

# def calculate_distances(df, column):
#   distances = []
#   for i in tqdm(range(len(df))):
#     for j in range(len(df)):
#       distance = euclidean([df.loc[i,column]], [df.loc[j,column]])
#       row_distances.append(distance)
#     distances.append(row_distances)

#   return distances

In [40]:
# distances_minutes = calculate_distances(df_normalized, 'minutes')
# distances_n_steps = calculate_distances(df_normalized, 'n_steps')
# distances_n_ingredients = calculate_distances(df_normalized, 'n_ingredients')

In [41]:
len(distances_minutes) , len(distances_n_steps), len(distances_n_ingredients)

(2000, 2000, 2000)

In [44]:
#Combine the similarity and distance scores; weight ingredients more
#All weights must add up to 1
weight_ingredients = 0.55
weight_minutes = 0.15
weight_n_steps = 0.15
weight_n_ingredients = 0.15
combined_scores = [[weight_ingredients*ingredient_similarities[i,j] \
                    +(weight_minutes)*(1/(1 + distances_minutes[i][j]))\
                    +(weight_n_ingredients)*(1/(1 + distances_n_ingredients[i][j]))\
                    +(weight_n_steps)*(1/(1 + distances_n_steps[i][j]))
                    for j in range(len(df_normalized))]
                   for i in range(len(df_normalized))]

df_scores = pd.DataFrame(combined_scores, index=df_normalized['name'],columns=df_normalized['name'])
#Export the scores
import gzip
with gzip.open('df_scores.csv.gz', 'wb') as f:
    df_scores.to_csv(f, index=True)

df_scores

name,pick me up party chicken kabobs,say what banana sandwich,better than a can homemade sloppy joes,buffalo wild wings medium wing sauce copycat by todd,s wonderful carrot and raisin salad,10 bars,15 minute chicken broccoli and rice dinner,2 minute broccoli,3 step biscuits or how to break up with your girlfriend biscuits,30 minute chicken tamales,...,zitumbuwa banana fritters,zoobana zucchini banana bread,zucchini onions with mozzarella cheese,zucchini appetizer squares,zucchini chicken,zucchini corn fritters,zucchini oven fries,zucchini ribbons with basil butter,zucchini wheat germ burgers,zucchini parmesan pancakes
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pick me up party chicken kabobs,1.000000,0.254858,0.203798,0.409053,0.478934,0.255009,0.306795,0.231737,0.276539,0.303811,...,0.363206,0.327801,0.475713,0.320225,0.347743,0.296356,0.370198,0.364703,0.413322,0.426787
say what banana sandwich,0.254858,1.000000,0.114166,0.224523,0.379012,0.314838,0.263745,0.358236,0.307654,0.184086,...,0.433423,0.291951,0.260185,0.180885,0.294079,0.171769,0.242694,0.238326,0.220706,0.335450
better than a can homemade sloppy joes,0.203798,0.114166,1.000000,0.206293,0.153775,0.124737,0.144988,0.115512,0.147077,0.161355,...,0.137287,0.187376,0.159089,0.254585,0.195417,0.197664,0.148784,0.217920,0.201816,0.200730
buffalo wild wings medium wing sauce copycat by todd,0.409053,0.224523,0.206293,1.000000,0.332565,0.236425,0.418747,0.395669,0.400340,0.265705,...,0.266840,0.286854,0.376791,0.350560,0.306795,0.304180,0.466270,0.324033,0.340231,0.374347
s wonderful carrot and raisin salad,0.478934,0.379012,0.153775,0.332565,1.000000,0.304380,0.308744,0.218091,0.273298,0.349045,...,0.421758,0.277508,0.441609,0.226098,0.332130,0.229023,0.394786,0.300448,0.288376,0.367230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zucchini corn fritters,0.296356,0.171769,0.197664,0.304180,0.229023,0.195761,0.316955,0.178598,0.242646,0.182846,...,0.236873,0.336277,0.339595,0.359349,0.309131,1.000000,0.321779,0.280706,0.466556,0.415598
zucchini oven fries,0.370198,0.242694,0.148784,0.466270,0.394786,0.276326,0.394028,0.273244,0.332592,0.373101,...,0.410405,0.425168,0.457620,0.343633,0.448561,0.321779,1.000000,0.461493,0.413535,0.450515
zucchini ribbons with basil butter,0.364703,0.238326,0.217920,0.324033,0.300448,0.374158,0.385178,0.266963,0.327259,0.287802,...,0.297166,0.267571,0.454152,0.357012,0.446510,0.280706,0.461493,1.000000,0.329275,0.487399
zucchini wheat germ burgers,0.413322,0.220706,0.201816,0.340231,0.288376,0.251277,0.337934,0.247035,0.296089,0.317203,...,0.295230,0.334925,0.410388,0.481777,0.314549,0.466556,0.413535,0.329275,1.000000,0.396796


In [45]:
#Select a recipe at random
recipe_name = df_normalized.loc[random.sample(range(len(df_normalized)),1),'name'].iloc[0]
recipe_name

#Recommend 5 similar recipes
similar_recipes_2 = df_scores[recipe_name].reset_index(drop=False).sort_values(by = recipe_name, ascending=False).iloc[1:5].name.tolist()
atts_recipe = df[df.name == recipe_name]
recommendations = [{'recipe':i,'minutes':df[df.name ==i].minutes.iloc[0],'n_steps':df[df.name==i].n_steps.iloc[0]
                    , 'n_ingredients':df[df.name==i].n_ingredients.iloc[0]} for i in similar_recipes_2]
recipe = {'recipe':recipe_name,'minutes':df[df.name ==recipe_name].minutes.iloc[0],'n_steps':df[df.name==recipe_name].n_steps.iloc[0]
                    , 'n_ingredients':df[df.name==recipe_name].n_ingredients.iloc[0]}

In [46]:
recipe

{'recipe': 'cantaloupe banana slush',
 'minutes': 5,
 'n_steps': 7,
 'n_ingredients': 5}

In [47]:
recommendations

[{'recipe': 'swedish melon with red raspberry puree  melon och hallendessert',
  'minutes': 10,
  'n_steps': 12,
  'n_ingredients': 5},
 {'recipe': 'traditional irish coffee',
  'minutes': 5,
  'n_steps': 7,
  'n_ingredients': 4},
 {'recipe': 'banana caterpillars',
  'minutes': 10,
  'n_steps': 7,
  'n_ingredients': 5},
 {'recipe': 'zitumbuwa  banana fritters',
  'minutes': 25,
  'n_steps': 2,
  'n_ingredients': 5}]