In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import altair as alt
import random
import ast
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
interactions = pd.read_csv('drive/MyDrive/CapstoneML/RAW_interactions.csv')
recipes = pd.read_csv('drive/MyDrive/CapstoneML/RAW_recipes.csv')

In [4]:
interactions.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [5]:
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [6]:
#Only recipes with 2 or more interactions
interactions_filtered = interactions.groupby('recipe_id').filter(lambda x : len(x) >=2).sort_values(by = 'recipe_id')
recipes_filtered_reviews = interactions_filtered.recipe_id.unique()


# Extract and filterall ingredients and count their occurrences
ingredient_filter = recipes['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split()
ingredient_flat = [word for sublist in ingredient_filter for ingredient in sublist for word in ingredient.split()]
ingredient_counts = pd.Series(ingredient_flat).value_counts()
common_keywords = ingredient_counts[ingredient_counts > 2000].index.tolist()

def filter_common_ingredients(data, common_keywords):
    ingredients_series = recipes['ingredients'].str.lower().str.replace('[^\w\s]', '').str.split(', ')
    ingredient_matrix = {key: [] for key in common_keywords}
    for recipe_ingredients in ingredients_series:
        flat_list = [item for item in recipe_ingredients]
        for key in common_keywords:
            ingredient_matrix[key].append(1 if any(key in ingredient for ingredient in flat_list) else 0)
    return pd.DataFrame(ingredient_matrix)

# Apply the filter function to get a DataFrame with binary indicators for common ingredients
ingredient_matrix = filter_common_ingredients(recipes, common_keywords)

# Now proceed with filtering recipes based on interactions and cooking times
interactions_filtered = interactions.groupby('recipe_id').filter(lambda x : len(x) >=2).sort_values(by = 'recipe_id')
recipes_filtered_reviews = interactions_filtered.recipe_id.unique()

# Keep only those with times in the interquantile range
q1 = recipes['minutes'].quantile(0.25)
q3 = recipes['minutes'].quantile(0.75)
iqr = q3 - q1

recipes_filtered_time = recipes[(recipes['minutes'] >= q1 - 1.5 * iqr) & (recipes['minutes'] <= q3 + 1.5 * iqr)].id.to_list()

# Intersect both restrictions
recipes_filtered = list(set(recipes_filtered_reviews).intersection(set(recipes_filtered_time)))

# Take 10k recipes at random
sample_size = 5000 # 2k to start for now
random.seed(1)
recipe_id_idx = random.sample(recipes_filtered, sample_size)

# Merge the data sets
merged_df = pd.merge(recipes, interactions, how = 'left', left_on ='id', right_on = 'recipe_id')

# Keep only the recipes and reviews for recipes in the recipe_id_idx
merged_df = merged_df[merged_df.recipe_id.isin(recipe_id_idx)]
merged_df.reset_index(drop=True, inplace=True)
merged_df['ingredients'] = merged_df['ingredients'].apply(lambda x: ast.literal_eval(x))
merged_df['nutrition'] = merged_df['nutrition'].apply(lambda x: ast.literal_eval(x))
merged_df['steps'] = merged_df['steps'].apply(lambda x: ast.literal_eval(x))
merged_df['tags'] = merged_df['tags'].apply(lambda x: ast.literal_eval(x))

merged_df.to_csv('clean_data.csv', index=False)

In [7]:
# from google.colab import files
# files.download('clean_data.csv')

In [8]:
merged_df_clean = pd.read_csv('clean_data.csv')

In [9]:
merged_df_clean.shape

(35623, 17)

In [10]:
merged_df_clean['ingredients'] = merged_df_clean['ingredients'].apply(lambda x: ast.literal_eval(x))
merged_df_clean['nutrition'] = merged_df_clean['nutrition'].apply(lambda x : ast.literal_eval(x))
merged_df_clean['steps'] = merged_df_clean['steps'].apply(lambda x : ast.literal_eval(x))
merged_df_clean['tags'] = merged_df_clean['tags'].apply(lambda x : ast.literal_eval(x))

## Recommend Recipe based on Ingredients

In [11]:
def recipes_by_ingredients(data, ingredients_input):
    # Convert ingredients input to a set for faster operation
    ingredients_set = set(ingredients_input)

    # Define a function to count how many input ingredients are in each recipe
    def ingredient_match_count(recipe_ingredients):
        recipe_ingredients_set = set(recipe_ingredients)
        return len(ingredients_set.intersection(recipe_ingredients_set))

    # Apply the function to count matches and add a new column 'match_count'
    data['match_count'] = data['ingredients'].apply(ingredient_match_count)

    # Aggregate data to avoid duplicates and sort by 'match_count'
    aggregated_data = data.groupby('name').agg({'match_count': 'max'}).reset_index()
    top_recipes = aggregated_data.sort_values(by='match_count', ascending=False).head(10)
    return top_recipes

In [12]:
ingredients_input = ['chicken', 'salt', 'pepper', 'beef', 'cheese', 'carrot', 'potato', 'pepper']
top_recipe_names = recipes_by_ingredients(merged_df_clean, ingredients_input)
print(top_recipe_names)

                                                   name  match_count
848                            cheese   potato pancakes            4
2479                    joyce s cheese n chicken dinner            4
3802               root vegetables casserole for winter            4
3607                                  pureed watercress            3
3137                                    nutty wild rice            3
3961                                         shell soup            3
630          broiled flounder with puffy cheese topping            3
4249                      steph s savory stewed chicken            3
1285                        cottage pie simply the best            3
1864  frango  moda do alentejana  chicken with potat...            3


## Recomendation System

In [13]:
merged_df_clean = merged_df_clean.drop_duplicates(subset = 'recipe_id', keep = 'first')

In [14]:
#Prepare the data, keep only three cols
df = merged_df_clean[['name', 'ingredients','minutes','n_steps','n_ingredients']].copy()

#Create a list of unique ingredients
list_ingredients = list(set([ing for ingredients in df['ingredients'] for ing in ingredients]))

#Create matrix (recipes,ingredients) with zeros
ingredient_matrix = pd.DataFrame(0, index=df.index, columns=list_ingredients)

In [15]:
#populate the matrix
for i, ing_list in enumerate(df['ingredients']):
  ingredient_matrix.iloc[i, [list_ingredients.index(ing) for ing in ing_list]] = 1
ingredient_matrix

Unnamed: 0,cucumber,canned pineapple,black cherries,whole chicken,mexican blend cheese,minced clams,red bell peppers,whipped cream cheese,lime peel,seasoned pepper,...,chevre cheese,chive & onion cream cheese,tap water,special k cereal,syrup,roma tomato,morton tender quick salt,beef t-bone steaks,broth,madras curry powder
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
49,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
54,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35590,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35592,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
ingredient_matrix.sum(axis = 0)

cucumber                    63
canned pineapple             1
black cherries               1
whole chicken                6
mexican blend cheese        11
                            ..
roma tomato                  4
morton tender quick salt     1
beef t-bone steaks           1
broth                        4
madras curry powder          3
Length: 4001, dtype: int64

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_minutes = scaler.fit_transform(df['minutes'].values.reshape(-1,1))
normalized_n_steps = scaler.fit_transform(df['n_steps'].values.reshape(-1,1))
normalized_n_ingredients = scaler.fit_transform(df['n_ingredients'].values.reshape(-1,1))

df_normalized = df.copy()
df_normalized['minutes'] = normalized_minutes
df_normalized['n_steps'] = normalized_n_steps
df_normalized['n_ingredients'] = normalized_n_ingredients
df_normalized.reset_index(drop=True, inplace=True)

In [18]:
#Calculate the similarity for ingredients
ingredient_similarities = cosine_similarity(ingredient_matrix)

In [19]:
ingredient_similarities.shape

(5000, 5000)

In [20]:
#Calculate the distance for prep times

distances_minutes = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'minutes']], [df_normalized.loc[j,'minutes']])
    row_distances.append(distance)
  distances_minutes.append(row_distances)

100%|██████████| 5000/5000 [11:29<00:00,  7.25it/s]


In [21]:
#Calculate the distance for n_steps
from tqdm import tqdm
distances_n_steps = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'n_steps']], [df_normalized.loc[j,'n_steps']])
    row_distances.append(distance)
  distances_n_steps.append(row_distances)

100%|██████████| 5000/5000 [11:28<00:00,  7.26it/s]


In [22]:
#Calculate the distance for n_ingredients
from tqdm import tqdm
distances_n_ingredients = []
for i in tqdm(range(len(df_normalized))):
  row_distances = []
  for j in range(len(df_normalized)):
    distance = euclidean([df_normalized.loc[i,'n_ingredients']], [df_normalized.loc[j,'n_ingredients']])
    row_distances.append(distance)
  distances_n_ingredients.append(row_distances)

100%|██████████| 5000/5000 [11:23<00:00,  7.32it/s]


In [23]:
# # Same function as the top, just to save lines

# def calculate_distances(df, column):
#   distances = []
#   for i in tqdm(range(len(df))):
#     for j in range(len(df)):
#       distance = euclidean([df.loc[i,column]], [df.loc[j,column]])
#       row_distances.append(distance)
#     distances.append(row_distances)

#   return distances

In [24]:
# distances_minutes = calculate_distances(df_normalized, 'minutes')
# distances_n_steps = calculate_distances(df_normalized, 'n_steps')
# distances_n_ingredients = calculate_distances(df_normalized, 'n_ingredients')

In [25]:
len(distances_minutes) , len(distances_n_steps), len(distances_n_ingredients)

(5000, 5000, 5000)

In [30]:
#Combine the similarity and distance scores; weight ingredients more
#All weights must add up to 1
weight_ingredients = 0.50  #0.50 #0.55
weight_minutes = 0.20 #0.20 #0.15
weight_n_steps = 0.10 #0.10 #0.15
weight_n_ingredients = 0.20 #0.20 #0.15
combined_scores = [[weight_ingredients*ingredient_similarities[i,j] \
                    +(weight_minutes)*(1/(1 + distances_minutes[i][j]))\
                    +(weight_n_ingredients)*(1/(1 + distances_n_ingredients[i][j]))\
                    +(weight_n_steps)*(1/(1 + distances_n_steps[i][j]))
                    for j in range(len(df_normalized))]
                   for i in range(len(df_normalized))]

df_scores = pd.DataFrame(combined_scores, index=df_normalized['name'],columns=df_normalized['name'])
#Export the scores
import gzip
with gzip.open('df_scores.csv.gz', 'wb') as f:
    df_scores.to_csv(f, index=True)

df_scores

name,pick me up party chicken kabobs,say what banana sandwich,creamy vegan potato leek soup,free toe pie,it s too easy pork chops rice casserole,wild game moose sloppy joes,better than a can homemade sloppy joes,buffalo wild wings medium wing sauce copycat by todd,cheeseburger stuffed french bread,hershey s triple layer cheesecake,...,zucchini ribbons with basil butter,zucchini salad with tomatoes,zucchini soup,zucchini spread,zucchini wheat germ burgers,zucchini with pasta,zucchini feta casserole,zucchini parmesan pancakes,zuppa di pesce cioppino or fish stew,zwiebelfleisch onion beef
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pick me up party chicken kabobs,1.000000,0.255011,0.381335,0.320246,0.254803,0.340476,0.233538,0.472782,0.406050,0.182889,...,0.391648,0.473797,0.461721,0.508660,0.432005,0.614745,0.173404,0.462223,0.241466,0.379781
say what banana sandwich,0.255011,1.000000,0.216971,0.254521,0.230017,0.328910,0.132289,0.234828,0.208244,0.096637,...,0.260050,0.320183,0.241522,0.281890,0.242224,0.262093,0.128405,0.333392,0.140915,0.195355
creamy vegan potato leek soup,0.381335,0.216971,1.000000,0.381104,0.340476,0.412322,0.306484,0.337815,0.456632,0.132506,...,0.441808,0.306454,0.381979,0.441691,0.274979,0.412271,0.237909,0.350428,0.293014,0.354124
free toe pie,0.320246,0.254521,0.381104,1.000000,0.413765,0.366335,0.197709,0.335817,0.429677,0.171853,...,0.484247,0.274697,0.441356,0.330962,0.343574,0.334599,0.225553,0.358618,0.229838,0.270425
it s too easy pork chops rice casserole,0.254803,0.230017,0.340476,0.413765,1.000000,0.259892,0.238337,0.399474,0.321008,0.128556,...,0.398012,0.240423,0.287079,0.279518,0.306643,0.269156,0.307940,0.243220,0.333186,0.418206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zucchini with pasta,0.614745,0.262093,0.412271,0.334599,0.269156,0.368388,0.230621,0.429413,0.507825,0.176371,...,0.545576,0.578423,0.619051,0.719397,0.378543,1.000000,0.247039,0.487142,0.274238,0.304196
zucchini feta casserole,0.173404,0.128405,0.237909,0.225553,0.307940,0.148999,0.416856,0.216709,0.200450,0.228319,...,0.220999,0.271058,0.216115,0.242758,0.296152,0.247039,1.000000,0.249203,0.359933,0.260679
zucchini parmesan pancakes,0.462223,0.333392,0.350428,0.358618,0.243220,0.429786,0.229434,0.414550,0.417101,0.152114,...,0.490428,0.392906,0.478354,0.439495,0.425538,0.487142,0.249203,1.000000,0.212870,0.515889
zuppa di pesce cioppino or fish stew,0.241466,0.140915,0.293014,0.229838,0.333186,0.154953,0.327820,0.227676,0.259553,0.139265,...,0.289895,0.218095,0.259249,0.271004,0.238636,0.274238,0.359933,0.212870,1.000000,0.302337


In [31]:
#Select a recipe at random
recipe_name = df_normalized.loc[random.sample(range(len(df_normalized)),1),'name'].iloc[0]
recipe_name

#Recommend 5 similar recipes
similar_recipes_2 = df_scores[recipe_name].reset_index(drop=False).sort_values(by = recipe_name, ascending=False).iloc[1:5].name.tolist()
atts_recipe = df[df.name == recipe_name]
recommendations = [{'recipe':i,'minutes':df[df.name ==i].minutes.iloc[0],'n_steps':df[df.name==i].n_steps.iloc[0]
                    , 'n_ingredients':df[df.name==i].n_ingredients.iloc[0]} for i in similar_recipes_2]
recipe = {'recipe':recipe_name,'minutes':df[df.name ==recipe_name].minutes.iloc[0],'n_steps':df[df.name==recipe_name].n_steps.iloc[0]
                    , 'n_ingredients':df[df.name==recipe_name].n_ingredients.iloc[0]}

In [32]:
recipe

{'recipe': 'best backfin crab cakes',
 'minutes': 25,
 'n_steps': 8,
 'n_ingredients': 12}

In [33]:
recommendations

[{'recipe': 'classic old bay crab cakes',
  'minutes': 20,
  'n_steps': 5,
  'n_ingredients': 12},
 {'recipe': 'sweettreats ultimate chocolate chip cookies',
  'minutes': 25,
  'n_steps': 9,
  'n_ingredients': 12},
 {'recipe': 'california roll burgers with wasabi mayonnaise',
  'minutes': 20,
  'n_steps': 5,
  'n_ingredients': 12},
 {'recipe': 'supper easy turkey stroganoff',
  'minutes': 25,
  'n_steps': 10,
  'n_ingredients': 12}]