In [1]:
import pandas as pd

In [2]:
# Load the raw recipe and interaction tables from the local data directory.
recipe_df, inter_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/RAW_recipes.csv', './data/RAW_interactions.csv')
)

In [3]:
# Report how many recipes were loaded, then preview the first rows.
print(recipe_df.shape[0])
recipe_df.head()

231637


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [4]:
# Preview the interactions. display() is needed here because a notebook only
# renders the last expression of a cell automatically; a bare head() on an
# earlier line is silently discarded.
display(inter_df.head())
# Number of distinct users (dropna=False mirrors len(Series.unique())).
inter_df['user_id'].nunique(dropna=False)

226570

In [5]:
# Drop incomplete rows and exact duplicates from both tables. The frames are
# reassigned instead of mutated with inplace=True so each step is a plain
# expression.
recipe_df = recipe_df.dropna().drop_duplicates()

# NOTE(review): dropna() runs while 'review' is still present, so interactions
# with a missing review text are discarded too, even though that column is
# removed right afterwards -- confirm this is intended.
inter_df = (
    inter_df
    .dropna()
    .drop_duplicates()
    # errors='ignore' keeps the cell from raising KeyError when it is re-run
    # after 'review' has already been removed.
    .drop(columns=['review'], errors='ignore')
)

## Filtering out users and recipes with fewer than n ratings/reviews

In [6]:
# Minimum number of ratings a recipe / user needs in order to be kept.
n = 10

# Count ratings per recipe and keep the recipes that reach the threshold.
recipe_rating_count = inter_df.groupby('recipe_id')[['rating']].count()
recipe_more_than_n_rating = recipe_rating_count.loc[recipe_rating_count['rating'].ge(n)]

# Same computation per user.
user_rating_count = inter_df.groupby('user_id')[['rating']].count()
user_more_than_n_rating = user_rating_count.loc[user_rating_count['rating'].ge(n)]

print(len(recipe_more_than_n_rating), len(user_more_than_n_rating))
user_more_than_n_rating


21393 12486


Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
1533,128
1535,794
1634,60
1676,31
1792,31
...,...
2001898367,11
2001987473,11
2002015205,10
2002084712,11


In [7]:
# Keep only interactions whose recipe AND user both have at least n ratings.
# isin() accepts the Index directly, so no list()/to_numpy() round trip is
# needed.
kept_recipe_ids = recipe_more_than_n_rating.index
kept_user_ids = user_more_than_n_rating.index

# .copy() turns the filtered results into independent frames, so the column
# assignments made on them later do not write into a slice of the original
# (SettingWithCopyWarning).
inter_df = inter_df[
    inter_df['recipe_id'].isin(kept_recipe_ids) & inter_df['user_id'].isin(kept_user_ids)
].copy()
recipe_df = recipe_df[recipe_df['id'].isin(kept_recipe_ids)].copy()
print(len(inter_df))
print(len(recipe_df))


401029
20929


In [8]:
# Per-column non-null counts. display() is required because only the last
# expression of a cell is rendered automatically, so a bare count() on the
# first line would be discarded.
display(inter_df.count())
# Distinct users / recipes left after filtering (dropna=False mirrors
# len(Series.unique())).
print(inter_df['user_id'].nunique(dropna=False))
print(inter_df['recipe_id'].nunique(dropna=False))

12465
21304


## Map user/recipe ids
Create new, contiguous ids for users and recipes so that both start at 0.


In [9]:
# Give every remaining user a dense id, numbered in order of first appearance.
user_ids = inter_df['user_id'].unique()
user_id_map = {raw_id: dense_id for dense_id, raw_id in enumerate(user_ids)}

print(len(user_id_map))

12465


In [10]:
# Give every remaining recipe a dense id, numbered in order of first appearance.
recipe_ids = inter_df['recipe_id'].unique()
recipe_id_map = {raw_id: dense_id for dense_id, raw_id in enumerate(recipe_ids)}

print(len(recipe_id_map))

21304


In [11]:
# Replace the raw user ids with their dense ids (KeyError on an unknown id).
inter_df['user_id'] = inter_df['user_id'].map(user_id_map.__getitem__)

In [12]:
# Replace the raw recipe ids with their dense ids (KeyError on an unknown id).
inter_df['recipe_id'] = inter_df['recipe_id'].map(recipe_id_map.__getitem__)

In [13]:
# Keep only the recipes that have an entry in recipe_id_map, then switch them
# to the dense recipe ids. The explicit .copy() makes the filtered result an
# independent frame, so the assignment below does not write into a slice of
# the previous frame (SettingWithCopyWarning).
recipe_df = recipe_df[recipe_df['id'].isin(list(recipe_id_map))].copy()
# Every remaining id is a key of recipe_id_map (guaranteed by the filter
# above), so the dict-based map cannot introduce missing values.
recipe_df['id'] = recipe_df['id'].map(recipe_id_map)

In [14]:
# Number of recipes left after the id remapping.
print(recipe_df.shape[0])

20841


## Creating ingredient ids

In [15]:
from ast import literal_eval

ingredients_list = recipe_df['ingredients']
print(len(ingredients_list))


# Each cell holds the string repr of a Python list; parse it and pool the
# ingredient names of all recipes into one set.
unique_ingredients = set()

for raw_ingredients in ingredients_list:
    unique_ingredients.update(literal_eval(raw_ingredients))

print(len(unique_ingredients))

20841
6233


In [16]:
# Give every ingredient a dense integer id. Iterating the set directly would
# make the ids depend on string hash randomisation, which differs between
# kernel starts, so the names are sorted first to get a reproducible
# assignment.
unique_ingredients_map = {
    ingredient: i for i, ingredient in enumerate(sorted(unique_ingredients))
}


In [17]:
def map_ingredients(ingredient_list, ingredient_map=None):
    """Translate a recipe's ingredient names into their integer ids.

    Parameters
    ----------
    ingredient_list : str or list
        Either the string repr of a Python list of ingredient names or an
        already-parsed list of names.
    ingredient_map : dict, optional
        Name -> id lookup. Defaults to the notebook-level
        ``unique_ingredients_map``.

    Returns
    -------
    list
        One id per ingredient, in the original order.

    Raises
    ------
    KeyError
        If an ingredient name is missing from the lookup.
    """
    if ingredient_map is None:
        ingredient_map = unique_ingredients_map
    # Only parse when given the raw string, so values that are already lists
    # are accepted instead of making literal_eval raise.
    if isinstance(ingredient_list, str):
        ingredient_list = literal_eval(ingredient_list)
    return [ingredient_map[name] for name in ingredient_list]


# Convert every recipe's ingredient names to ingredient ids.
recipe_df['ingredients'] = recipe_df['ingredients'].map(map_ingredients)

## Create User Recipe rating matrix

In [51]:
# Dense user x recipe frame: one row per user id, one column per recipe id,
# every cell initialised to 0.
matrix_rows = inter_df['user_id'].unique()
matrix_cols = inter_df['recipe_id'].unique()
user_recipe_matrix = pd.DataFrame(index=matrix_rows, columns=matrix_cols, dtype='Int32').fillna(0)
user_recipe_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21294,21295,21296,21297,21298,21299,21300,21301,21302,21303
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Preview the first interaction rows (columns: user_id, recipe_id, date, rating).
inter_df.head()

Unnamed: 0,user_id,recipe_id,date,rating
31,0,0,2006-11-11,5
43,1,0,2006-02-13,5
44,2,0,2006-03-21,5
46,3,0,2008-02-01,4
47,4,0,2008-03-07,4


In [52]:
# Fill the matrix from the interaction table in one vectorised step.
# A row-by-row iterrows() loop with chained indexing
# (matrix[recipe_id][user_id] = rating) is very slow and is not guaranteed to
# write into the original frame; under pandas Copy-on-Write it has no effect.

# If a user rated the same recipe more than once, keep the last rating, which
# is what a sequential row-by-row fill would end up storing.
latest_ratings = inter_df.drop_duplicates(subset=['user_id', 'recipe_id'], keep='last')

# Translate user/recipe labels into positional row/column offsets.
row_pos = user_recipe_matrix.index.get_indexer(latest_ratings['user_id'])
col_pos = user_recipe_matrix.columns.get_indexer(latest_ratings['recipe_id'])
# get_indexer returns -1 for unknown labels, which would silently address the
# last row/column, so fail loudly instead.
assert (row_pos >= 0).all(), "interaction references a user missing from the matrix"
assert (col_pos >= 0).all(), "interaction references a recipe missing from the matrix"

# Work on a plain writable 2-D array, then rebuild the frame with the same
# labels and the same nullable Int32 dtype as before.
rating_values = user_recipe_matrix.to_numpy(dtype='int32', copy=True)
rating_values[row_pos, col_pos] = latest_ratings['rating'].to_numpy()

user_recipe_matrix = pd.DataFrame(
    rating_values,
    index=user_recipe_matrix.index,
    columns=user_recipe_matrix.columns,
).astype('Int32')

user_recipe_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21294,21295,21296,21297,21298,21299,21300,21301,21302,21303
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
