In [73]:
import pandas as pd

In [74]:
recipe_df = pd.read_csv('./data/RAW_recipes.csv')
inter_df = pd.read_csv('./data/RAW_interactions.csv')

In [75]:
print(len(recipe_df))
recipe_df.head()

231637


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [76]:
inter_df.head()
len(inter_df['user_id'].unique())

226570

In [77]:
recipe_df.dropna(inplace=True)
recipe_df.drop_duplicates(inplace=True)
inter_df.dropna(inplace=True)
inter_df.drop_duplicates(inplace=True)
inter_df.drop(columns=['review'], inplace=True)

## Filtering out users and recipes with less than n ratings/reviews

In [78]:
n = 10

# get users/recipes with n or more reviews
recipe_rating_count = inter_df[['recipe_id', 'rating']].groupby(by=['recipe_id']).count()
recipe_more_than_n_rating = recipe_rating_count[recipe_rating_count['rating'] >= n]

user_rating_count = inter_df[['user_id', 'rating']].groupby(by=['user_id']).count()
user_more_than_n_rating = user_rating_count[user_rating_count['rating'] >= n]

print(len(recipe_more_than_n_rating),len(user_more_than_n_rating))
user_more_than_n_rating


21393 12486


Unnamed: 0_level_0,rating
user_id,Unnamed: 1_level_1
1533,128
1535,794
1634,60
1676,31
1792,31
...,...
2001898367,11
2001987473,11
2002015205,10
2002084712,11


In [79]:
# Filter out recipes and users with less than n ratings
inter_df = inter_df[inter_df['recipe_id'].isin(list(recipe_more_than_n_rating.index.to_numpy()))]
inter_df = inter_df[inter_df['user_id'].isin(list(user_more_than_n_rating.index.to_numpy()))]
recipe_df = recipe_df[recipe_df['id'].isin(list(recipe_more_than_n_rating.index.to_numpy()))]
print(len(inter_df))
print(len(recipe_df))


401029
20929


In [80]:
inter_df.count()
print(len(inter_df['user_id'].unique()))
print(len(inter_df['recipe_id'].unique()))

12465
21304


## Map user/recipe ids
Creating new mappings for user and recipe ids so they start at 0


In [81]:
user_ids = inter_df['user_id'].unique()
user_id_map = {}
for i, uid in enumerate(user_ids):
    user_id_map[uid] = i

print(len(user_id_map))

12465


In [82]:
recipe_ids = inter_df['recipe_id'].unique()
recipe_id_map = {}
for i, rec_id in enumerate(recipe_ids):
    recipe_id_map[rec_id] = i

print(len(recipe_id_map))

21304


In [83]:
inter_df['user_id'] = inter_df['user_id'].apply(lambda x: user_id_map[x])

In [84]:
inter_df['recipe_id'] = inter_df['recipe_id'].apply(lambda x: recipe_id_map[x])

In [85]:
recipe_df = recipe_df[recipe_df['id'].isin(recipe_id_map)]
recipe_df['id'] = recipe_df['id'].apply(lambda x: recipe_id_map[x])

In [86]:
print(len(recipe_df))

20841


## Creating ingredient ids

In [87]:
from ast import literal_eval

ingredients_list = recipe_df['ingredients']
print(len(ingredients_list))


unique_ingredients = set()

for ingredients in ingredients_list:
    ingredients = literal_eval(ingredients)
    for ingredient in ingredients:
        unique_ingredients.add(ingredient)

print(len(unique_ingredients))

20841
6233


In [88]:
unique_ingredients_map = {}

for i, ingredient in enumerate(unique_ingredients):
    unique_ingredients_map[ingredient] = i


In [89]:
def map_ingredients(ingredient_list):
    ingredient_list = literal_eval(ingredient_list)
    for i in range(len(ingredient_list)):
        ingredient_list[i] = unique_ingredients_map[ingredient_list[i]] 

    return ingredient_list


recipe_df['ingredients'] = recipe_df['ingredients'].apply(map_ingredients)

## Create User Recipe rating matrix

In [90]:
user_recipe_matrix = pd.DataFrame(index=inter_df['user_id'].unique(), columns=inter_df['recipe_id'].unique(), dtype='Int32').fillna(0)
user_recipe_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21294,21295,21296,21297,21298,21299,21300,21301,21302,21303
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
inter_df.head()

Unnamed: 0,user_id,recipe_id,date,rating
31,0,0,2006-11-11,5
43,1,0,2006-02-13,5
44,2,0,2006-03-21,5
46,3,0,2008-02-01,4
47,4,0,2008-03-07,4


In [92]:
for i, row in inter_df.iterrows():
    # datafram format ['col']['row'] = val -> ['recipe_id']['user_id'] = rating
    user_id = row['user_id']
    recipe_id = row['recipe_id']
    rating = row['rating']
    user_recipe_matrix[recipe_id][user_id] = rating

user_recipe_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21294,21295,21296,21297,21298,21299,21300,21301,21302,21303
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12461,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Testing Cosine similarity

In [93]:
# row = user, col = recipes
user_recipe_matrix = user_recipe_matrix.to_numpy()
user_recipe_matrix

array([[5, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [94]:
import numpy as np
np.count_nonzero(user_recipe_matrix)

391775

In [95]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    return dot(a, b)/ (norm(a) * norm(b))

def cosine_similarity_to_all_other_user(user_index):
    # structure [[other_user_index, similarity_score],
    #            [other_user_index1, similarity_score1],
    #            ]
    user_similarities = {}

    for other_user_id in range(user_recipe_matrix.shape[0]):
        if other_user_id == user_index:
            continue

        curr_user = user_recipe_matrix[user_index]
        other_user = user_recipe_matrix[other_user_id]

        similarity = cosine_similarity(curr_user, other_user)
        # user_similarities.append([other_user_id, similarity])
        # user_similarities.append(similarity)
        user_similarities[other_user_id] = similarity
    
    # return np.array(user_similarities)
    return user_similarities


In [96]:
x = cosine_similarity_to_all_other_user(10)

  return dot(a, b)/ (norm(a) * norm(b))


In [97]:
x = dict(sorted(x.items(), key=lambda item:-item[1]))
x

{10202: 0.17914819047411254,
 2938: 0.17363099909535037,
 8276: 0.1634082913836501,
 12333: 0.16282779828298147,
 7435: 0.15839627811134266,
 11127: 0.15580369260685323,
 10201: 0.1556824288505596,
 6083: 0.1509788742492445,
 7665: 0.1461568190483739,
 5739: 0.14392080041446434,
 10430: 0.1353701852113805,
 12180: 0.1315561729939764,
 504: 0.1300594670289064,
 7016: 0.12506597015676363,
 6882: 0.12344635402297939,
 5994: 0.12333391218185287,
 9341: 0.1222835681031486,
 4468: 0.12179734920697825,
 2315: 0.12084206113070795,
 4039: 0.11832662465040823,
 6890: 0.11334219326106228,
 2923: 0.11254995512884293,
 6915: 0.11008410115182289,
 6819: 0.10907051751454298,
 12014: 0.10876635272089798,
 5974: 0.10808442529177921,
 7290: 0.10733568695883607,
 6634: 0.10623317546007038,
 9380: 0.1057355934484557,
 293: 0.10501183747024488,
 6: 0.10492589602244963,
 1875: 0.10334847786576035,
 7576: 0.10331568444025897,
 5531: 0.10233024142916182,
 3510: 0.09803097140587666,
 7307: 0.09691624313519673,

In [98]:
user_10202 = inter_df[inter_df['user_id'] == 10202]

In [99]:
juser_10 = inter_df[inter_df['user_id'] == 10]

In [153]:
recipe_name_map = {}

for i, row in recipe_df[['name', 'id']].iterrows():
    recipe_name_map[row['id']] = row['name']

recipe_name_map = dict(sorted(recipe_name_map.items(), key=lambda item: item[0]))
ingredients_df = pd.DataFrame(index=recipe_name_map.keys(), columns=['name'], data=recipe_name_map.values())
ingredients_df

Unnamed: 0,name
0,potato crab chowder
1,tom s vanilla frozen yogurt
2,st louis style gooey butter cake
3,albers sweet corn muffins
4,jungle gems snack mix
...,...
21299,easy microwave hot fudge topping
21300,one loaf white bread
21301,salmon cakes with lemon aioli
21302,savory roasted pepper bread for the bread machine


In [101]:
def list_out_recipe_names(recipe_ids):
    for rec_id in recipe_ids:
        print(recipe_name_map[rec_id])  


In [102]:
list_out_recipe_names(list(user_10['recipe_id']))

potato crab chowder
garlic  rosemary and olive oil roasted potatoes
swedish apple pie
baked garlic rice pilaf
scallion sesame turkey burger
the camellia grill s pecan pie
jambalaya for the crock pot


In [103]:
list_out_recipe_names(list(user_10202['recipe_id']))

swedish apple pie
cream cheese brownies
delicious chicken pot pie
fried bananas
honey garlic grilled chicken
the classic french bistro sandwich   croque monsieur
incredibly delicious cheese garlic bread spread
pork chops with mustard   sour cream sauce
traditional irish shepherd s pie


In [104]:
recipe_df['tags'][12]

"['weeknight', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'beans', 'american', '1-day-or-more', 'oven', 'potluck', 'to-go', 'equipment']"

In [107]:
print(len(recipe_df))
print(len(recipe_name_map))

20841
20841


## Playing with data

Extracting unique ingredients/tags
Extracting country tags

In [114]:
import json

len(unique_ingredients_map)
json_data = json.dumps(unique_ingredients_map)

with open("unique_ingredients.json", "w") as f:
    f.write(json_data)

In [117]:
# getting all unique tags
unique_tags = set()
tags_list = recipe_df['tags']

for tags in tags_list:
    tags = literal_eval(tags)
    for tag in tags:
        unique_tags.add(tag)

print(len(unique_tags))

483


In [159]:
sorted(list(unique_tags))

['',
 '1-day-or-more',
 '15-minutes-or-less',
 '3-steps-or-less',
 '30-minutes-or-less',
 '4-hours-or-less',
 '5-ingredients-or-less',
 '60-minutes-or-less',
 'a1-sauce',
 'african',
 'american',
 'amish-mennonite',
 'appetizers',
 'apples',
 'april-fools-day',
 'argentine',
 'artichoke',
 'asian',
 'asparagus',
 'australian',
 'austrian',
 'avocado',
 'bacon',
 'baja',
 'baking',
 'bananas',
 'bar-cookies',
 'barbecue',
 'bass',
 'beans',
 'beef',
 'beef-liver',
 'beef-organ-meats',
 'beef-ribs',
 'beef-sausage',
 'beginner-cook',
 'beijing',
 'belgian',
 'berries',
 'beverages',
 'birthday',
 'biscotti',
 'bisques-cream-soups',
 'black-beans',
 'blueberries',
 'bok-choys',
 'brazilian',
 'bread-machine',
 'bread-pudding',
 'breads',
 'breakfast',
 'brewing',
 'british-columbian',
 'broccoli',
 'broil',
 'brown-bag',
 'brown-rice',
 'brownies',
 'brunch',
 'burgers',
 'cajun',
 'cake-fillings-and-frostings',
 'cakes',
 'californian',
 'camping',
 'canadian',
 'candy',
 'canning',
 'ca

In [160]:
unique_ingredients_map

{'lemon peel': 0,
 'candied cherries': 1,
 'sumaq': 2,
 'cuban bread': 3,
 'fresh snow peas': 4,
 'porridge oats': 5,
 'mozzarella cheese': 6,
 'gruyere': 7,
 'vegetarian chicken broth': 8,
 'snickers miniature candy bars': 9,
 'pork chops': 10,
 'betty crocker fudge brownie mix': 11,
 'pillsbury golden layers refrigerated flaky original biscuits': 12,
 'uncle bens wild rice': 13,
 'cream soup': 14,
 'triple sec': 15,
 'dill pickle relish': 16,
 'whole green onions': 17,
 'cherry chips': 18,
 'lime jell-o gelatin': 19,
 'tomato soup mix': 20,
 'ghirardelli double chocolate chips': 21,
 'french roll': 22,
 'boneless pork chop': 23,
 'low-fat cultured buttermilk': 24,
 'tabasco brand chipotle pepper sauce': 25,
 'button mushrooms': 26,
 'chunk pineapple': 27,
 'coffee creamer': 28,
 'chicken bouillon cubes': 29,
 'dried garbanzo beans': 30,
 'pumpkin pie': 31,
 'dried banana pieces': 32,
 'chicken thigh': 33,
 'lemon juice': 34,
 'lime rind': 35,
 'golden rum': 36,
 'serrano chili': 37,


In [175]:
with open('region_tags.txt', 'r') as f:
    data = f.read().rstrip()
    region_tags = literal_eval(data)

print(type(region_tags))
# region_tags.add('')

# region_tags_map = {}
# for i, region in enumerate(sorted(list(region_tags))):
#     region_tags_map[region] = i

# regions_df = pd.DataFrame(index=region_tags_map.values(), data=region_tags_map.keys())
# regions_df


<class 'set'>


In [176]:
def extract_country(tags):
    tags = literal_eval(tags)
    for tag in tags:
        if tag in region_tags:
            return tag
    return ''


recipe_df['region'] = recipe_df['tags'].apply(extract_country)
recipe_df

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,region
12,better then bush s baked beans,19081,2970,85627,2003-07-26,"['weeknight', 'time-to-make', 'course', 'main-...","[462.4, 28.0, 214.0, 69.0, 14.0, 29.0, 23.0]",9,['in a very large sauce pan cover the beans an...,i'd have to say that this is a labor of love d...,"[912, 29, 2656, 6058, 5276, 2422, 887, 3345, 9...",13,north-american
15,chicken lickin good pork chops,11229,500,14664,2003-06-06,"['weeknight', 'time-to-make', 'course', 'main-...","[105.7, 8.0, 0.0, 26.0, 5.0, 4.0, 3.0]",5,"['dredge pork chops in mixture of flour , salt...",here's and old standby i enjoy from time to ti...,"[301, 4599, 5781, 5191, 887, 404, 3089]",7,
16,chile rellenos,19851,45,52268,2002-10-14,"['60-minutes-or-less', 'time-to-make', 'course...","[94.0, 10.0, 0.0, 11.0, 11.0, 21.0, 0.0]",9,"['drain green chiles', 'sprinkle cornstarch on...",a favorite from a local restaurant no longer i...,"[3682, 4439, 954, 5276, 404]",5,north-american
17,chinese candy,19611,15,35268,2002-03-29,"['15-minutes-or-less', 'time-to-make', 'course...","[232.7, 21.0, 77.0, 4.0, 6.0, 38.0, 8.0]",4,['melt butterscotch chips in heavy saucepan ov...,"a little different, and oh so good. i include ...","[2576, 2114, 5252]",3,
33,grilled venison burgers,9072,26,68357,2003-02-15,"['30-minutes-or-less', 'time-to-make', 'course...","[190.9, 10.0, 10.0, 10.0, 45.0, 15.0, 2.0]",13,"['in bowl , mix dry ingredients', 'add venison...",delicious venison burgers with that,"[4355, 793, 2444, 2377, 1529, 2422, 5781, 958,...",10,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
231542,zucchini pineapple loaf cake,5169,70,89831,2003-09-15,"['weeknight', 'time-to-make', 'course', 'main-...","[3323.6, 237.0, 1118.0, 129.0, 91.0, 109.0, 15...",11,"['set oven to 350 degrees', 'grease two 9 x 5-...",i got this recipe from a local paper a long ti...,"[4656, 404, 1081, 3087, 1953, 2362, 4599, 5130...",14,
231566,zucotte,10559,65,65056,2004-08-03,"['time-to-make', 'course', 'main-ingredient', ...","[145.7, 13.0, 10.0, 2.0, 3.0, 27.0, 5.0]",10,"['in a large , heavy saucpan , melt 1 tbs of t...",a garlicky braised winter squash dish that cou...,"[5922, 5884, 5518, 3292, 3035, 4906]",6,
231587,zuppa di pesce cioppino or fish stew,15851,60,58104,2005-04-12,"['60-minutes-or-less', 'time-to-make', 'course...","[160.3, 3.0, 17.0, 31.0, 37.0, 2.0, 4.0]",9,['in a pot add 5 cups water and shrimp shells ...,whatever you may call it this italian named fi...,"[2065, 3245, 3321, 1079, 3257, 4964, 325, 4435...",29,european
231600,zuppa toscana soup olive garden clone,11860,60,346694,2007-01-30,"['60-minutes-or-less', 'time-to-make', 'course...","[432.8, 32.0, 29.0, 39.0, 42.0, 39.0, 15.0]",10,['bring chicken stock and water to a light boi...,i have tried quite a few different recipes tha...,"[1855, 2365, 3953, 2688, 430, 3321, 2065, 5298...",14,


In [184]:
num_user_reviews = inter_df.groupby(['recipe_id'])['user_id'].count()
user_rating_sum_total = inter_df.groupby(['recipe_id'])['rating'].sum()

num_user_reviews


recipe_id
0        11
1         8
2         7
3         7
4         8
         ..
21299    11
21300     5
21301     6
21302     8
21303    17
Name: user_id, Length: 21304, dtype: int64

In [188]:
recipe_df['steps'].apply(lambda x: len(x)).max()

6572

In [189]:
recipe_df['description'].apply(lambda x: len(x)).max()

3039

In [194]:
test = set()
ingredient_list = recipe_df['ingredients']

for ingredients in ingredient_list:
    # ingredients = literal_eval(ingredients)
    for ingredient in ingredients:
        test.add(ingredient)

print(len(test))
test


6233


{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
len()