In [1]:
# By Matt Stirling
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval

pd.set_option('display.width', 1400)

In [2]:
# Load recipes into dataframe
converters = { k: literal_eval for k in ['tags', 'ingredients', 'steps', 'nutrition'] } # for evaluating strings as arrays (eg. tags)
df_recipes = pd.read_csv('dataset/RAW_recipes.csv', converters=converters)
df_recipes.set_index('id', inplace=True)
df_recipes['n_tags'] = df_recipes['tags'].apply( lambda tags: len(tags) ) # add n_tags column
df_recipes.head()

Unnamed: 0_level_0,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,n_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
137739,arriba baked winter squash mexican style,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,20
31490,a bit different breakfast pizza,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,20
112140,all in the kitchen chili,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,9
59389,alouette potatoes,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,30
44061,amish tomato ketchup for canning,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,21


In [4]:
# Load interactions (user ratings) into dataframe
df_interact = pd.read_csv('dataset/RAW_interactions.csv')
df_interact.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


#### **1. Initial Data Exploration:** Explore the distribution of recipes based on key features such as minutes, tags, n_steps, and n_ingredients, and visualize the distribution of recipes for each of these features. 

In [None]:
mins = sorted((df_recipes['minutes']), reverse=1)
print(f'{len(mins):_}')
print('Recipes with 0 minutes:', len([ x for x in mins if x == 0 ]))
th = 160
mins[:100]

In [None]:
# Visualize dist of recipe minutes
mins_fil = [ x for x in df_recipes['minutes'] if (0 < x and x < 60*4) ]
print(len(mins_fil))
plt.hist(mins_fil, bins=50)
plt.show()

In [None]:
# 
fig, ax = plt.subplots(3, 1, figsize=(6,10))
for i, key in enumerate(['n_tags', 'n_ingredients', 'n_steps']):
    ax[i].hist(df_recipes[key], bins=40)
    ax[i].set_title(key)
plt.show()

#### 2. **User Profile Generation:** Using the two datasets, RAW_recipes.csv and RAW_interactions.csv, create a new dataset named User_Data.csv, where each row corresponds to a user in the system. The columns should include rated_recipes (a list of all recipes rated by the user), ingredients (a list of all ingredients in the recipes rated by the user), and rating_list (the list of ratings given by the user). Based on this generated user profile, explore the distribution of users across key features such as the number of rated items, the total number of ingredients per user, and the average of recorded ratings. Visualize the distribution of users for each of these features. 

In [17]:
def create_userdata_dataframe(ratings):
    users = {}
    total_ratings = len(ratings)
    for i, row in enumerate(ratings.itertuples()):
        print('\rHandling interaction {:_} ({:.1f}%)'.format(i+1, (i+1)/total_ratings*100), end='')
        obj = users.get(row.user_id)
        if obj == None:
            obj = {'user_id': row.user_id, 'rated_recipes': [], 'ingredients': [], 'rating_list': []}
        obj['rated_recipes'].append(row.recipe_id)
        obj['rating_list'].append(row.rating)
        recipe = df_recipes.loc[row.recipe_id] # get recipe by its id
        obj['ingredients'].extend(recipe.ingredients)
        obj['ingredients'] = list(set(obj['ingredients']))
        users[row.user_id] = obj
        # if i >= 100_000: break
    print('\nDone.')
    df_userdata = pd.DataFrame(users.values())
    df_userdata.set_index('user_id', inplace=True)
    return df_userdata

In [24]:
# Create OR load userdata dataframe
userdata_fn = 'dataset/User_Data.csv'
if os.path.exists(userdata_fn):
    print('Loading df_userdata ...')
    converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
    df_userdata = pd.read_csv(userdata_fn, converters=converters)
else:
    print('Creating df_userdata ...')
    df_userdata = create_userdata_dataframe(df_interact)
    df_userdata.to_csv(userdata_fn)
df_userdata.head()

Loading df_userdata ...


Unnamed: 0,user_id,rated_recipes,ingredients,rating_list
0,38094,"[40893, 16954, 40753, 34513, 69545, 49064, 800...","[onion, vegetable oil cooking spray, chicken s...","[4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 5, ..."
1,1293707,"[40893, 134316, 39446, 253891, 204257, 99564, ...","[garlic clove, bean sprouts, onion, boneless p...","[5, 5, 5, 5, 0, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
2,8937,"[44394, 39230, 44793, 20128, 33990, 43762, 945...","[garlic clove, onion, chicken stock, white win...","[4, 4, 4, 5, 5, 4, 5, 4, 5, 4, 4, 4, 5, 4, 4, ..."
3,126440,"[85009, 379639, 379102, 45539, 53594, 210456, ...","[pineapple chunks in juice, pecorino cheese, f...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, ..."
4,57222,"[85009, 434181, 34233, 443705, 122826, 112378,...","[great northern bean, vegetable oil cooking sp...","[5, 5, 4, 5, 4, 5, 4, 4, 4, 4, 5, 0, 4, 5, 5, ..."


In [None]:
# Visualize distribution of user parameters


In [20]:
ratings_matrix = df_interact.pivot_table(index='user_id', columns='recipe_id', values='rating', fill_value=0)
ratings_matrix.head()

  num_cells = num_rows * num_columns


IndexError: index 942541416 is out of bounds for axis 0 with size 942387538

In [17]:
# pivot table testing
# Sample DataFrame
data = {
    'Recipe_ID': [1, 2, 1, 4, 2, 1],
    'User_ID': [101, 102, 101, 103, 102, 105],
    'Rating': [5, 4, 2, 5, 4, 2],
    'Ingredient': ['Sugar', 'Salt', 'Sugar', 'Flour', 'Salt', 'Butter']
}

df = pd.DataFrame(data)
print(df)

   Recipe_ID  User_ID  Rating Ingredient
0          1      101       5      Sugar
1          2      102       4       Salt
2          1      101       2      Sugar
3          4      103       5      Flour
4          2      102       4       Salt
5          1      105       2     Butter


In [18]:
pivot_df = df.pivot_table(
    index='Recipe_ID',   # Rows (unique recipe IDs)
    columns='User_ID',   # Columns (user IDs)
    values='Rating',      # Values to aggregate
    # aggfunc='mean',       # Aggregation function (mean rating)
    fill_value=0          # Fill missing values with 0
)
pivot_df

User_ID,101,102,103,105
Recipe_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.5,0.0,0.0,2.0
2,0.0,4.0,0.0,0.0
4,0.0,0.0,5.0,0.0
