In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval
import random

# Widen pandas' text display so wide frames are not wrapped early
pd.options.display.width = 1400

In [2]:
# Helper Functions

def format_text(text, line_width=120):
    """Greedily word-wrap ``text`` into lines of at most ``line_width`` characters.

    The text is split on whitespace and the words are re-joined with single
    spaces. A single word longer than ``line_width`` is kept intact on a line
    of its own.

    Args:
        text: The string to wrap.
        line_width: Maximum length of a line before a new one is started.

    Returns:
        The wrapped text, lines joined with ``'\\n'``. Empty or
        whitespace-only input yields an empty string.
    """
    lines = []
    line = ''
    for word in text.split():
        if not line:
            line = word
        elif len(line) + 1 + len(word) > line_width:
            lines.append(line)
            # Start the next line WITH the overflowing word; resetting to ''
            # here would silently drop that word from the output.
            line = word
        else:
            line += ' ' + word
    lines.append(line)
    return '\n'.join(lines)

In [3]:
# Load the recipes into a dataframe indexed by recipe id.
# The list-valued columns are stored as strings in the CSV, so parse them back into Python lists.
converters = dict.fromkeys(('tags', 'ingredients', 'steps', 'nutrition'), literal_eval)
df_recipes = pd.read_csv('dataset/RAW_recipes.csv', index_col='id', converters=converters)
df_recipes['n_tags'] = df_recipes['tags'].map(len)  # number of tags per recipe
df_recipes.head()

Unnamed: 0_level_0,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,n_tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
137739,arriba baked winter squash mexican style,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,20
31490,a bit different breakfast pizza,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,20
112140,all in the kitchen chili,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,9
59389,alouette potatoes,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,30
44061,amish tomato ketchup for canning,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,21


In [4]:
# Read the user/recipe interactions (ratings and reviews); the review column is read with str dtype
df_interact = pd.read_csv(
    'dataset/RAW_interactions.csv',
    dtype=dict(review=str),
)
df_interact.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


#### **1. Initial Data Exploration:** Explore the distribution of recipes based on key features such as minutes, tags, n_steps, and n_ingredients, and visualize the distribution of recipes for each of these features. 

In [None]:
# Recipe durations sorted from longest to shortest, to inspect the largest values
mins = sorted(df_recipes['minutes'], reverse=True)
print(f'{len(mins):_}')
print('Recipes with 0 minutes:', sum(1 for x in mins if x == 0))
th = 160  # NOTE(review): not referenced by any of the cells shown here
mins[:100]

In [None]:
# Histogram of recipe durations, keeping only recipes strictly between 0 minutes and 4 hours
mins_fil = [m for m in df_recipes['minutes'] if 0 < m < 60*4]
print(len(mins_fil))
plt.hist(mins_fil, bins=50)
plt.show()

In [None]:
# One histogram per count feature (tags, ingredients, steps), stacked vertically
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(6, 10))
for axis, key in zip(ax, ('n_tags', 'n_ingredients', 'n_steps')):
    axis.hist(df_recipes[key], bins=40)
    axis.set_title(key)
plt.show()

#### **2. User Profile Generation:** Using the two datasets, RAW_recipes.csv and RAW_interactions.csv, create a new dataset named User_Data.csv, where each row corresponds to a user in the system. The columns should include rated_recipes (a list of all recipes rated by the user), ingredients (a list of all ingredients in the recipes rated by the user), and rating_list (the list of ratings given by the user). Based on this generated user profile, explore the distribution of users across key features such as the number of rated items, the total number of ingredients per user, and the average of recorded ratings. Visualize the distribution of users for each of these features.

In [3]:
def create_userdata_dataframe(ratings, recipes=None):
    """Aggregate interactions into one row per user.

    Args:
        ratings: DataFrame of interactions with ``user_id``, ``recipe_id`` and
            ``rating`` columns (one row per rating).
        recipes: DataFrame indexed by recipe id with an ``ingredients`` column
            holding a list of ingredients per recipe. Defaults to the
            notebook-level ``df_recipes``.

    Returns:
        DataFrame indexed by ``user_id`` (sorted ascending) with the columns
        ``rated_recipes`` (recipe ids in interaction order), ``rating_list``
        (ratings in the same order) and ``ingredients`` (sorted list of the
        distinct ingredients of all recipes the user rated).

    Raises:
        KeyError: If an interaction references a recipe id that is not in
            ``recipes``.
    """
    if recipes is None:
        recipes = df_recipes  # fall back to the recipes dataframe loaded by the notebook
    # Plain dict lookup per interaction instead of a DataFrame row lookup.
    ingredients_by_recipe = recipes['ingredients'].to_dict()

    users = {}
    total_ratings = len(ratings)
    for i, row in enumerate(ratings.itertuples(), start=1):
        # Throttle the progress line; printing on every row floods the cell output.
        if i % 10_000 == 0 or i == total_ratings:
            print('\rHandling interaction {:_} ({:.1f}%)'.format(i, i / total_ratings * 100), end='')
        obj = users.get(row.user_id)
        if obj is None:
            obj = {'user_id': row.user_id, 'rated_recipes': [], 'rating_list': [], 'ingredients': set()}
            users[row.user_id] = obj
        obj['rated_recipes'].append(row.recipe_id)
        obj['rating_list'].append(row.rating)
        # Accumulate into a set; it is converted to a list only once, after the loop.
        obj['ingredients'].update(ingredients_by_recipe[row.recipe_id])
    print('\nDone.')

    for obj in users.values():
        # Sorted so the stored list is deterministic across runs.
        obj['ingredients'] = sorted(obj['ingredients'])

    # Explicit columns keep the frame well-formed even when there are no interactions.
    columns = ['user_id', 'rated_recipes', 'rating_list', 'ingredients']
    df_userdata = pd.DataFrame(list(users.values()), columns=columns)
    return df_userdata.set_index('user_id').sort_index()

In [5]:
# Build the per-user dataframe once and cache it as CSV; later runs reload the cached file
userdata_fn = 'dataset/User_Data.csv'
if os.path.exists(userdata_fn):
    print('Loading df_userdata ...')
    # List-valued columns are stored as strings in the CSV; parse them back into lists
    converters = dict.fromkeys(('rated_recipes', 'ingredients', 'rating_list'), literal_eval)
    df_userdata = pd.read_csv(userdata_fn, index_col='user_id', converters=converters)
else:
    print('Creating df_userdata ...')
    df_userdata = create_userdata_dataframe(df_interact)
    df_userdata.to_csv(userdata_fn)
df_userdata.head()

Loading df_userdata ...


Unnamed: 0_level_0,rated_recipes,ingredients,rating_list
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
38094,"[40893, 16954, 40753, 34513, 69545, 49064, 800...","[onion, vegetable oil cooking spray, chicken s...","[4, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 4, 5, 5, ..."
1293707,"[40893, 134316, 39446, 253891, 204257, 99564, ...","[garlic clove, bean sprouts, onion, boneless p...","[5, 5, 5, 5, 0, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
8937,"[44394, 39230, 44793, 20128, 33990, 43762, 945...","[garlic clove, onion, chicken stock, white win...","[4, 4, 4, 5, 5, 4, 5, 4, 5, 4, 4, 4, 5, 4, 4, ..."
126440,"[85009, 379639, 379102, 45539, 53594, 210456, ...","[pineapple chunks in juice, pecorino cheese, f...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, ..."
57222,"[85009, 434181, 34233, 443705, 122826, 112378,...","[great northern bean, vegetable oil cooking sp...","[5, 5, 4, 5, 4, 5, 4, 4, 4, 4, 5, 0, 4, 5, 5, ..."


In [None]:
# TODO: Visualize distribution of user parameters (number of rated items, total ingredients per user, average rating) - not implemented yet


#### TESTING

In [None]:
# User x recipe table of ratings; (user, recipe) pairs without a rating are filled with 0.
# NOTE(review): this is a dense pivot over all interactions - presumably very large on the
# full dataset; confirm it fits in memory before running.
ratings_matrix = pd.pivot_table(
    df_interact,
    values='rating',
    index='user_id',
    columns='recipe_id',
    fill_value=0,
)
ratings_matrix.head()

In [None]:
# Small hand-made interactions table for experimenting with pivot_table
data = dict(
    Recipe_ID=[1, 2, 1, 4, 2, 1],
    User_ID=[101, 102, 101, 103, 102, 105],
    Rating=[5, 4, 2, 5, 4, 2],
    Ingredient=['Sugar', 'Salt', 'Sugar', 'Flour', 'Salt', 'Butter'],
)

df = pd.DataFrame(data)
print(df)

In [None]:
# Recipe x user table of ratings for the toy data.
# Repeated (recipe, user) pairs are averaged ('mean' is pivot_table's default aggregation);
# pairs without a rating are filled with 0.
pivot_df = pd.pivot_table(
    df,
    values='Rating',
    index='Recipe_ID',
    columns='User_ID',
    aggfunc='mean',
    fill_value=0,
)
pivot_df

#### **5. Tag- and ingredient-based Recipe Similarity Calculation:** Propose a method for calculating the similarity between different recipes, such as TF-IDF, Jaccard, Levenshtein Distance, Semantic Similarity, or Doc2Vec, by considering recipes' ingredients and tags. Next, calculate the recipe similarities based on their tags and ingredients using the selected method.

In [66]:
# Functions for performing jaccard similarity between recipes
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two collections.

    Args:
        s1: Iterable of hashable items (list, tuple, set, ...). Duplicates are ignored.
        s2: Iterable of hashable items.

    Returns:
        A float in [0, 1]. Two empty collections have similarity 0.0 (instead
        of raising ZeroDivisionError), so items without any features never
        count as a match.
    """
    set1, set2 = set(s1), set(s2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Jaccard similarity of one target set against every set of a collection
def jaccard_similarity_array(target_set, sets):
    """Return a list with the Jaccard similarity between ``target_set`` and each entry of ``sets``, in order."""
    similarities = []
    for candidate in sets:
        similarities.append(jaccard_similarity(target_set, candidate))
    return similarities

def get_similar_items_tags(tags_corpus, ingredients_corpus, target_i, top_n=5):
    """Rank items by the mean of their tag and ingredient Jaccard similarity to a target.

    Args:
        tags_corpus: Sequence with one collection of tags per item.
        ingredients_corpus: Sequence with one collection of ingredients per
            item, aligned with ``tags_corpus``.
        target_i: Position of the target item in both corpora.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` ``(index, similarity)`` tuples sorted by descending
        similarity. The target itself is never part of the result.
    """
    if target_i < 0:
        target_i += len(tags_corpus)  # normalise so the self-exclusion below also works for negative positions
    sims_tags = jaccard_similarity_array(tags_corpus[target_i], tags_corpus)
    sims_ingredients = jaccard_similarity_array(ingredients_corpus[target_i], ingredients_corpus)
    # Exclude the target by position. Dropping "whatever sorts first" is wrong when another
    # item ties with the target (e.g. a duplicate recipe at a lower index): the stable sort
    # keeps that item in front, so the duplicate would be dropped and the target returned.
    sims_items = [
        (i, (sim_tags + sim_ingredients) / 2)
        for i, (sim_tags, sim_ingredients) in enumerate(zip(sims_tags, sims_ingredients))
        if i != target_i
    ]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    return sims_items[:top_n]

In [56]:
# Build the tag and ingredient corpora: one list per recipe, in dataframe order
n = None  # number of leading recipes to use; None loads the whole dataframe
tags_corpus = df_recipes['tags'].head(n).tolist()
ingredients_corpus = df_recipes['ingredients'].head(n).tolist()

In [71]:
# Print the target recipe and its top matches by averaged tag/ingredient Jaccard similarity
target_i = 425
target_recipe = df_recipes.iloc[target_i]
top_results = get_similar_items_tags(tags_corpus, ingredients_corpus, target_i, top_n=5)
print('TARGET RECIPE: "{}" (index: {:_}):'.format(
    target_recipe['name'].replace('  ', ' - ').title(),
    target_i,
))
print('      TAGS: {}'.format(', '.join(sorted(tags_corpus[target_i]))))
print('      INGS: {}'.format(', '.join(sorted(ingredients_corpus[target_i]))))
print('\nRECOMMENDATIONS:')
for rank, (i, sim) in enumerate(top_results, start=1):
    recipe = df_recipes.iloc[i]
    recipe_title = recipe['name'].replace('  ', ' - ').title()
    print('\n  {:>2}: SIM: {:.3f}   NAME: "{}"  (index: {:_})'.format(rank, sim, recipe_title, i))
    print('      TAGS: {}'.format(', '.join(sorted(tags_corpus[i]))))
    print('      INGS: {}'.format(', '.join(sorted(ingredients_corpus[i]))))

TARGET RECIPE: "Indian - Macaroni And Cheese" (index: 425):
      TAGS: 30-minutes-or-less, 5-ingredients-or-less, american, beginner-cook, cheese, course, cuisine, dietary, easy, eggs-dairy, elbow-macaroni, high-calcium, high-in-something, inexpensive, main-dish, main-ingredient, midwestern, north-american, one-dish-meal, pasta, pasta-rice-and-grains, preparation, side-dishes, time-to-make
      INGS: cheese, elbow macaroni, milk, salt and pepper

RECOMMENDATIONS:

   1: SIM: 0.620   NAME: "Baked Mac And Cheese"  (index: 14_510)
      TAGS: 4-hours-or-less, cheese, course, eggs-dairy, elbow-macaroni, main-dish, main-ingredient, pasta, pasta-rice-and-grains, preparation, side-dishes, time-to-make
      INGS: butter, cheese, elbow macaroni, milk, salt and pepper

   2: SIM: 0.611   NAME: "Super Creamy And Cheesy Mac And Cheese"  (index: 203_923)
      TAGS: 15-minutes-or-less, american, cheese, course, cuisine, dietary, easy, eggs-dairy, elbow-macaroni, lunch, main-ingredient, north-ame

#### **6. Description-based Recipe Similarity Calculation:** Repeat Task 5 for recipe similarity calculation based on the descriptions of each recipe. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Shared NLTK resources used by the text preprocessing below
STOPWORDS = list(nltk.corpus.stopwords.words('english'))  # English stopword list
STEMMER = nltk.stem.SnowballStemmer('english')            # Snowball stemmer for English
LEMMATIZER = nltk.stem.wordnet.WordNetLemmatizer()        # WordNet lemmatizer

In [6]:
# STOPWORDS, STEMMER and LEMMATIZER are defined once, in the cell with the NLTK imports above.
# This cell used to repeat the exact same three definitions; the duplicates were removed so
# there is a single place to change them.

def text_preprocessor(document):
    """Normalise a document into a space-separated string of processed tokens.

    The document is lower-cased, split into sentences and then words. Only
    purely alphabetic words that are not in STOPWORDS are kept; each kept word
    is stemmed with STEMMER and then lemmatized as a verb with LEMMATIZER.
    """
    processed = []
    for sentence in sent_tokenize(document.lower()):
        for word in word_tokenize(sentence):
            if not word.isalpha() or word in STOPWORDS:
                continue
            processed.append(LEMMATIZER.lemmatize(STEMMER.stem(word), pos="v"))
    return ' '.join(processed)

In [7]:
def get_similar_items_TFIDF(matrix, target_i, top_n=10):
    """Rank the rows of a TF-IDF matrix by cosine similarity to one target row.

    Args:
        matrix: Document-term matrix with one row per item, as produced by
            ``TfidfVectorizer.fit_transform``.
        target_i: Row position of the target item.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` ``(row_index, similarity)`` tuples sorted by descending
        cosine similarity. The target row itself is never part of the result.
    """
    cosine_sims = cosine_similarity(matrix[target_i], matrix)[0]
    if target_i < 0:
        target_i += len(cosine_sims)  # normalise so the self-exclusion below also works for negative positions
    # Exclude the target by position rather than dropping the first sorted entry: when the
    # target ties with other rows (identical texts, or an all-zero vector for an empty text,
    # where every similarity is 0) it does not necessarily sort first.
    sims_items = [(i, sim) for i, sim in enumerate(cosine_sims) if i != target_i]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    return sims_items[:top_n]

In [8]:
# Fit a TF-IDF model (unigrams and bigrams) on the recipe descriptions;
# recipes without a description contribute an empty string
descriptions = df_recipes['description'].fillna('').tolist()
desc_tfidf = TfidfVectorizer(ngram_range=(1, 2), preprocessor=text_preprocessor)
desc_tfidf_matrix = desc_tfidf.fit_transform(descriptions)

In [9]:
# Print the target recipe and the recipes whose descriptions are most similar to it (TF-IDF cosine)
target_i = 425
target_recipe = df_recipes.iloc[target_i]
top_results = get_similar_items_TFIDF(desc_tfidf_matrix, target_i, top_n=5)
print('TARGET RECIPE:')
print('"{}" (index: {:_}):'.format(
    target_recipe['name'].replace('  ', ' - ').title(),
    target_i,
))
print(format_text(target_recipe['description']))
print('\nRECOMMENDATIONS:')
for rank, (i, sim) in enumerate(top_results, start=1):
    recipe = df_recipes.iloc[i]
    recipe_title = recipe['name'].replace('  ', ' - ').title()
    print('\n  {:>2}: SIM: {:.3f}   NAME: "{}"  (index: {:_})'.format(rank, sim, recipe_title, i))
    print(format_text(recipe['description']))

TARGET RECIPE:
"Indian - Macaroni And Cheese" (index: 425):
for those of us who get gov't food. its is the easiest and simplest macaroni and cheese recipes.

RECOMMENDATIONS:

   1: SIM: 0.317   NAME: "Best Macaroni -  Cheese"  (index: 22_713)
delicious macaroni & cheese recipe i came up with!

   2: SIM: 0.239   NAME: "Jalapeno Pepper Mac - N - Cheese"  (index: 113_839)
macaroni & cheese recipe from rachael ray's magazine, august 2009.

   3: SIM: 0.237   NAME: "Mary S Macaroni And Cheese"  (index: 131_637)
creamy macaroni and cheese

   4: SIM: 0.237   NAME: "Leslie S Macaroni -  Cheese"  (index: 123_345)
macaroni & cheese - my favorite recipe

   5: SIM: 0.236   NAME: "Easy Mac And Cheese Lasagna"  (index: 77_038)
macaroni and cheese with a twist


#### **7. Review-based Recipe Similarity Calculation:** Repeat Task 5 for recipe similarity calculation based on the reviews of each recipe. 

In [None]:
# Combine recipe reviews into recipes dataframe
def get_recipe_reviews(id):
    """Return all review texts of recipe ``id`` from ``df_interact`` joined by newlines.

    Missing review texts are treated as empty strings; a recipe without any
    interaction yields an empty string.
    """
    is_target_recipe = df_interact['recipe_id'] == id
    review_texts = df_interact.loc[is_target_recipe, 'review'].fillna('')
    return '\n'.join(review_texts)

# Build (or load from the CSV cache) a dataframe indexed like df_recipes that holds, for every
# recipe, all of its review texts joined by newlines.
recipe_reviews_filename = 'dataset/Recipe_Reviews.csv'
if not os.path.exists(recipe_reviews_filename):
    print('Creating recipe reviews dataframe ...')
    recipes = df_recipes.head(None).copy()
    # Join the reviews of every recipe in a single groupby pass. Filtering df_interact once
    # per recipe id scans all interactions for each recipe. Reviews keep their order of
    # appearance in df_interact and missing review texts count as empty strings.
    reviews_by_recipe = (
        df_interact['review']
        .fillna('')
        .groupby(df_interact['recipe_id'])
        .agg('\n'.join)
    )
    df_recipe_reviews = pd.DataFrame(index=recipes.index)
    # Recipes without any interaction get an empty review text.
    df_recipe_reviews['reviews'] = reviews_by_recipe.reindex(recipes.index, fill_value='').to_numpy()
    df_recipe_reviews.to_csv(recipe_reviews_filename)
else:
    print('Loading recipe reviews dataframe ...')
    df_recipe_reviews = pd.read_csv(recipe_reviews_filename, index_col='id')

# df_recipe_reviews.head()

Loading recipe reviews dataframe ...


In [15]:
# Fit a unigram TF-IDF model on the combined reviews of each recipe;
# recipes without review text contribute an empty string
reviews = df_recipe_reviews['reviews'].head(None).fillna('').tolist()
reviews_tfidf = TfidfVectorizer(ngram_range=(1, 1), preprocessor=text_preprocessor)
reviews_tfidf_matrix = reviews_tfidf.fit_transform(reviews)

In [16]:
# Print the target recipe and the recipes whose combined reviews are most similar to it (TF-IDF cosine)
# random.seed(1); target_i = random.randint(0, len(df_recipes)-1) # get seeded random target_i
target_i = 425
target_recipe = df_recipes.iloc[target_i]
top_results = get_similar_items_TFIDF(reviews_tfidf_matrix, target_i, top_n=10)
print('TARGET RECIPE:')
print('"{}"   (index: {}):'.format(target_recipe['name'].replace('  ', ' - ').title(), target_i))
# print(format_text(df_recipe_reviews.iloc[target_i]['reviews'][:300]))
print('\nRECOMMENDATIONS:')
for j, (i, sim) in enumerate(top_results):
    # Only the recipe row is needed here. The loop must not rebind `reviews`: that name holds
    # the review corpus the TF-IDF model is fitted on.
    recipe = df_recipes.iloc[i]
    print('  {:>2}: SIM: {:.3f}   "{}"   (index: {})'.format( j+1, sim, recipe['name'].replace('  ', ' - ').title(), i))
    # print(format_text(df_recipe_reviews.iloc[i]['reviews'][:300]))

TARGET RECIPE:
"Indian - Macaroni And Cheese"   (index: 425):

RECOMMENDATIONS:
   1: SIM: 0.337   "Mom S Easy Crock Pot Scalloped Potatos"   (index: 138219)
   2: SIM: 0.337   "Downunder Cheese Puffs"   (index: 73340)
   3: SIM: 0.334   "Coconut Pecan Cake With Broiled Frosting"   (index: 55867)
   4: SIM: 0.318   "Mom S Easy Tuna Noodle Casserole"   (index: 138233)
   5: SIM: 0.307   "Iron Mike S White Sharp Cheddar N - Ham Macaroni And Cheese"   (index: 111999)
   6: SIM: 0.304   "Macaroni Bake"   (index: 128075)
   7: SIM: 0.295   "Mrs - B S Best Ever Macaroni And Cheese"   (index: 140076)
   8: SIM: 0.289   "Really Good Cheddar Soup"   (index: 171718)
   9: SIM: 0.286   "Irish Broccoli - Potato And Cheddar Chowder"   (index: 111762)
  10: SIM: 0.281   "Saucy Macaroni And Cheese"   (index: 180536)
