In [None]:
import os
import time
import pickle
from ast import literal_eval
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
def get_text_from_recipe(recipe):
    """Build the text that gets embedded for one recipe.

    The result is the recipe description followed by a space and the
    space-joined tags and ingredients (tags first).

    Args:
        recipe: Mapping-like object (e.g. a DataFrame row) indexable by
            'tags', 'ingredients' and 'description'.

    Returns:
        str: "<description> <tag ...> <ingredient ...>". A field of the wrong
        type (e.g. NaN for a missing value) contributes nothing.
    """
    def _as_list(key):
        # Anything that is not a real list is treated as "no entries".
        value = recipe[key]
        return value if isinstance(value, list) else []

    words = _as_list('tags') + _as_list('ingredients')
    raw_description = recipe['description']
    if isinstance(raw_description, str):
        description = str(raw_description)
    else:
        description = ""
    return description + ' ' + ' '.join(words)

In [None]:
def get_embedding_DistilBERT(text, tokenizer, model):
    """Return a mean-pooled DistilBERT embedding for ``text``.

    Args:
        text: The string to embed.
        tokenizer: Callable tokenizer; it is invoked with
            ``return_tensors='pt', truncation=True, max_length=512`` and must
            return a mapping that can be splatted into ``model``.
        model: Callable model whose output exposes ``last_hidden_state``.

    Returns:
        torch.Tensor: ``last_hidden_state`` averaged over dim 1, detached from
        autograd. Assuming the usual (batch, seq_len, hidden) layout this is a
        (1, hidden) tensor for a single text -- TODO confirm against the model.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    # Inference only: disabling autograd avoids building (and keeping alive)
    # a computation graph for every recipe, which saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach()

In [None]:
def create_recipe_embeddings_DistilBERT(df_recipes, handle_limit=None, save_dir='embeddings-distilbert', redo=False,
                                        tokenizer=None, model=None):
    """Compute a DistilBERT embedding per recipe and pickle each one to disk.

    One file ``<save_dir>/<recipe_id>.pkl`` is written per recipe, where
    ``recipe_id`` is the index label of the row. Recipes whose file already
    exists are skipped unless ``redo`` is true, so an interrupted run can be
    resumed by calling the function again.

    Args:
        df_recipes: DataFrame of recipes, indexed by recipe id; each row is
            passed to ``get_text_from_recipe``.
        handle_limit: Maximum number of embeddings to compute in this call.
            ``None`` (or 0) means no limit.
        save_dir: Directory the pickles are written to (created if missing).
        redo: Recompute and overwrite embeddings that already exist on disk.
        tokenizer: Tokenizer passed to ``get_embedding_DistilBERT``. Defaults
            to the notebook-level ``tokenizer_DistilBERT``.
        model: Model passed to ``get_embedding_DistilBERT``. Defaults to the
            notebook-level ``model_DistilBERT``.
    """
    # get_embedding_DistilBERT requires a tokenizer and a model; fall back to
    # the notebook globals so existing two-argument calls keep working.
    if tokenizer is None:
        tokenizer = tokenizer_DistilBERT
    if model is None:
        model = model_DistilBERT

    os.makedirs(save_dir, exist_ok=True)
    n_rows = len(df_recipes)
    start = time.time()
    handled_i = 0
    for i, (recipe_id, row) in enumerate(df_recipes.iterrows()):
        save_fn = os.path.join(save_dir, f"{recipe_id}.pkl")
        if not redo and os.path.exists(save_fn):
            continue  # already embedded in a previous run
        # Check the limit *before* doing the work so exactly `handle_limit`
        # recipes are handled (not handle_limit + 1).
        if handle_limit and handled_i >= handle_limit:
            break
        elapsed = time.time() - start
        # Guard against a zero elapsed time on coarse clocks.
        per_min = (handled_i / elapsed * 60) if elapsed > 0 else 0.0
        print('\rGetting embedding for {:_}/{:_} ({:.1f}%) ({:_} handled)'.format(i+1, n_rows, ((i+1)/n_rows)*100, handled_i), end='')
        print(' ({:.1f} per min)'.format(per_min), end='')
        content_text = get_text_from_recipe(row)
        recipe_embedding = get_embedding_DistilBERT(content_text, tokenizer, model)
        with open(save_fn, "wb") as f:
            pickle.dump(recipe_embedding, f)
        handled_i += 1
    print('\nDone. Took {:_}s'.format(int(time.time()-start)))

In [None]:
def load_recipe_embeddings_distilBERT(save_dir='embeddings-distilbert', limit=None):
    """Load pickled recipe embeddings from ``save_dir`` into a dict.

    Args:
        save_dir: Directory containing one ``<recipe_id>.pkl`` file per recipe.
        limit: Maximum number of embeddings to load. ``None`` (or 0) loads all.

    Returns:
        dict: ``{recipe_id: embedding}``. A purely numeric file stem is
        converted to ``int`` so the keys match the integer recipe ids used by
        the dataframes; any other stem is kept as a string.

    Entries that cannot be read or unpickled are reported and skipped.
    """
    loaded = {}
    # Sorted so that `limit` selects a reproducible subset of files.
    for i, item in enumerate(sorted(os.listdir(save_dir))):
        # Count successfully loaded embeddings so exactly `limit` are returned.
        if limit and len(loaded) >= limit:
            break
        print('\r{}'.format(i+1), end='')
        stem = item.split('.')[0]
        # File names come from integer ids; restore the int so dict lookups
        # with ids from the dataframes actually hit.
        recipe_id = int(stem) if stem.isdigit() else stem
        itempath = os.path.join(save_dir, item)
        try:
            with open(itempath, 'rb') as f:
                # NOTE(security): pickle.load can execute arbitrary code; only
                # point this at a directory written by this notebook.
                recipe_embedding = pickle.load(f)
        except Exception as exc:  # e.g. empty/truncated file or a stray directory
            print('\nError: Unable to read "{}" ({}). Skipping ...'.format(itempath, exc))
            continue
        loaded[recipe_id] = recipe_embedding
    print()  # finish the '\r' progress line
    return loaded

In [None]:
def create_user_embeddings(df_userdata, recipe_embeddings, max_users=None):
    """Build one embedding per user from the recipes that user rated.

    Each rated recipe's embedding is flattened to 1-D, scaled by
    ``rating - 2`` and the scaled vectors are averaged. (Presumably the shift
    makes low ratings push the profile away from a recipe -- confirm the
    intended rating scale.)

    Args:
        df_userdata: DataFrame indexed by user id with list-valued columns
            'rated_recipes' (recipe ids) and 'rating_list' (ratings, aligned
            with 'rated_recipes').
        recipe_embeddings: Mapping ``recipe_id -> embedding`` (torch tensor or
            array-like). Recipes missing from the mapping are ignored.
        max_users: Only process the first ``max_users`` rows; ``None`` means all.

    Returns:
        dict: ``{user_id: 1-D numpy array}``. Users with no rated recipe that
        has an embedding are omitted.
    """
    if max_users is not None:
        df_userdata = df_userdata.head(max_users)
    user_embeddings = {}
    for user_id, row in df_userdata.iterrows():
        weighted_vectors = []
        for recipe_id, rating in zip(row['rated_recipes'], row['rating_list']):
            recipe_embedding = recipe_embeddings.get(recipe_id)
            # Compare with None explicitly: truth-testing a multi-element
            # tensor/array raises instead of answering "is it present?".
            if recipe_embedding is None:
                continue
            if isinstance(recipe_embedding, torch.Tensor):
                recipe_embedding = recipe_embedding.detach().cpu().numpy()
            # Flatten (1, hidden) embeddings so the vectors stack to 2-D and
            # the mean below yields a single 1-D user vector.
            vector = np.asarray(recipe_embedding).reshape(-1)
            weighted_vectors.append((rating - 2) * vector)
        if weighted_vectors:
            user_embeddings[user_id] = np.mean(np.stack(weighted_vectors), axis=0)
    return user_embeddings

#### Load Data

In [None]:
# Recipes: the list-like columns are stored as strings in the CSV, so they are
# parsed back into Python objects with literal_eval while reading.
converters = {
    'tags': literal_eval,
    'ingredients': literal_eval,
    'steps': literal_eval,
    'nutrition': literal_eval,
}
df_recipes = pd.read_csv(
    'dataset/RAW_recipes.csv',
    converters=converters,
    index_col='id',
)
# df_recipe_reviews = pd.read_csv('dataset/Recipe_Reviews.csv', index_col='id')
# df_interact = pd.read_csv('dataset/RAW_interactions.csv', dtype={'review': str})

# User data: same treatment for its list-like columns.
converters = {
    'rated_recipes': literal_eval,
    'ingredients': literal_eval,
    'rating_list': literal_eval,
}
df_userdata = pd.read_csv(
    'dataset/User_Data.csv',
    converters=converters,
    index_col='user_id',
)

#### Initialize DistilBERT Model

In [None]:
# Initialize DistilBERT: load the pretrained 'distilbert-base-uncased' tokenizer and model.
# These notebook-level objects are what get_embedding_DistilBERT expects as its
# `tokenizer` and `model` arguments.
tokenizer_DistilBERT = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_DistilBERT = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Create an embedding for every recipe in df_recipes and pickle each one into the
# save directory (one <recipe_id>.pkl file per recipe). handle_limit=None means no cap.
# Recipes whose pickle already exists are skipped, so re-running this cell resumes
# where a previous run stopped.
create_recipe_embeddings_DistilBERT(df_recipes, handle_limit=None)

In [None]:
# Load the pickled recipe embeddings, then derive user embeddings from them
# (only the first 1000 users, to keep this cell fast).
print('Loading recipe embeddings ...')
recipe_embeddings = load_recipe_embeddings_distilBERT()
print('Getting user embeddings ...')
user_embeddings = create_user_embeddings(df_userdata, recipe_embeddings, max_users=1000)

# Display a summary only: printing the dicts themselves would dump every
# embedding vector into the notebook output.
print("Recipe Embeddings:", len(recipe_embeddings))
print("User Embeddings:", len(user_embeddings))

In [None]:
# 
def get_recipe_recommendations_for_user(user_id, user_embeddings, recipe_embeddings, recipes_rated_by_user, top_n=100):
    """Recommend the recipes whose embeddings are most similar to a user's.

    Similarity is the cosine similarity between the user embedding and each
    recipe embedding (both flattened to 1-D). A zero-length vector on either
    side gives a similarity of 0.0.

    Args:
        user_id: Key into ``user_embeddings``.
        user_embeddings: Mapping ``user_id -> embedding``.
        recipe_embeddings: Mapping ``recipe_id -> embedding`` (torch tensor or
            array-like).
        recipes_rated_by_user: Iterable of recipe ids to exclude from the
            result; ``None`` excludes nothing.
        top_n: Maximum number of recommendations to return.

    Returns:
        list[tuple]: Up to ``top_n`` ``(recipe_id, similarity)`` pairs, most
        similar first. ``None`` if the user has no embedding.
    """
    user_embedding = user_embeddings.get(user_id)
    # Compare with None explicitly: `not array` is ambiguous for numpy arrays.
    if user_embedding is None:
        print('No user embedding for:', user_id)
        return

    def _flat(embedding):
        # Accept torch tensors as well as array-likes, of any (1, hidden)-style shape.
        if isinstance(embedding, torch.Tensor):
            embedding = embedding.detach().cpu().numpy()
        return np.asarray(embedding, dtype=float).reshape(-1)

    user_vec = _flat(user_embedding)
    user_norm = np.linalg.norm(user_vec)
    already_rated = set(recipes_rated_by_user) if recipes_rated_by_user is not None else set()

    sims = []
    for recipe_id, recipe_embedding in recipe_embeddings.items():
        if recipe_id in already_rated:
            continue
        recipe_vec = _flat(recipe_embedding)
        denom = user_norm * np.linalg.norm(recipe_vec)
        sim = float(np.dot(user_vec, recipe_vec) / denom) if denom > 0 else 0.0
        sims.append((recipe_id, sim))

    # Highest similarity first, then keep the head of the list.
    sims.sort(reverse=True, key=lambda item: item[1])
    return sims[:top_n]

In [None]:
# 
# Show recommendations for one example user (the i-th row of df_userdata).
i = 0
user_id = df_userdata.index[i]
# 'rated_recipes' is the column holding the ids this user already rated (the
# same column create_user_embeddings reads); those are excluded from the result.
recipes = df_userdata.loc[user_id, 'rated_recipes']
recommend = get_recipe_recommendations_for_user(user_id, user_embeddings, recipe_embeddings, recipes)
recommend