In [1]:
import os
import time
import pickle
from ast import literal_eval
import numpy as np
import matplotlib.pyplot as plt

import torch
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [67]:
def get_train_and_test_data(df):
    from sklearn.model_selection import train_test_split
    df_exploded = df.explode(['rated_recipes', 'rating_list'])
    df_train_exploded, df_test_exploded = train_test_split(df_exploded, test_size=0.2, random_state=42)
    df_train = df_train_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    df_test = df_test_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    all_user_ids = df.index
    df_train = df_train.reindex(all_user_ids, fill_value=[]) # Re-index to ensure all user_ids are included
    df_test = df_test.reindex(all_user_ids, fill_value=[])
    return df_train, df_test

In [2]:
def get_text_from_recipe(recipe):
    tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
    ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
    description = str(recipe['description']) if isinstance(recipe['description'], str) else ""
    return description + ' ' + ' '.join(tags + ingredients)

In [3]:
# Helper function to get DistilBERT embeddings for text
def get_embedding_DistilBERT(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach()

In [4]:
def create_recipe_embeddings_DistilBERT(df_recipes, handle_limit=None, save_dir='embeddings-distilbert', redo=False):  # Set max_recipes to the desired limit
    os.makedirs(save_dir, exist_ok=True)
    n_rows = len(df_recipes)
    start = time.time()
    handled_i = 0
    for i, (recipe_id, row) in enumerate(df_recipes.iterrows()):
        save_fn = f"{save_dir}/{recipe_id}.pkl"
        if redo or not os.path.exists(save_fn):
            print('\rGetting embedding for {:_}/{:_} ({:.1f}%) ({:_} handled)'.format(i+1, n_rows, ((i+1)/n_rows)*100, handled_i), end='')
            print(' ({:.1f} per min)'.format( (handled_i / (time.time()-start) * 60) ), end='')
            content_text = get_text_from_recipe(row)
            recipe_embedding = get_embedding_DistilBERT(content_text)
            with open(save_fn, "wb") as f:
                pickle.dump(recipe_embedding, f)
            handled_i += 1
            if handle_limit and handled_i > handle_limit:
                break
    print('\nDone. Took {:_}s'.format(int(time.time()-start)))

In [None]:
# DEPRECATED: Loads recipe embeddings from pkl files
def load_recipe_embeddings_distilBERT(save_dir='embeddings-distilbert', limit=None):
    embeddings, ids = [], []
    for i, item in enumerate(os.listdir(save_dir)):
        print('\r{}'.format(i+1), end='')
        recipe_id = item.split('.')[0]
        itempath = os.path.join(save_dir, item)
        try:
            with open(itempath, 'rb') as f:
                recipe_embedding = pickle.load(f)
            embeddings.append(recipe_embedding)
            ids.append(recipe_id)
        except:
            print('Error: Unable to read "{}". Removing ...'.format(itempath))
            # os.remove(itempath) # assumes the file is empty and can be deleted
        if limit and i >= limit: break
    return { id_: emb for id_, emb in zip(ids, embeddings) }

In [39]:
def create_recipe_embeddings_tensor(save_dir='embeddings-distilbert', limit=None):
    ids, embeddings = [], []
    n_items = len(os.listdir(save_dir))
    for i, item in enumerate(os.listdir(save_dir)):
        print('\r({:_}/{:_})'.format(i+1, n_items), end='')
        id_ = item.split('.')[0]
        path = f'{save_dir}/{item}'
        try:
            with open(path, 'rb') as f:
                embeddings.append(pickle.load(f))
            ids.append(id_)
        except:
            pass
        if limit and i >= limit: break
    print('\nDone.')
    embeddings_tensor = torch.stack(embeddings)
    ids_to_index_BERT = { id_: i for i, id_ in enumerate(ids) }
    return embeddings_tensor, ids_to_index_BERT

In [28]:
def save_recipe_embeddings_tensor(embeds_tensor, id_index_map, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    torch_fn = f'{save_dir}/tensor.pt'
    pkl_fn = f'{save_dir}/id_index_map.pkl'
    torch.save(embeds_tensor, torch_fn)
    with open(pkl_fn, 'wb') as f:
        pickle.dump(id_index_map, f)

In [87]:
def load_recipe_embeddings_matrix(save_dir):
    torch_fn = f'{save_dir}/tensor.pt'
    pkl_fn = f'{save_dir}/id_index_map.pkl'
    tensor = torch.load(torch_fn)
    matrix = tensor.squeeze().numpy()
    with open(pkl_fn, 'rb') as f:
        id_index_map = pickle.load(f)
    id_index_map = { int(id_): i for id_, i in id_index_map.items() }
    return matrix, id_index_map

In [109]:
def create_user_embeddings_from_recipe_embeddings(df_userdata, recipe_embeddings_matrix, id_index_map, max_users=None):  # Limit number of users for testing
    df_userdata = df_userdata.head(max_users)
    n_rows = len(df_userdata)
    user_embeddings = []
    for i, (_, row) in enumerate(df_userdata.iterrows()):
        print('\rGetting embedding for {:_}/{:_} ({:.1f}%)'.format(i+1, n_rows, ((i+1)/n_rows)*100), end='')
        rated_recipes, ratings = row['rated_recipes'], row['rating_list']
        user_embedding = []
        for recipe_id, rating in zip(rated_recipes, ratings):
            idx = id_index_map[recipe_id]
            recipe_embedding = recipe_embeddings_matrix[idx]
            weighted_embedding = (rating-2) * recipe_embedding # Weight recipe embeddings by rating
            user_embedding.append(weighted_embedding)
        if user_embedding != []:
            user_embedding = np.array(user_embedding) # Ensure all embeddings are the same shape (i.e., 1D vectors)
            if user_embedding.ndim == 2:
                user_embedding = np.mean(user_embedding, axis=0)
            user_embeddings.append(user_embedding)
    return np.array(user_embeddings)

#### Load Data

In [7]:
# Load recipes & recipe_reviews
converters = { k: literal_eval for k in ['tags', 'ingredients', 'steps', 'nutrition'] } # for evaluating strings as arrays (eg. tags)
df_recipes = pd.read_csv('dataset/RAW_recipes.csv', converters=converters, index_col='id')
# df_recipe_reviews = pd.read_csv('dataset/Recipe_Reviews.csv', index_col='id')
# df_interact = pd.read_csv('dataset/RAW_interactions.csv', dtype={'review': str})

converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')

In [68]:
# Create/Load Training and Test data
converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
train_fn = 'dataset/User_Data_Train.csv'
test_fn = 'dataset/User_Data_Test.csv'
if os.path.exists(train_fn):
    print('Loading train and test userdata ...')
    df_train = pd.read_csv(train_fn, converters=converters, index_col='user_id')
    df_test =  pd.read_csv(test_fn, converters=converters, index_col='user_id')
else:
    print('Reading userdata dataframe ...')
    df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')
    print('Splitting userdata into training and test data ...')
    df_train, df_test = get_train_and_test_data(df_userdata.head(None).copy())
    df_train.to_csv(train_fn)
    df_test.to_csv(test_fn)
print('Done.')

Loading train and test userdata ...
Done.


In [104]:
# create id-index maps
id_to_index_RECIPES = { id_: i for i, id_ in enumerate(df_recipes.index) }
index_to_id_RECIPES = { i: id_ for i, id_ in enumerate(df_recipes.index) }

id_to_index_USERS = { id_: i for i, id_ in enumerate(df_train.index) }
index_to_id_USERS = { i: id_ for i, id_ in enumerate(df_train.index) }

#### Initialize DistilBERT Model

In [8]:
# Initialize DistilBERT
tokenizer_DistilBERT = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model_DistilBERT = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Creates the recipe embeddings and pickles them into folder (slow to read)
create_recipe_embeddings_DistilBERT(df_recipes, handle_limit=None)


Done. Took 81s


In [None]:
# Create recipe
try:
    embeds_tensor, ids_to_index_map = create_recipe_embeddings_tensor(limit=None)
    print('Saving tensor ...')
    save_recipe_embeddings_tensor(embeds_tensor, ids_to_index_map, 'embeddings/recipe_embeddings_BERT')
except KeyboardInterrupt:
    print('\nKeyboard interrupt detected ...')

(231_637/231_637)
Done.
Saving tensor ...


In [101]:
# Load recipe embeddings tensor
recipe_embeddings, id_index_map = load_recipe_embeddings_matrix('embeddings/recipe_embeddings_BERT')

  tensor = torch.load(torch_fn)


In [102]:
# Load recipe_embeddings and create user embeddings
try:
    user_embeddings = create_user_embeddings_from_recipe_embeddings(df_train, recipe_embeddings, id_index_map, max_users=None)
except KeyboardInterrupt:
    print('\nKeyboard interrupt detected ...')

Getting embedding for 226_570/226_570 (100.0%)

In [178]:
# 
def get_recipe_recommendations_for_user(user_embedding, recipe_embeddings, recipes_rated_by_user, index_to_id_RECIPES, top_n=100):
    cosine_sims = cosine_similarity([user_embedding], recipe_embeddings)[0]
    sims_items = [ (i, sim) for i, sim in enumerate(cosine_sims) ]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    sims_ids = [ (index_to_id_RECIPES[idx], sim) for idx, sim in sims_items ]
    recommend = []
    while len(recommend) < top_n and len(sims_ids) > 0:
        recipe_id, sim = sims_ids.pop()
        if recipe_id not in recipes_rated_by_user:
            recommend.append((recipe_id, sim))
    return recommend

In [None]:
# 
i = 0
user_id = df_train.index[i]
recipes = df_train.loc[user_id]['rated_recipes']
user_embedding = user_embeddings[user_id]
recommend = get_recipe_recommendations_for_user(user_embedding, recipe_embeddings, recipes, index_to_id_RECIPES, top_n=10)
recommend

In [None]:
def get_recipe_recommendations_for_all_users(df_userdata, user_embeddings, item_embeddings, index_to_id_RECIPES, df_recommend, save_name='recommend.csv', save_period=None, size_limit=None, top_n=100):
    n_rows = len(df_recommend)
    for i, (user_id, row) in enumerate(df_recommend.iterrows()):
        if not isinstance(row['recommended_recipes'], list):
            print('\r({:_}/{:_})'.format(i+1, n_rows), end='')
            user_embedding = user_embeddings[i]
            rated_recipes = df_userdata.loc[user_id]['rated_recipes']
            recommend_items = get_recipe_recommendations_for_user(user_embedding, item_embeddings, rated_recipes, index_to_id_RECIPES, top_n=top_n)
            recommend_ids = [ id_ for id_, sim in recommend_items ]
            recommend_sims = [ sim for id_, sim in recommend_items ]
            df_recommend.at[user_id, 'recommended_recipes'] = recommend_ids
            df_recommend.at[user_id, 'recommended_sims'] = recommend_sims
            if size_limit and i >= size_limit-1:            break
            if save_period and i>0 and i%save_period==0:    df_recommend.to_csv(save_name)
    print('\nDone.')
    df_recommend.to_csv(save_name)
    return df_recommend

In [181]:
# 
df_recommend_fn = 'dataset/recommendations_BERT/recommend_BERT.csv'
df_recommend = pd.DataFrame(index=df_train.index, columns=['recommended_recipes', 'recommended_sims'])

In [196]:
# 
try:
    df_recommend = get_recipe_recommendations_for_all_users(df_train, user_embeddings, recipe_embeddings, index_to_id_RECIPES, df_recommend, 
                                                            save_name=df_recommend_fn, save_period=10, size_limit=None, top_n=1000)
except KeyboardInterrupt:
    print('\nKeyboard interrupt detected ...')

(8_010/226_570)
Keyboard interrupt detected ...
