In [None]:
import pandas as pd

# Directory that holds the summary CSVs and the id-mapping files.
data_path = '../../data/jeehoshin/foodcom_dataset/'


def _read_table(file_name, **read_csv_kwargs):
    """Read ``data_path + file_name`` with ``pd.read_csv``, forwarding extra keyword arguments."""
    return pd.read_csv(data_path + file_name, **read_csv_kwargs)


# The summary files use pandas' default separator; the id-mapping files are tab-separated.
user_summary = _read_table('user_summaries.csv')
recipe_summary = _read_table('detailed_recipe_summaries.csv')
u_id_mapping = _read_table('u_id_mapping.csv', sep='\t')
i_id_mapping = _read_table('i_id_mapping.csv', sep='\t')

# Preview the first rows of each table, in load order.
for _table in (user_summary, recipe_summary, u_id_mapping, i_id_mapping):
    print(_table.head())

# Count the rows whose "summary" cell is missing in each summary table.
missing_mask1 = user_summary["summary"].isna()
missing_mask2 = recipe_summary["summary"].isna()
num_missing1, num_missing2 = missing_mask1.sum(), missing_mask2.sum()
print(f"Number of missing user summaries: {num_missing1}")
print(f"Number of missing recipe summaries: {num_missing2}")

   user_id                                            summary
0     2999  Based on the recipes the user has interacted w...
1     6836  Based on the recipes the user has interacted w...
2     6702  Based on the recipes the user has interacted w...
3     6512  Based on the recipes the user has interacted w...
4     7802  Based on the recipes the user has interacted w...
      id                                               name  \
0  63986                   chicken lickin  good  pork chops   
1  43026                                     chile rellenos   
2  54100                           grilled  venison burgers   
3  25775  how i got my family to eat spinach  spinach ca...   
4  90921         i stole the idea from mirj  sesame noodles   

                                             summary  
0  The dish is a delicious and hearty pork chop m...  
1  The dish is a delicious chile rellenos, featur...  
2  The image displays a delicious grilled venison...  
3  The dish is a spinach cass

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# Sentence encoder shared by the user and the recipe summaries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Plain Python lists of summary strings, one entry per table row.
# NOTE(review): astype(str) renders a missing summary as the text "nan", which
# is then embedded like any other sentence — confirm that this is intended.
user_sentences = user_summary["summary"].astype(str).tolist()
recipe_sentences = recipe_summary["summary"].astype(str).tolist()

# Number of sentences handed to model.encode per call.
batch_size = 64


def _encode_in_batches(sentences, desc):
    """Encode ``sentences`` chunk by chunk and return the vectors as one array.

    Args:
        sentences: list of strings to embed.
        desc: label shown on the tqdm progress bar.

    Returns:
        ``np.array`` built from the per-sentence vectors, in input order.
    """
    vectors = []
    for start in tqdm(range(0, len(sentences), batch_size), desc=desc):
        vectors.extend(model.encode(sentences[start:start + batch_size]))
    return np.array(vectors)  # shape: (num_samples, embedding_dim)


user_embeddings = _encode_in_batches(user_sentences, "Encoding user summaries")
recipe_embeddings = _encode_in_batches(recipe_sentences, "Encoding recipe summaries")

# One column per embedding dimension, with the id column placed first.
user_embedding_df = pd.DataFrame(user_embeddings)
user_embedding_df.insert(0, "user_id", user_summary["user_id"])
print(user_embedding_df.head())

recipe_embedding_df = pd.DataFrame(recipe_embeddings)
recipe_embedding_df.insert(0, "recipe_id", recipe_summary["id"])
print(recipe_embedding_df.head())

Encoding user summaries: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [00:08<00:00, 13.86it/s]
Encoding recipe summaries: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 468/468 [00:31<00:00, 14.63it/s]


   user_id         0         1         2         3         4         5  \
0     2999  0.021544 -0.056994  0.054519  0.065520 -0.067060  0.040505   
1     6836 -0.039900 -0.023910  0.003339  0.058721 -0.027135 -0.039752   
2     6702 -0.000031 -0.110317  0.033960  0.073314 -0.038335  0.006783   
3     6512  0.003957 -0.069140  0.016375  0.072921 -0.055657 -0.002401   
4     7802 -0.053619 -0.059632  0.040562  0.066850 -0.079849  0.002197   

          6         7         8  ...       374       375       376       377  \
0  0.090345  0.004677 -0.011507  ... -0.009542 -0.009803  0.069436 -0.018011   
1  0.105247 -0.005163 -0.036395  ...  0.027052 -0.007610  0.069329 -0.035303   
2  0.037612 -0.040247 -0.013637  ...  0.014178 -0.017808  0.011334 -0.040222   
3  0.033302 -0.069891  0.046387  ...  0.017008  0.014058  0.021317 -0.041976   
4  0.094238 -0.023548  0.010362  ...  0.057556 -0.033104  0.018133  0.002121   

        378       379       380       381       382       383  
0  0.01657

In [None]:
# The embedding columns carry the integer labels that pd.DataFrame(ndarray)
# assigns by default, next to one string-named id column. Some parquet writers
# refuse such frames ("parquet must have string column names"), so the labels
# are stringified on a renamed copy right before writing; the in-memory frames
# are left untouched.
user_embedding_df.rename(columns=str).to_parquet(data_path + "user_embeddings.parquet", index=False)
recipe_embedding_df.rename(columns=str).to_parquet(data_path + "recipe_embeddings.parquet", index=False)
print("User / Recipe embeddings saved to Parquet.")

User / Recipe embeddings saved to Parquet.


In [None]:
import torch
import torch.nn as nn
import pandas as pd

# Read back the stored user/recipe embedding tables (parquet) ...
initial_user, initial_item = (
    pd.read_parquet(data_path + parquet_name)
    for parquet_name in ("user_embeddings.parquet", "recipe_embeddings.parquet")
)

# ... and the tab-separated id-mapping files for users and items.
user_id_mapping, item_id_mapping = (
    pd.read_csv(data_path + mapping_name, sep="\t")
    for mapping_name in ("u_id_mapping.csv", "i_id_mapping.csv")
)

# Preview the first rows of each table, in load order.
for _frame in (initial_user, initial_item, user_id_mapping, item_id_mapping):
    print(_frame.head())

   user_id         0         1         2         3         4         5  \
0     2999  0.021544 -0.056994  0.054519  0.065520 -0.067060  0.040505   
1     6836 -0.039900 -0.023910  0.003339  0.058721 -0.027135 -0.039752   
2     6702 -0.000031 -0.110317  0.033960  0.073314 -0.038335  0.006783   
3     6512  0.003957 -0.069140  0.016375  0.072921 -0.055657 -0.002401   
4     7802 -0.053619 -0.059632  0.040562  0.066850 -0.079849  0.002197   

          6         7         8  ...       374       375       376       377  \
0  0.090345  0.004677 -0.011507  ... -0.009542 -0.009803  0.069436 -0.018011   
1  0.105247 -0.005163 -0.036395  ...  0.027052 -0.007610  0.069329 -0.035303   
2  0.037612 -0.040247 -0.013637  ...  0.014178 -0.017808  0.011334 -0.040222   
3  0.033302 -0.069891  0.046387  ...  0.017008  0.014058  0.021317 -0.041976   
4  0.094238 -0.023548  0.010362  ...  0.057556 -0.033104  0.018133  0.002121   

        378       379       380       381       382       383  
0  0.01657

In [5]:
def _index_by_id(mapping_df, id_column):
    """Return ``{raw id: row label}`` for ``mapping_df``.

    Gives the same dict as walking ``mapping_df.iterrows()`` and storing the
    row label under ``row[id_column]``, without building a Series per row.
    If an id appears more than once, the last row wins.
    """
    return dict(zip(mapping_df[id_column].tolist(), mapping_df.index.tolist()))


def _vectors_by_id(embedding_df, id_column):
    """Split an embedding table into its ids, its vector matrix and an id->vector dict.

    Returns:
        (ids, matrix, id_to_vector) where ``ids`` is the ``id_column`` as a
        list, ``matrix`` holds every other column as a 2-D array, and
        ``id_to_vector[ids[i]]`` is ``matrix[i]`` (last occurrence wins for a
        repeated id).
    """
    ids = embedding_df[id_column].tolist()
    matrix = embedding_df.drop(columns=[id_column]).values
    return ids, matrix, dict(zip(ids, matrix))


def _copy_pretrained(table, id_to_vector, id_to_index, label):
    """Overwrite rows of ``table.weight`` with the pretrained vectors.

    Args:
        table: ``nn.Embedding`` whose weight rows are overwritten in place.
        id_to_vector: ``{raw id: 1-D vector}`` of pretrained embeddings.
        id_to_index: ``{raw id: row of table.weight}``.
        label: word used in error messages ("user" / "recipe").

    Raises:
        KeyError: an id with a vector has no entry in ``id_to_index``.
        IndexError: a mapped row lies beyond the end of ``table.weight``.

    Rows that receive no vector keep the values ``nn.Embedding`` gave them.
    """
    unmapped = [raw_id for raw_id in id_to_vector if raw_id not in id_to_index]
    if unmapped:
        raise KeyError(
            f"{len(unmapped)} {label} id(s) have no entry in the id mapping, "
            f"e.g. {unmapped[:5]}"
        )
    if not id_to_vector:
        return

    rows = torch.tensor([id_to_index[raw_id] for raw_id in id_to_vector], dtype=torch.long)
    largest_row = int(rows.max())
    if largest_row >= table.num_embeddings:
        raise IndexError(
            f"{label} mapping points at row {largest_row}, but the embedding "
            f"table only has {table.num_embeddings} rows"
        )

    values = torch.stack(
        [torch.tensor(vector, dtype=torch.float) for vector in id_to_vector.values()]
    )
    # The dict keys are unique, so one indexed assignment replaces the per-id loop.
    with torch.no_grad():
        table.weight[rows] = values


# Raw id -> row of the embedding table. The row used is the row label of that
# id inside the mapping file, not a value read from another column.
# NOTE(review): this presumes the mapping files' row order matches the indices
# the downstream model uses — confirm.
uid_map = _index_by_id(user_id_mapping, 'user_id')
rid_map = _index_by_id(item_id_mapping, 'recipe_id')

user_ids, user_matrix, user_input = _vectors_by_id(initial_user, "user_id")
recipe_ids, recipe_matrix, recipe_input = _vectors_by_id(initial_item, "recipe_id")

num_users = len(user_input)
num_items = len(recipe_input)

# Both tables share one width; fail early if the two matrices disagree rather
# than silently sizing the user table from the recipe matrix.
input_dim = user_matrix.shape[1]
if recipe_matrix.shape[1] != input_dim:
    raise ValueError(
        f"user embeddings have {input_dim} dimensions but recipe embeddings "
        f"have {recipe_matrix.shape[1]}"
    )

# Create both tables before copying, users first then items.
user_embedding = nn.Embedding(num_users, input_dim)
item_embedding = nn.Embedding(num_items, input_dim)

_copy_pretrained(user_embedding, user_input, uid_map, "user")
_copy_pretrained(item_embedding, recipe_input, rid_map, "recipe")

In [None]:
# Persist the weights of both embedding tables as state dicts, users first.
for _table, _file_name in (
    (user_embedding, "user_embedding.pt"),
    (item_embedding, "item_embedding.pt"),
):
    torch.save(_table.state_dict(), data_path + _file_name)
print("pretrained embedding weights saved successfully")

pretrained embedding weights saved successfully
