In [None]:
import pandas as pd

# Directory that holds the dataset files. Kept as a plain string with a trailing
# slash because file paths in this notebook are built by string concatenation.
data_path = '../../data/jeehoshin/allrecipe_dataset/'

# The two summary tables are read with read_csv's default separator;
# the two id-mapping tables are read as tab-separated.
user_summary = pd.read_csv(data_path + 'user_summaries.csv')
recipe_summary = pd.read_csv(data_path + 'detailed_recipe_summaries.csv')
u_id_mapping = pd.read_csv(data_path + 'u_id_mapping.csv', sep='\t')
i_id_mapping = pd.read_csv(data_path + 'i_id_mapping.csv', sep='\t')

# Preview the first rows of each table, in the order they were loaded.
for frame in (user_summary, recipe_summary, u_id_mapping, i_id_mapping):
    print(frame.head())

# Boolean masks marking rows whose "summary" value is missing, plus their counts.
missing_mask1 = user_summary["summary"].isna()
missing_mask2 = recipe_summary["summary"].isna()
num_missing1, num_missing2 = missing_mask1.sum(), missing_mask2.sum()
print(f"Number of missing user summaries: {num_missing1}")
print(f"Number of missing recipe summaries: {num_missing2}")

   user_id                                            reviews  \
0  2783111  {218939: "i enjoyed this didn't discard the ti...   
1  5404163  {87211: 'Mixed the pesto with mayo, used deli ...   
2   702483  {87211: "I made this on my foreman grill and i...   
3  3471401  {87211: "Really simple!   I used French bread ...   
4  1605138  {87211: "Who needs Atlanta Bread with a recipe...   

                                             summary  
0  Based on the recipes the user has interacted w...  
1  Based on the recipe, it appears that the user ...  
2  Based on the recipes the user has interacted w...  
3  Based on the recipes the user has interacted w...  
4  Based on the recipes the user has interacted w...  
   recipe_id                              recipe_name  \
0     240488        Pork Loin, Apples, and Sauerkraut   
1     218939         Foolproof Rosemary Chicken Wings   
2      87211                    Chicken Pesto Paninis   
3     245714                       Potato Bacon Piz

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

def _encode_in_batches(encoder, sentences, batch_size, desc):
    """Encode ``sentences`` with ``encoder.encode`` in slices of ``batch_size``.

    Args:
        encoder: object exposing ``encode(list_of_str)`` (here a SentenceTransformer).
        sentences: list of strings to embed.
        batch_size: number of sentences passed to ``encode`` per call.
        desc: label shown on the tqdm progress bar.

    Returns:
        A numpy array built from all per-sentence vectors, in input order.
    """
    vectors = []
    for start in tqdm(range(0, len(sentences), batch_size), desc=desc):
        vectors.extend(encoder.encode(sentences[start:start + batch_size]))
    return np.array(vectors)


def _embedding_frame(id_column, ids, embeddings):
    """Build a frame with one column per embedding dimension and the ids in front.

    The ids are inserted positionally (as a plain array) so that row ``k`` of
    ``embeddings`` is always paired with the ``k``-th id, whatever index the
    source frame carries.
    """
    frame = pd.DataFrame(embeddings)
    frame.insert(0, id_column, ids.to_numpy())
    return frame


# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# NOTE(review): astype(str) turns a missing summary (NaN) into the literal text
# 'nan', which is then embedded like any other sentence. Confirm this is intended
# for the rows counted as missing when the summaries were loaded.
user_sentences = user_summary["summary"].astype(str).tolist()
recipe_sentences = recipe_summary["summary"].astype(str).tolist()

# Encode with tqdm
batch_size = 64
user_embeddings = _encode_in_batches(
    model, user_sentences, batch_size, "Encoding user summaries"
)  # one row per user summary
recipe_embeddings = _encode_in_batches(
    model, recipe_sentences, batch_size, "Encoding recipe summaries"
)  # one row per recipe summary

# Create DataFrames: each embedding dimension becomes a separate column.
# (Built and printed once; the earlier copy-pasted second pass was redundant.)
user_embedding_df = _embedding_frame("user_id", user_summary["user_id"], user_embeddings)
print(user_embedding_df.head())

recipe_embedding_df = _embedding_frame("recipe_id", recipe_summary["recipe_id"], recipe_embeddings)
print(recipe_embedding_df.head())

Encoding user summaries: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1075/1075 [01:08<00:00, 15.67it/s]
Encoding recipe summaries: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 713/713 [00:47<00:00, 15.09it/s]


   user_id         0         1         2         3         4         5  \
0  2783111 -0.043740 -0.099668 -0.018126  0.047951  0.008202  0.005865   
1  5404163 -0.031359 -0.060623 -0.045879  0.030368 -0.074899  0.050836   
2   702483 -0.000898 -0.019975 -0.054803  0.053898 -0.025539  0.035048   
3  3471401 -0.044875 -0.030315  0.006900  0.039213 -0.008102  0.052415   
4  1605138 -0.028164 -0.065254  0.019806  0.021984 -0.045679 -0.031341   

          6         7         8  ...       374       375       376       377  \
0  0.012112 -0.047916  0.057459  ...  0.000923 -0.007915  0.035003  0.027737   
1  0.044080  0.030300  0.043701  ... -0.034556 -0.004899  0.036245  0.023646   
2  0.006614 -0.064936  0.004329  ... -0.029162  0.013640  0.099835 -0.045994   
3  0.101710 -0.001237  0.006349  ... -0.004868 -0.011456  0.045408 -0.021434   
4  0.026968  0.004432  0.009931  ...  0.027022 -0.018053  0.035989  0.005251   

        378       379       380       381       382       383  
0  0.10469

In [None]:
# Persist both embedding tables as Parquet, without the row index.
# The frames mix a string id column with integer dimension labels; Parquet
# column names must be strings, so every label is converted with str() at
# write time. The in-memory frames are left untouched (rename returns a copy).
user_embedding_df.rename(columns=str).to_parquet(data_path + "user_embeddings.parquet", index=False)
recipe_embedding_df.rename(columns=str).to_parquet(data_path + "recipe_embeddings.parquet", index=False)
print("User / Recipe embeddings saved to Parquet.")

User / Recipe embeddings saved to Parquet.


In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# Reload the saved text embeddings (one row per id) from Parquet.
initial_user = pd.read_parquet(data_path + "user_embeddings.parquet")
initial_item = pd.read_parquet(data_path + "recipe_embeddings.parquet")

# Reload the tab-separated id-mapping tables for users and recipes.
user_id_mapping = pd.read_csv(data_path + "u_id_mapping.csv", sep="\t")
item_id_mapping = pd.read_csv(data_path + "i_id_mapping.csv", sep="\t")

# Show the first rows of each table: embeddings first, then mappings.
for loaded in (initial_user, initial_item, user_id_mapping, item_id_mapping):
    print(loaded.head())

   user_id         0         1         2         3         4         5  \
0  2783111 -0.043740 -0.099668 -0.018126  0.047951  0.008202  0.005865   
1  5404163 -0.031359 -0.060623 -0.045879  0.030368 -0.074899  0.050836   
2   702483 -0.000898 -0.019975 -0.054803  0.053898 -0.025539  0.035048   
3  3471401 -0.044875 -0.030315  0.006900  0.039213 -0.008102  0.052415   
4  1605138 -0.028164 -0.065254  0.019806  0.021984 -0.045679 -0.031341   

          6         7         8  ...       374       375       376       377  \
0  0.012112 -0.047916  0.057459  ...  0.000923 -0.007915  0.035003  0.027737   
1  0.044080  0.030300  0.043701  ... -0.034556 -0.004899  0.036245  0.023646   
2  0.006614 -0.064936  0.004329  ... -0.029162  0.013640  0.099835 -0.045994   
3  0.101710 -0.001237  0.006349  ... -0.004868 -0.011456  0.045408 -0.021434   
4  0.026968  0.004432  0.009931  ...  0.027022 -0.018053  0.035989  0.005251   

        378       379       380       381       382       383  
0  0.10469

In [5]:
def _row_index_by_id(mapping_df, id_col):
    """Return ``{id: row label}`` for the given mapping frame.

    The embedding row assigned to an id is the row label of that id in
    ``mapping_df``. If an id appears more than once, its last row wins.

    NOTE(review): the row label is used as the embedding index, not a column
    from the mapping file. Confirm the CSV has no explicit index column that
    should be used instead.
    """
    return dict(zip(mapping_df[id_col].tolist(), mapping_df.index.tolist()))


def _build_embedding(vectors_by_id, index_by_id, num_rows, dim):
    """Create an ``nn.Embedding`` and overwrite mapped rows with pretrained vectors.

    Args:
        vectors_by_id: ``{id: 1-D vector}`` of pretrained embeddings.
        index_by_id: ``{id: row index}`` into the embedding table.
        num_rows: number of rows in the table (size of the index space).
        dim: embedding width.

    Returns:
        The embedding module. Rows with no pretrained vector keep the default
        ``nn.Embedding`` initialisation.

    Raises:
        KeyError: if some id with a pretrained vector has no row index.
    """
    missing = [key for key in vectors_by_id if key not in index_by_id]
    if missing:
        raise KeyError(
            f"{len(missing)} ids have no entry in the id mapping, e.g. {missing[:5]}"
        )

    table = nn.Embedding(num_rows, dim)
    if vectors_by_id:
        rows = torch.tensor([index_by_id[key] for key in vectors_by_id], dtype=torch.long)
        values = torch.tensor(np.stack(list(vectors_by_id.values())), dtype=torch.float)
        # Copy every pretrained row in one indexed assignment; no_grad because
        # this is initialisation, not a step autograd should track.
        with torch.no_grad():
            table.weight[rows] = values
    return table


# id -> embedding-table row, taken from the mapping frames.
uid_map = _row_index_by_id(user_id_mapping, "user_id")
rid_map = _row_index_by_id(item_id_mapping, "recipe_id")

# id -> pretrained vector, taken from the reloaded embedding frames.
user_ids = initial_user["user_id"].tolist()
user_matrix = initial_user.drop(columns=["user_id"]).values
user_input = dict(zip(user_ids, user_matrix))

recipe_ids = initial_item["recipe_id"].tolist()
recipe_matrix = initial_item.drop(columns=["recipe_id"]).values
recipe_input = dict(zip(recipe_ids, recipe_matrix))

# Both tables share one width; check it explicitly instead of silently reading
# it from whichever matrix happened to be assigned last.
input_dim = user_matrix.shape[1]
if recipe_matrix.shape[1] != input_dim:
    raise ValueError(
        f"user embeddings have {input_dim} dims but recipe embeddings have {recipe_matrix.shape[1]}"
    )

# Size each table by its mapping (the index space), not by how many summaries
# exist, so every mapped row index is guaranteed to be in range.
num_users = len(user_id_mapping)
num_items = len(item_id_mapping)

user_embedding = _build_embedding(user_input, uid_map, num_users, input_dim)
item_embedding = _build_embedding(recipe_input, rid_map, num_items, input_dim)

In [None]:
# Write each embedding table's state_dict to its own file under data_path.
for table, filename in (
    (user_embedding, "user_embedding.pt"),
    (item_embedding, "item_embedding.pt"),
):
    torch.save(table.state_dict(), data_path + filename)

print("pretrained embedding weights saved successfully")

pretrained embedding weights saved successfully
