In [1]:
import os
os.environ['CUDA_DEVICE_ORDER']="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES']='0,2,3'
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch

print(torch.__version__)
print(torch.version.cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name)

2.1.0
11.8
cuda
True
0
<function get_device_name at 0x7f7d182205e0>


In [None]:
import pandas as pd

data_path = '../../data/jeehoshin/foodcom_dataset/'
model_path = "../../data/jeehoshin/huggingface"

raw_interaction = pd.read_csv(data_path + 'RAW_interactions.csv')
pp_recipe = pd.read_csv(data_path + 'PP_recipes.csv')
raw_recipe = pd.read_csv(data_path + 'RAW_recipes.csv')

records_removed = True
df = raw_interaction[raw_interaction['recipe_id'].isin(pp_recipe['id'].tolist())]
while records_removed:
    user_counts = df['user_id'].value_counts()
    item_counts = df['recipe_id'].value_counts()

    valid_users = user_counts[user_counts >= 5].index
    valid_items = item_counts[item_counts >= 5].index

    filtered_df = df[df['user_id'].isin(valid_users) & df['recipe_id'].isin(valid_items)]

    if len(filtered_df) == len(df):
        records_removed = False
    else:
        records_removed = True

    df = filtered_df
    
core_inter = df
sort_filter_ui = core_inter.sort_values('date').reset_index(drop=True)

file_names = os.listdir(data_path + 'images/')

finish_image = [int(os.path.splitext(file_name)[0]) for file_name in file_names]
sort_filter_ui = sort_filter_ui[sort_filter_ui['recipe_id'].isin(finish_image)].reset_index(drop=True)
print(len(finish_image))
sort_filter_ui

29943


Unnamed: 0,user_id,recipe_id,date,rating,review
0,2999,3567,2000-10-23,5,I have made this pie instead of plain ol' pump...
1,2178,3704,2000-10-30,3,Careful not to cook it too long... you want th...
2,2178,4366,2000-11-04,5,"if you like oysters, this is a great alternati..."
3,5523,7695,2001-02-01,1,I agree.
4,42189,4460,2001-02-12,5,I have had this before. It has really good fla...
...,...,...,...,...,...
428496,2001513060,367414,2018-12-17,1,"maybe I did something wrong , but I thought th..."
428497,2001513060,192495,2018-12-17,5,This is a keeper. Delicious Soup that both my ...
428498,454804,20713,2018-12-17,0,"Made this as gifts. Did 6, quart jars plus had..."
428499,1290903,131607,2018-12-18,5,This is a great recipe for a nice thin crispy ...


In [None]:
print(f"# of recipes with images: {len(file_names)}")

users = sort_filter_ui['user_id'].unique().tolist()
items = sort_filter_ui['recipe_id'].unique().tolist()
print(f"number of users: {len(users)}, number of items: {len(items)}")

train_ = sort_filter_ui[:int(0.6*len(sort_filter_ui))]
valid_ = sort_filter_ui[int(0.6*len(sort_filter_ui)):int(0.7*len(sort_filter_ui))]
test_ = sort_filter_ui[int(0.7*len(sort_filter_ui)):]

u_tr = set(train_['user_id'].tolist())
u_va = set(valid_['user_id'].tolist())
u_te = set(test_['user_id'].tolist())
u_total = u_tr & u_te

filter_u_tr = train_[train_['user_id'].isin(u_total)].reset_index(drop=True)
filter_u_te = test_[test_['user_id'].isin(u_total)].reset_index(drop=True)
filter_u_va = valid_[valid_['user_id'].isin(u_total)].reset_index(drop=True)
print(f"train interaction count : {len(filter_u_tr)}, valid interaction count : {len(filter_u_va)}, test interaction count : {len(filter_u_te)}")

u_train = set(filter_u_tr['user_id'].tolist())
u_test = set(filter_u_te['user_id'].tolist())
u_valid = set(filter_u_va['user_id'].tolist())
print(f"train user count : {len(u_train)}, valid user count : {len(u_valid)}, test user count : {len(u_test)}")

i_tr = set(filter_u_tr['recipe_id'].tolist())
i_te = set(filter_u_te['recipe_id'].tolist())
i_va = set(filter_u_va['recipe_id'].tolist())
print(f"train item count : {len(i_tr)}, valid item count : {len(i_va)}, test item count : {len(i_te)}")

i_total = i_tr|i_va|i_te
print(f"total user count : {len(u_total)}")
print(f"total item count : {len(i_total)}")

In [None]:
train_interaction = filter_u_tr
test_interaction = filter_u_te
valid_interaction = filter_u_va
print(f"train interaction  : {len(train_interaction)}, valid interaction count : {len(valid_interaction)}, test interaction count : {len(test_interaction)}")
print(train_interaction.columns)
print(test_interaction.columns)
print(valid_interaction.columns)

recipe_processed = raw_recipe[raw_recipe['id'].isin(i_total)].reset_index(drop=True)
print(f"total recipe count : {len(recipe_processed)}")
print(recipe_processed.columns)

In [None]:
from collections import defaultdict

short_recipe_summaries = pd.read_csv(data_path + 'short_recipe_summaries.csv')
recipe_summaries = {id : (name, summary) for id, name, summary
                    in zip(short_recipe_summaries['id'], short_recipe_summaries['name'], short_recipe_summaries['summary'])}

# gather 10 most recent reviews for each user in the training set
user_reviews = defaultdict(list)
for i in range(len(train_interaction)):
    user_id = int(train_interaction['user_id'].iloc[i])
    recipe_id = int(train_interaction['recipe_id'].iloc[i])
    review = train_interaction['review'].iloc[i]
    user_reviews[user_id].append((recipe_id, review))

for user in user_reviews:
    if len(user_reviews[user]) > 10:
        user_reviews[user] = {recipe_id : review for recipe_id, review in user_reviews[user][-10:]}
    else:
        user_reviews[user] = {recipe_id : review for recipe_id, review in user_reviews[user]}

user_reviews_df = pd.DataFrame(list(user_reviews.items()), columns=['user_id', 'reviews'])
print("recipe summary loaded & user review dataframe created")

recipe summary loaded & user review dataframe created


In [6]:
user_reviews_df.to_csv(data_path + "user_reviews_dict.csv", index=False)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import re
import gc

cache_dir = model_path
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    padding_side='left'
)
tokenizer.pad_token = tokenizer.eos_token


max_memory = {
    0: "40GB",
    1: "40GB",
    2: "40GB",
}
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory=max_memory
)

# Extract summary
def extract_summary_with_tags(text):
    match = re.search(r"Summarize the user's dietary preferences in 2–3 well-written sentences. assistant(.*)", text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return text.strip()       
        
# Build prompts for batch
def build_prompts(batch_rows):
    prompts, valid_indices = [], []
    for idx, row in batch_rows.iterrows():
        try:
            review_list = row['reviews']
        except:
            continue

        segments = []
        for recipe_id, review in review_list.items():
            try:
                recipe_id = int(recipe_id)
                if recipe_id in recipe_summaries:
                    name, summary = recipe_summaries[recipe_id]
                    segments.append(f"Recipe {len(segments) + 1}:\n- Name: {name}\n- Summary: {summary}\n- User Review: {review}")
            except:
                continue

        if segments:
            recipe_info = "\n\n".join(segments)
            prompt = (f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a nutrition-focused culinary analyst. You will be given a list of recipes along with user-written reviews.

Your goal is to analyze the user's dietary preferences by identifying patterns in:
- Ingredients and preparation styles mentioned in the recipe summaries
- Tone and content of the user reviews
- Patterns in cuisine, dietary concerns, or cooking techniques

Use this information to infer the user's food preferences.
                      
{recipe_info}

<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the user's dietary preferences in 2–3 well-written sentences. <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    )
            
            prompts.append(prompt)
            valid_indices.append(idx)
    return prompts, valid_indices


# Generate summaries in batches
def generate_batch_summaries(df, batch_size=8):
    all_summaries = [""] * len(df)
    for i in tqdm(range(0, len(df), batch_size), desc="Generating summaries"):
        batch = df.iloc[i:i + batch_size]
        prompts, valid_indices = build_prompts(batch)
        if not prompts:
            continue
        
        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)      
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        summaries = [extract_summary_with_tags(text) for text in decoded]

        for idx, summary in zip(valid_indices, summaries):
            all_summaries[idx] = summary
        torch.cuda.empty_cache()
        gc.collect()
    return all_summaries

# Apply batch generation
user_preference = generate_batch_summaries(user_reviews_df, batch_size=4)
user_reviews_df['summary'] = user_preference
print('Summaries generated')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating summaries: 100%|██████████| 1897/1897 [7:27:56<00:00, 14.17s/it]

Summaries generated





In [7]:
# Save result
print(user_reviews_df.head())
user_reviews_df.drop(columns=['reviews'], inplace=True)

print(user_reviews_df.head())
user_reviews_df.to_csv(data_path + "user_summaries.csv", index=False)
print("Summaries generated and saved.")

   user_id                                            reviews  \
0     2999  {3567: 'I have made this pie instead of plain ...   
1     6836  {136026: 'I love spinach & feta cheese, so I f...   
2     6702  {536: 'The corn dog casserole tasted a lot lik...   
3     6512  {32147: 'SUPER SUPPER!!!!! This was an excelle...   
4     7802  {73939: 'Easy to make and everyone loved it. I...   

                                             summary  
0  Based on the user's reviews and recipe prefere...  
1  Based on the user's reviews, it appears that t...  
2  Based on the user's reviews and preferences, i...  
3  Based on the user reviews, it appears that thi...  
4  Based on the user's reviews and recipe prefere...  
   user_id                                            summary
0     2999  Based on the user's reviews and recipe prefere...
1     6836  Based on the user's reviews, it appears that t...
2     6702  Based on the user's reviews and preferences, i...
3     6512  Based on the user r

In [None]:
generated_user_preference = pd.read_csv(data_path + "user_summaries.csv")
print(generated_user_preference)

      user_id                                            summary
0        2999  Based on the user's reviews and recipe prefere...
1        6836  Based on the user's reviews, it appears that t...
2        6702  Based on the user's reviews and preferences, i...
3        6512  Based on the user reviews, it appears that thi...
4        7802  Based on the user's reviews and recipe prefere...
...       ...                                                ...
7580   808500  Based on the user's review, it appears that th...
7581   894666  Based on the user's review, it appears that th...
7582  1185443  Based on the recipe and review, it appears tha...
7583  1186221  Based on the recipe and review, it appears tha...
7584  1185804  Based on the user's review, it appears that th...

[7585 rows x 2 columns]
