In [1]:
import os
os.environ['CUDA_DEVICE_ORDER']="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES']="0,5,6"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

import torch

print(torch.__version__)
print(torch.version.cuda)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name)
print(torch.cuda.device_count())
from transformers.utils import logging
logging.set_verbosity_error()

2.1.0
11.8
cuda
True
0
<function get_device_name at 0x7f7fdc0d0af0>
3


In [None]:
import pandas as pd
import ast
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import gc

data_path = "../../data/jeehoshin/allrecipe_dataset/"
model_path = "../../data/jeehoshin/huggingface"

users = pd.read_csv(data_path + "user_reviews_train_interactions.csv")
recipes = pd.read_csv(data_path + "short_recipe_summaries.csv")

cache_dir = model_path
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    padding_side='left'
)
tokenizer.pad_token = tokenizer.eos_token

max_memory = {
    0: "40GB",
    1: "40GB",
    2: "40GB",
}
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    torch_dtype=torch.float16,
    device_map="auto",
    max_memory=max_memory
)

# Map from recipe_id to (name, summary)
recipe_lookup = {
    row['recipe_id']: (row['recipe_name'], row['summary'])
    for _, row in recipes.iterrows()
}

# Extract summary
def extract_summary_with_tags(text):
    match = re.search(r"Summarize the user's dietary preferences in 2–3 well-written sentences. assistant(.*)", text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return text.strip()       
        
# Build prompts for batch
def build_prompts(batch_rows):
    prompts, valid_indices = [], []
    for idx, row in batch_rows.iterrows():
        try:
            review_dict = ast.literal_eval(row['reviews'])
        except:
            continue

        segments = []
        for recipe_id_str, review in review_dict.items():
            try:
                recipe_id = int(recipe_id_str)
                if recipe_id in recipe_lookup:
                    name, summary = recipe_lookup[recipe_id]
                    segments.append(f"{name} - {summary} - {review}")
            except:
                continue

        if segments:
            recipe_info = "\n\n".join(segments)
            prompt = (f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a nutrition-focused culinary analyst. You will be given a list of recipes along with user-written reviews.

Your goal is to analyze the user's dietary preferences by identifying patterns in:
- Ingredients and preparation styles mentioned in the recipe summaries
- Tone and content of the user reviews
- Patterns in cuisine, dietary concerns, or cooking techniques

Use this information to infer the user's food preferences.{recipe_info}

<|eot_id|><|start_header_id|>user<|end_header_id|>

Summarize the user's dietary preferences in 2–3 well-written sentences. <|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
    )
            prompts.append(prompt)
            valid_indices.append(idx)
    return prompts, valid_indices


# Generate summaries in batches
def generate_batch_summaries(df, batch_size=8):
    all_summaries = [""] * len(df)
    for i in tqdm(range(0, len(df), batch_size), desc="Generating summaries"):
        batch = df.iloc[i:i + batch_size]
        prompts, valid_indices = build_prompts(batch)
        if not prompts:
            continue

        inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)      
        
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        summaries = [extract_summary_with_tags(text) for text in decoded]

        for idx, summary in zip(valid_indices, summaries):
            all_summaries[idx] = summary
        torch.cuda.empty_cache()
        gc.collect()
        
    return all_summaries

# Apply batch generation
users['summary'] = generate_batch_summaries(users, batch_size=8)

# Save result
users.to_csv(data_path + "user_summaries.csv", index=False)
print("Summaries generated and saved.")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Generating summaries: 100%|███████████████████████████████████████████████| 8596/8596 [25:23:52<00:00, 10.64s/it]


Summaries generated and saved.


In [1]:
import pandas as pd

summaries = pd.read_csv(data_path + "user_summaries.csv")
for i in range(20):
    print(f"User ID : {summaries.iloc[i]['user_id']}")
    print(f"User summary : {summaries.iloc[i]['summary']}")
    print()

User ID : 2783111
User summary : Based on the provided reviews, it appears that the user has a diverse and indulgent palate, often seeking out rich and flavorful dishes with a focus on comfort food and sweet treats. They seem to appreciate convenience and ease of preparation, often modifying recipes to suit their tastes and dietary needs. Additionally, the user appears to prioritize flavor and texture over strict adherence to nutritional guidelines, often opting for high-calorie and high-fat options, but also occasionally seeking out healthier alternatives like salads and fruit-based desserts.

User ID : 5404163
User summary : Based on the user's review, it appears that they prioritize health and nutrition in their food choices, as evidenced by their mention of the sandwich being "delicious and nutritious" and high in protein, fiber, and healthy fats. They also seem to value convenience and flexibility, as they modified the original recipe to use deli turkey instead of chicken and opte