In [None]:
# distilroberta_trainer.py
import inspect
import os

import matplotlib.pyplot as plt
import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
data_path = "recipes.csv"


def _make_prompts(frame):
    """Return one generation prompt per row, built from Title and Cleaned_Ingredients."""
    return [
        f"Title: {title}\nIngredients: {ing}\nInstructions:"
        for title, ing in zip(frame["Title"], frame["Cleaned_Ingredients"])
    ]


# Shuffle once with a fixed seed so the 80/10/10 split below is reproducible.
df = pd.read_csv(data_path).sample(frac=1, random_state=42).reset_index(drop=True)

n_rows = len(df)
train_end = int(0.8 * n_rows)
val_end = int(0.9 * n_rows)

# Contiguous slices of the shuffled frame: train / validation / test.
train_df, val_df, test_df = (
    df.iloc[:train_end],
    df.iloc[train_end:val_end],
    df.iloc[val_end:],
)

# Prompts are the model inputs; the Instructions column is the generation target.
train_prompts = _make_prompts(train_df)
train_targets = train_df["Instructions"].tolist()
val_prompts = _make_prompts(val_df)
val_targets = val_df["Instructions"].tolist()


In [None]:
 # 2. Build the tokenizer and a DistilRoBERTa -> DistilRoBERTa encoder-decoder model
 encoder_name = decoder_name = "distilroberta-base"
 model = EncoderDecoderModel.from_encoder_decoder_pretrained(encoder_name, decoder_name)
 tokenizer = AutoTokenizer.from_pretrained(encoder_name)

 # Copy the tokenizer's special-token ids into the combined model config.
 # RoBERTa uses <s> as bos, </s> as eos, and has a dedicated <pad> token.
 seq2seq_config = model.config
 seq2seq_config.pad_token_id = tokenizer.pad_token_id             # padding
 seq2seq_config.eos_token_id = tokenizer.eos_token_id             # generation stops at </s>
 seq2seq_config.decoder_start_token_id = tokenizer.bos_token_id   # decoding starts with <s>
 # The top-level vocab size mirrors the decoder's.
 seq2seq_config.vocab_size = seq2seq_config.decoder.vocab_size

In [None]:
 # 3. Tokenization
 max_input_length = 512
 max_target_length = 512


 def _to_examples(encodings, target_encodings):
     """Turn column-wise tokenizer output into a list with one feature dict per example.

     Args:
         encodings: tokenizer output for the prompts (provides "input_ids" and "attention_mask").
         target_encodings: tokenizer output for the targets (its "input_ids" become "labels").

     Returns:
         A list of {"input_ids", "attention_mask", "labels"} dicts, one per example.
     """
     return [
         {"input_ids": ids, "attention_mask": mask, "labels": label_ids}
         for ids, mask, label_ids in zip(
             encodings["input_ids"],
             encodings["attention_mask"],
             target_encodings["input_ids"],
         )
     ]


 # Truncate only; padding is left to DataCollatorForSeq2Seq, which pads each batch
 # to its own longest sequence and fills label padding with -100 so padded positions
 # are ignored by the loss. Padding the targets here would put <pad> ids into the
 # labels and the model would be trained to predict padding.
 # The same tokenizer is used for prompts and targets, so no target-tokenizer
 # context manager is needed.
 train_encodings = tokenizer(train_prompts, truncation=True, max_length=max_input_length)
 train_target_encodings = tokenizer(train_targets, truncation=True, max_length=max_target_length)
 val_encodings = tokenizer(val_prompts, truncation=True, max_length=max_input_length)
 val_target_encodings = tokenizer(val_targets, truncation=True, max_length=max_target_length)

 # Trainer needs a dataset whose len() is the number of examples and whose
 # dataset[i] is a single example. A dict of columns has len() == 3 and raises
 # KeyError on dataset[0], so build a list of per-example dicts instead.
 train_dataset = _to_examples(train_encodings, train_target_encodings)
 val_dataset = _to_examples(val_encodings, val_target_encodings)

In [None]:
 # 4. Training setup
 output_dir = "distilroberta_recipe_model"

 # transformers renamed `evaluation_strategy` to `eval_strategy`; use whichever
 # keyword the installed TrainingArguments accepts so this cell runs on both.
 _training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
 _eval_strategy_key = (
     "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
 )

 training_args = TrainingArguments(
     output_dir=output_dir,
     overwrite_output_dir=True,
     num_train_epochs=3,
     per_device_train_batch_size=8,
     per_device_eval_batch_size=8,
     save_strategy="epoch",
     logging_strategy="steps",
     logging_steps=50,
     save_total_limit=1,
     # Evaluation and saving both happen per epoch, so the best checkpoint
     # (lowest eval_loss) can be restored at the end of training.
     load_best_model_at_end=True,
     metric_for_best_model="eval_loss",
     greater_is_better=False,
     # Only request mixed precision when a CUDA device is present, so the
     # notebook still runs on CPU-only machines.
     fp16=torch.cuda.is_available(),
     report_to="none",
     **{_eval_strategy_key: "epoch"},
 )
 # Pads inputs and labels per batch.
 data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=train_dataset,
     eval_dataset=val_dataset,
     data_collator=data_collator,
 )

 # 5. Train the model
 trainer.train()
 # 6. Save model and tokenizer side by side so the evaluator can reload both
 trainer.save_model(output_dir)
 tokenizer.save_pretrained(output_dir)

In [None]:
 # 7. Plot loss curves
 logs = trainer.state.log_history
 # Training loss is logged every `logging_steps` steps and validation loss once per
 # epoch, so the two series have different lengths. Plot both against the global
 # step recorded in each log entry so they share one x-axis.
 train_entries = [entry for entry in logs if "loss" in entry]
 eval_entries = [entry for entry in logs if "eval_loss" in entry]
 train_steps = [entry["step"] for entry in train_entries]
 train_losses = [entry["loss"] for entry in train_entries]
 eval_steps = [entry["step"] for entry in eval_entries]
 eval_losses = [entry["eval_loss"] for entry in eval_entries]

 # Make sure the target directory exists even if this cell runs on its own.
 os.makedirs(output_dir, exist_ok=True)

 fig, ax = plt.subplots()
 ax.plot(train_steps, train_losses, label="Training Loss")
 # Markers keep the few per-epoch validation points visible.
 ax.plot(eval_steps, eval_losses, marker="o", label="Validation Loss")
 ax.set_xlabel("Training Step")
 ax.set_ylabel("Loss")
 ax.set_title("DistilRoBERTa2DistilRoBERTa Fine-Tuning Loss")
 ax.legend()
 fig.savefig(os.path.join(output_dir, "loss_curve.png"))
 plt.close(fig)
 print("DistilRoBERTa encoder-decoder fine-tuning complete. Model saved to", output_dir)

In [None]:
# distilroberta_evaluator.py
import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel
import evaluate
import random

In [None]:
# 1. Load test data
data_path = "recipes.csv"
# Same shuffle (seed 42) and 90% cut-off as the training cells, so this slice is
# the held-out final 10% of the shuffled rows.
df = pd.read_csv(data_path).sample(frac=1, random_state=42).reset_index(drop=True)
val_end = int(0.9 * len(df))
test_df = df.iloc[val_end:]

# One prompt per test recipe, in the same format used for training.
test_prompts = []
for title, ing in zip(test_df["Title"], test_df["Cleaned_Ingredients"]):
    test_prompts.append(f"Title: {title}\nIngredients: {ing}\nInstructions:")

# Reference instructions the generated text is scored against.
test_references = test_df["Instructions"].tolist()

In [None]:
# 2. Load the fine-tuned model and its tokenizer from the training output directory
model_dir = "distilroberta_recipe_model"

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = EncoderDecoderModel.from_pretrained(model_dir)
model.to(device)
# Switch to evaluation mode for inference.
model.eval()

In [None]:
# 3. Generate one prediction per test prompt (beam search, 4 beams, at most 300 tokens)
predictions = []
for prompt in test_prompts:
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
    generated = model.generate(
        encoded.input_ids.to(device),
        max_length=300,
        num_beams=4,
        early_stopping=True,
    )
    # Keep only the first returned sequence and drop special tokens from the text.
    predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))

In [None]:
 # 4. Compute evaluation metrics
# Load the metric implementations.
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# meteor = evaluate.load("meteor")

# BLEU takes a list of references per prediction, so wrap each single reference.
bleu_references = [[ref] for ref in test_references]
bleu_result = bleu.compute(predictions=predictions, references=bleu_references)
bleu_score = bleu_result["bleu"]

rouge_result = rouge.compute(predictions=predictions, references=test_references)
# meteor_score = meteor.compute(predictions=predictions, references=test_references)["meteor"]
rougeL = rouge_result["rougeL"]
def token_overlap_f1(pred, ref):
    """Return the F1 score over the sets of unique whitespace-separated tokens.

    Precision is the share of unique predicted tokens that also occur in the
    reference; recall is the share of unique reference tokens that also occur
    in the prediction. Repeated tokens count once.

    Args:
        pred: predicted text.
        ref: reference text.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when either text has
        no tokens or the two share none.
    """
    pred_vocab = set(pred.split())
    ref_vocab = set(ref.split())
    if not pred_vocab or not ref_vocab:
        return 0.0

    shared = len(pred_vocab & ref_vocab)
    if shared == 0:
        # No overlap: precision and recall are both zero.
        return 0.0

    precision = shared / len(pred_vocab)
    recall = shared / len(ref_vocab)
    return 2 * precision * recall / (precision + recall)
# Mean token-overlap F1 across all prediction/reference pairs.
f1_scores = [token_overlap_f1(p, r) for p, r in zip(predictions, test_references)]
avg_f1 = sum(f1_scores) / len(predictions)

print(f"BLEU: {bleu_score:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")
# print(f"METEOR: {meteor_score:.4f}")
print(f"F1-score: {avg_f1:.4f}")


In [None]:
 # 5. Show some example predictions
divider = "-" * 50
print("\nSample Predictions:")
# Pick three distinct test examples at random and show prompt, reference and output.
sample_indices = random.sample(range(len(test_prompts)), 3)
for idx in sample_indices:
    print(f"Prompt: {test_prompts[idx]}")
    print(divider)
    print(f"Reference: {test_references[idx]}")
    print(divider)
    print(f"Generated: {predictions[idx]}")
    print(divider)