In [1]:
 # t5_trainer.py
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import matplotlib.pyplot as plt
import os

In [2]:
 # 1. Load and preprocess the dataset
data_path = "/kaggle/input/recipedata/recipes.csv"
df = pd.read_csv(data_path)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_end = int(0.8 * len(df))
val_end = int(0.9 * len(df))
train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]

In [3]:
train_prompts = [f"Title: {title}\nIngredients: {ing}\nInstructions:"for title, ing in zip(train_df["Title"], train_df["Cleaned_Ingredients"])]
train_targets = [str(instr) for instr in train_df["Instructions"].tolist()]
val_prompts = [f"Title: {title}\nIngredients: {ing}\nInstructions:"for title, ing in zip(val_df["Title"], val_df["Cleaned_Ingredients"])]
val_targets = [str(instr) for instr in val_df["Instructions"].tolist()]

In [4]:
train_df["Instructions"] = train_df["Instructions"].fillna("")
val_df["Instructions"] = val_df["Instructions"].fillna("")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["Instructions"] = train_df["Instructions"].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df["Instructions"] = val_df["Instructions"].fillna("")


In [5]:
 # 2. Initialize the tokenizer and model
model_name = "t5-small"  # or "t5-base" for a slightly larger model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
for i, target in enumerate(train_targets):
    if not isinstance(target, str):
        print(f"Non-string element at index {i}: {target} (type: {type(target)})")


In [7]:
 # 3. Tokenize the prompts and targets
 # We will tokenize such that the model's encoder gets the prompt and decoder learns to generate the target.
 # Use padding and truncation to handle varying lengths.
max_input_length = 512   # max tokens for input (adjustable based on dataset)
max_target_length = 512  # max tokens for output (adjust as needed)
 # Tokenize training data
 # Tokenize training data
train_encodings = tokenizer(train_prompts, padding=True, truncation=True, max_length=max_input_length)
with tokenizer.as_target_tokenizer():
    train_target_encodings = tokenizer(train_targets, padding=True, truncation=True, max_length=max_target_length)

# Tokenize validation data
val_encodings = tokenizer(val_prompts, padding=True, truncation=True, max_length=max_input_length)
with tokenizer.as_target_tokenizer():
    val_target_encodings = tokenizer(val_targets, padding=True, truncation=True, max_length=max_target_length)




In [8]:
from torch.utils.data import Dataset
df['label'] = df['Image_Name'].apply(lambda x: x.split('_')[0])

# from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Ensure every key in encodings returns its idx-th element
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create your dataset instances
train_dataset = MyDataset(train_encodings, train_target_encodings["input_ids"])
val_dataset = MyDataset(val_encodings, val_target_encodings["input_ids"])


In [9]:
from datasets import Dataset

train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_target_encodings["input_ids"]
})

val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": val_target_encodings["input_ids"]
})


In [10]:
 # Note: In the labels, T5 uses -100 internally for padding positions by default 
# (the DataCollator will handle replacing pad tokens with -100 in labels).
 # 4. Set up the Trainer with TrainingArguments
output_dir = "t5_recipe_model" 
training_args = TrainingArguments(
    output_dir=output_dir,
    disable_tqdm=False,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",       # evaluate at end of each epoch
    save_strategy="epoch",             # save model at end of each epoch
    logging_strategy="steps",
    logging_steps=50,                  # log training loss every 50 steps
    save_total_limit=1,                # only keep the best model (1 checkpoint)
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # use validation loss to select best model
    greater_is_better=False,
    fp16=True,                         # use mixed precision for speed (A100 supports this)
    report_to="none"                   # no third-party logging (just print to console)
 )
 # Use DataCollatorForSeq2Seq to handle padding of sequences and labels
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
 # 5. Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
    # (We don't define compute_metrics here, we'll compute metrics in the evaluation script)
 )
 # 6. Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.4873,1.29778
2,1.4047,1.239114
3,1.3858,1.224138


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=2025, training_loss=1.5173396659191745, metrics={'train_runtime': 1969.2004, 'train_samples_per_second': 16.453, 'train_steps_per_second': 1.028, 'total_flos': 4385074367692800.0, 'train_loss': 1.5173396659191745, 'epoch': 3.0})

In [11]:
 # 7. Save the best model and tokenizer
trainer.save_model(output_dir)            # this saves the best model (because 
load_best_model_at_end=True
tokenizer.save_pretrained(output_dir)     # save the tokenizer files

('t5_recipe_model/tokenizer_config.json',
 't5_recipe_model/special_tokens_map.json',
 't5_recipe_model/spiece.model',
 't5_recipe_model/added_tokens.json')

In [12]:
 # 8. Plot training & validation loss curves
 # Extract logged history of losses
logs = trainer.state.log_history
train_losses = [entry["loss"] for entry in logs if "loss" in entry]
eval_losses = [entry["eval_loss"] for entry in logs if "eval_loss" in entry]
plt.figure()
plt.plot(train_losses, label="Training Loss")
plt.plot(eval_losses, label="Validation Loss")
plt.xlabel("Logging Step or Epoch")
plt.ylabel("Loss")
plt.title("T5 Fine-Tuning Loss")
plt.legend()
plt.savefig(os.path.join(output_dir, "loss_curve.png"))
plt.close()
print("Training complete. Best model saved to", output_dir)

Training complete. Best model saved to t5_recipe_model


In [13]:
plt.show()

In [14]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate
import random

In [16]:
 # 1. Load the test data (same splitting logic to get test_df as used in training)
data_path = "//kaggle/input/recipedata/recipes.csv"
df = pd.read_csv(data_path)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
val_end = int(0.9 * len(df))
test_df = df.iloc[val_end:]  # last 10% as test
test_prompts = [
   f"Title: {title}\nIngredients: {ing}\nInstructions:"
   for title, ing in zip(test_df["Title"], test_df["Cleaned_Ingredients"])]
test_references = test_df["Instructions"].tolist()

In [17]:
 # 2. Load the fine-tuned model and tokenizer
model_dir = "/kaggle/working/t5_recipe_model"  # path where the fine-tuned model is saved
tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)
model.eval()
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [28]:
# !pip install meteor_score

In [18]:
 # 3. Generate predictions for the test set
predictions = []
for prompt in test_prompts:
# Tokenize the input prompt and generate output
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)
    outputs = model.generate(inputs, max_length=300, num_beams=4, early_stopping=True)
    # Decode the generated sequence to text
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(pred_text)

In [19]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=2d7db4fbe313ab3172f5c799e06c34f6440f6820d0084d30f62cb9bb479c6406
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [32]:
# 4. Compute EVALUATION Metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
# meteor = evaluate.load("meteor")
# BLEU expects a list of references for each prediction (e.g., list of lists)
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in test_references])["bleu"]
rouge_score = rouge.compute(predictions=predictions, references=test_references)
# meteor_score = meteor.compute(predictions=predictions, references=test_references)["meteor"]

In [33]:
 # We will use ROUGE-L F1 score as our ROUGE-L metric:
rougeL = rouge_score["rougeL"]  # this is typically the F1 score for ROUGE-L
 # (The rouge metric returns several values; we take 'rougeL' which is the F1 measure of longest common subsequence.)
 # F1-score: define as average token overlap F1 (precision/recall over words)
def token_overlap_f1(pred, ref):
    pred_tokens = pred.split()
    ref_tokens = ref.split()
    if len(pred_tokens) == 0 or len(ref_tokens) == 0:
        return 0.0
    common = set(pred_tokens) & set(ref_tokens)
    prec = len(common) / len(set(pred_tokens))
    rec = len(common) / len(set(ref_tokens))
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)
f1_scores = [token_overlap_f1(p, r) for p, r in zip(predictions, test_references)]
avg_f1 = sum(f1_scores) / len(f1_scores)
print(f"BLEU: {bleu_score:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")
# print(f"METEOR: {meteor_score:.4f}")
print(f"F1-score: {avg_f1:.4f}")

BLEU: 0.0600
ROUGE-L: 0.2076
F1-score: 0.2462


In [34]:
 # 5. Print qualitative examples
print("\nSample Predictions:")
for idx in random.sample(range(len(test_prompts)), 3):
    print(f"Prompt: {test_prompts[idx]}")
    print("-" * 50)
    print(f"Reference: {test_references[idx]}")
    print("-" * 50)
    print(f"Generated: {predictions[idx]}")
    print("-" * 50)


Sample Predictions:
Prompt: Title: Spaghetti with No-Cook Tomato Sauce and Hazelnuts
Ingredients: ['1/2 cup blanched hazelnuts', '1 pound cherry tomatoes, halved', '1 teaspoon kosher salt, plus more', '12 ounces spaghetti or linguine', '1 beefsteak tomato, chopped', '2 garlic cloves, crushed', '1 teaspoon crushed red pepper flakes', '1 cup basil leaves, divided', '2 small zucchini (about 8 ounces), coarsely grated', '1/4 cup olive oil, plus more for drizzling', 'Freshly ground black pepper', '1 ounce ricotta salata (salted dry ricotta)', 'shaved']
Instructions:
Reference: Preheat oven to 350°F. Toast hazelnuts on a rimmed baking sheet, tossing once, until golden brown, 8–10 minutes. Let cool, then coarsely chop.
Place cherry tomatoes in a large bowl; season with salt.
Cook spaghetti in a large pot of boiling salted water, stirring occasionally, until al dente. Drain pasta, reserving 1/4 cup pasta cooking liquid.
Meanwhile, puree beefsteak tomato, garlic, red pepper flakes, 1/2 cup bas