In [1]:
 # distilbert_trainer.py
import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import matplotlib.pyplot as plt
import os



In [2]:
 # 1. Load and prepare data (prompts and targets)
data_path = "/kaggle/input/recipes/recipes.csv"
df = pd.read_csv(data_path)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
train_end = int(0.8 * len(df))
val_end = int(0.9 * len(df))
train_df = df.iloc[:train_end]
val_df = df.iloc[train_end:val_end]
test_df = df.iloc[val_end:]
train_prompts = [f"Title: {title}\nIngredients: {ing}\nInstructions:"for title, ing in zip(train_df["Title"], train_df["Cleaned_Ingredients"])]
train_targets = train_df["Instructions"].tolist()
val_prompts = [f"Title: {title}\nIngredients: {ing}\nInstructions:"for title, ing in zip(val_df["Title"], val_df["Cleaned_Ingredients"])]
val_targets = val_df["Instructions"].tolist()


In [3]:
from transformers import AutoTokenizer, EncoderDecoderModel

# Define encoder and decoder model names
encoder_name = "distilbert-base-uncased"
decoder_name = "distilgpt2"

# Load tokenizers for the encoder and decoder
tokenizer_encoder = AutoTokenizer.from_pretrained(encoder_name)
tokenizer_decoder = AutoTokenizer.from_pretrained(decoder_name)

# Initialize the encoder-decoder model
model = EncoderDecoderModel.from_encoder_decoder_pretrained(encoder_name, decoder_name)


# ----------------------------------------------------------------------------
# Set special tokens for generation
# For the encoder, DistilBERT already uses:
#   - [CLS] as a start token,
#   - [SEP] as an end token, and
#   - [PAD] as a padding token.
# For the GPT2 decoder, these tokens may not be set by default.
# We add special tokens to the decoder tokenizer using the encoder's tokens.
# ----------------------------------------------------------------------------

# If the decoder does not have a bos_token, set it using the encoder's [CLS] token.
if tokenizer_decoder.bos_token is None:
    tokenizer_decoder.add_special_tokens({'bos_token': tokenizer_encoder.cls_token})
# If the decoder does not have an eos_token, set it using the encoder's [SEP] token.
if tokenizer_decoder.eos_token is None:
    tokenizer_decoder.add_special_tokens({'eos_token': tokenizer_encoder.sep_token})
# If the decoder does not have a pad_token, set it using the encoder's [PAD] token.
if tokenizer_decoder.pad_token is None:
    tokenizer_decoder.add_special_tokens({'pad_token': tokenizer_encoder.pad_token})

# Update model configuration to use the decoder's tokens.
model.config.decoder_start_token_id = tokenizer_decoder.bos_token_id
model.config.eos_token_id = tokenizer_decoder.eos_token_id
model.config.pad_token_id = tokenizer_decoder.pad_token_id

# Ensure that the model's vocab size matches the decoder's vocab size.
model.config.vocab_size = len(tokenizer_decoder)

# Resize the decoder's token embeddings to accommodate any new special tokens.
model.decoder.resize_token_embeddings(len(tokenizer_decoder))

# Now the model is ready for training and generation.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.2.crossattention.c_attn.bias', 'transformer.h.2.crossattention.c_attn.weight', 'transformer.h.2.crossattention.c_proj.bias', 'transformer.h.2.cr

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(50258, 768)

In [4]:
print(type(train_prompts), type(train_targets))


<class 'list'> <class 'list'>


In [5]:
for i, p in enumerate(train_prompts):
    if not isinstance(p, str):
        print(f"train_prompts[{i}] is {type(p)}: {p}")

for i, t in enumerate(train_targets):
    if not isinstance(t, str):
        print(f"train_targets[{i}] is {type(t)}: {t}")


train_targets[201] is <class 'float'>: nan
train_targets[1972] is <class 'float'>: nan
train_targets[2506] is <class 'float'>: nan
train_targets[8969] is <class 'float'>: nan
train_targets[9106] is <class 'float'>: nan


In [6]:
import math

clean_train_prompts = []
clean_train_targets = []
for prompt, target in zip(train_prompts, train_targets):
    if isinstance(target, float) and math.isnan(target):
        continue  # Skip this sample
    clean_train_prompts.append(prompt)
    clean_train_targets.append(target)

# Do the same for validation data if necessary.


In [7]:
import math

clean_train_targets = ["" if (isinstance(t, float) and math.isnan(t)) else str(t)
                         for t in train_targets]
# Optionally, ensure prompts are also strings:
clean_train_prompts = [str(p) for p in train_prompts]


In [8]:
 # # 3. Tokenize the inputs and targets

# Define maximum lengths for inputs and targets
max_input_length = 512
max_target_length = 512

# Ensure your prompts and targets are lists of strings
train_prompts = [str(p) for p in train_prompts]
train_targets = [str(t) for t in train_targets]
val_prompts = [str(p) for p in val_prompts]
val_targets = [str(t) for t in val_targets]

# Tokenize inputs and targets
train_encodings = tokenizer_encoder(train_prompts, padding=True, truncation=True, max_length=max_input_length)
with tokenizer_decoder.as_target_tokenizer():
    train_target_encodings = tokenizer_decoder(train_targets, padding=True, truncation=True, max_length=max_target_length)
val_encodings = tokenizer_encoder(val_prompts, padding=True, truncation=True, max_length=max_input_length)
with tokenizer_decoder.as_target_tokenizer():
    val_target_encodings = tokenizer_decoder(val_targets, padding=True, truncation=True, max_length=max_target_length)

# Create datasets
train_dataset = [
    {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": label
    }
    for input_ids, attention_mask, label in zip(
        train_encodings["input_ids"],
        train_encodings["attention_mask"],
        train_target_encodings["input_ids"]
    )
]

val_dataset = [
    {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": label
    }
    for input_ids, attention_mask, label in zip(
        val_encodings["input_ids"],
        val_encodings["attention_mask"],
        val_target_encodings["input_ids"]
    )
]




In [9]:
# for i, p in enumerate(train_prompts):
#     if not isinstance(p, str):
#         print(f"train_prompts[{i}] is {type(p)}: {p}")

# for i, t in enumerate(train_targets):
#     if not isinstance(t, str):
#         print(f"train_targets[{i}] is {type(t)}: {t}")


In [10]:
# 4. Set up Trainer
output_dir = "distilbert_recipe_model"
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,
    report_to="none"
 )
data_collator = DataCollatorForSeq2Seq(tokenizer_encoder, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
 )
 # 5. Train
trainer.train()
 # 6. Save best model and tokenizer
trainer.save_model(output_dir)
tokenizer_encoder.save_pretrained(output_dir)




Epoch,Training Loss,Validation Loss
1,1.1221,0.992628
2,0.983,0.936924
3,0.9835,0.916499


There were missing keys in the checkpoint model loaded: ['decoder.lm_head.weight'].


('distilbert_recipe_model/tokenizer_config.json',
 'distilbert_recipe_model/special_tokens_map.json',
 'distilbert_recipe_model/vocab.txt',
 'distilbert_recipe_model/added_tokens.json',
 'distilbert_recipe_model/tokenizer.json')

In [11]:
 # 7. Plot loss curves
 logs = trainer.state.log_history
 train_losses = [entry.get("loss") for entry in logs if "loss" in entry]
 eval_losses = [entry.get("eval_loss") for entry in logs if "eval_loss" in entry]
 plt.figure()
 plt.plot(train_losses, label="Training Loss")
 plt.plot(eval_losses, label="Validation Loss")
 plt.xlabel("Logging Step or Epoch")
 plt.ylabel("Loss")
 plt.title("DistilBERT2DistilBERT Fine-Tuning Loss")
 plt.legend()
 plt.savefig(os.path.join(output_dir, "loss_curve.png"))
 plt.close()
 print("DistilBERT encoder-decoder fine-tuning complete. Model saved to", output_dir)

DistilBERT encoder-decoder fine-tuning complete. Model saved to distilbert_recipe_model


In [12]:
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(
#     output_dir='./results',
#     evaluation_strategy='epoch',  # Evaluate at the end of each epoch
#     save_strategy='epoch',        # Save the model at the end of each epoch
#     load_best_model_at_end=True,  # Load the best model at the end of training
#     metric_for_best_model='accuracy',  # Metric to determine the best model
#     greater_is_better=True,       # Specify if a higher metric is better
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=64,
#     num_train_epochs=4,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
# )
# trainer.train()

In [13]:
# from sklearn.metrics import accuracy_score, classification_report

# # Get predictions
# predictions = trainer.predict(test_dataset)
# preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)

# # Calculate accuracy
# accuracy = accuracy_score(test_labels, preds)
# print(f'Accuracy: {accuracy:.4f}')

# # Display classification report
# print(classification_report(test_labels, preds, target_names=label_encoder.classes_))


In [14]:
# import matplotlib.pyplot as plt

# # Save the model
# model.save_pretrained('./distilbert_recipe_classifier')
# tokenizer.save_pretrained('./distilbert_recipe_classifier')

# # Plot confusion matrix
# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# cm = confusion_matrix(test_labels, preds)
# disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [15]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=a48e15efbe1fa972a3999d60e9f44971fd76b6a317a59640e592a6e53c711fa7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [16]:
 # distilbert_evaluator.py

import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel
import evaluate
import random

In [17]:
 # 1. Load test data
data_path = "/kaggle/input/recipes/recipes.csv"
df = pd.read_csv(data_path)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
val_end = int(0.9 * len(df))
test_df = df.iloc[val_end:]
test_prompts = [f"Title: {title}\nIngredients: {ing}\nInstructions:"for title, ing in zip(test_df["Title"], test_df["Cleaned_Ingredients"])]
test_references = test_df["Instructions"].tolist()

In [18]:
 # 2. Load model and tokenizer
model_dir = "distilbert_recipe_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = EncoderDecoderModel.from_pretrained(model_dir)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EncoderDecoderModel(
  (encoder): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): 

In [19]:
 # 3. Generate predictions
predictions = []
for prompt in test_prompts:
    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)
    outputs = model.generate(input_ids, max_length=300, num_beams=4, early_stopping=True)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions.append(pred_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

In [20]:
 # 4. Compute metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
 # meteor = evaluate.load("meteor")
bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in test_references])["bleu"]
rouge_result = rouge.compute(predictions=predictions, references=test_references)
# meteor_score = meteor.compute(predictions=predictions, references=test_references)["meteor"]
rougeL = rouge_result["rougeL"]
def token_overlap_f1(pred, ref):
    pred_tokens = pred.split()
    ref_tokens = ref.split()
    if len(pred_tokens) == 0 or len(ref_tokens) == 0:
        return 0.0
    common = set(pred_tokens) & set(ref_tokens)
    prec = len(common) / len(set(pred_tokens))
    rec = len(common) / len(set(ref_tokens))
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)
avg_f1 = sum(token_overlap_f1(p, r) for p, r in zip(predictions, test_references)) / len(predictions)
print(f"BLEU: {bleu_score:.4f}")
print(f"ROUGE-L: {rougeL:.4f}")
# print(f"METEOR: {meteor_score:.4f}")
print(f"F1-score: {avg_f1:.4f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

BLEU: 0.0000
ROUGE-L: 0.0020
F1-score: 0.0025


In [21]:
 # 5. Print sample outputs
print("\nSample Predictions:")
for idx in random.sample(range(len(test_prompts)), 2):
    print(f"Prompt: {test_prompts[idx]}")
    print("-" * 50)
    print(f"Reference: {test_references[idx]}")
    print("-" * 50)
    print(f"Generated: {predictions[idx]}")
    print("-" * 50)


Sample Predictions:
Prompt: Title: Blackened Leeks With Asparagus and Boiled Eggs
Ingredients: ['4 large eggs', '4 medium leeks, white and pale-green parts only, halved lengthwise', '2 tablespoons unsalted butter', '1 bunch asparagus, trimmed', 'Kosher salt, freshly ground pepper', '1 tablespoon fresh lemon juice', '1 tablespoon whole grain mustard', 'Flaky sea salt']
Instructions:
--------------------------------------------------
Reference: Gently lower eggs into a medium pot of boiling water; cook 6 minutes. Transfer eggs to a bowl of ice water and let cool.
Heat a dry large cast-iron skillet over medium-high until smoking hot. Cook leeks, cut side down, pressing to ensure contact with skillet, until blackened, about 5 minutes. Reduce heat to medium; add butter and rotate skillet to evenly coat leeks. Transfer leeks to a platter with a slotted spoon.
Add asparagus to skillet, season with kosher salt and pepper, and cook, tossing occasionally, until bright green and crisp-tender, ab