# ![Banner](https://github.com/LittleHouse75/flatiron-resources/raw/main/NevitsBanner.png)
---
# Experiment 2 — BART & T5 (Pretrained Seq2Seq Models)
---

This notebook fine-tunes and evaluates two **pretrained sequence-to-sequence models** on SAMSum dialogue summarization:

- **BART** — denoising autoencoder for seq2seq  
- **T5** — text-to-text transformer trained on C4  

Both are pretrained encoder–decoder models and provide strong baselines compared to Experiment 1’s custom BERT→GPT-2 Frankenstein.

We reuse shared components from `src/`:

- SAMSum dataset loader  
- `SummaryDataset`  
- BART / T5 model builders  
- Shared seq2seq trainer (early stopping + checkpoints)  
- Qualitative preview utilities  

## 1. Environment Setup

In [1]:

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from pathlib import Path
import pandas as pd
import sys

from torch.utils.data import DataLoader
import torch.optim as optim

# Resolve the repository root (the parent of the current working directory)
# and put it first on sys.path so the `src` package can be imported below.
PROJECT_ROOT = Path("..").resolve()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

## 2. Imports from src/

In [2]:
from src.data.load_data import load_samsum
from src.data.preprocess import SummaryDataset
from src.models.build_bart import build_bart_model
from src.models.build_t5 import build_t5_model
from src.train.trainer_seq2seq import train_model
from src.eval.qualitative import qualitative_samples
from src.utils.logging import heading, line

## 3. Hyperparameters

In [3]:
# --- Sequence lengths -------------------------------------------------------
MAX_SOURCE_LEN = 512     # passed to SummaryDataset as max_source_len
MAX_TARGET_LEN = 128     # passed as max_target_len; also used as generation max_length

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 3e-5     # good default for pretrained seq2seq
PATIENCE = 2             # early stopping
GRAD_ACCUM_STEPS = 1

# --- Reproducibility --------------------------------------------------------
# Seed torch's global RNG before any model or DataLoader is created, so that
# shuffling and other torch-driven randomness repeat across Restart & Run All.
# The trailing semicolon suppresses the Generator repr in the cell output.
SEED = 42
torch.manual_seed(SEED);

## 4. Load SAMSum

In [4]:
# Load the SAMSum train / validation / test splits and display their sizes.
train_df, val_df, test_df = load_samsum()
tuple(map(len, (train_df, val_df, test_df)))

(14731, 818, 819)

## 5. Train BART

In [5]:
heading("Build BART model")

# Build the model/tokenizer pair and move the model to the selected device.
bart_model, bart_tokenizer = build_bart_model()
bart_model = bart_model.to(device)

# Generation defaults: cap output at the target length and decode with
# 4 beams, a 3-gram repetition block and a length penalty.
gen_cfg = bart_model.generation_config
for option, value in {
    "max_length": MAX_TARGET_LEN,
    "min_length": 5,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}.items():
    setattr(gen_cfg, option, value)

bart_tokenizer


Build BART model


BartTokenizer(name_or_path='facebook/bart-base', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}
)

In [None]:
heading("Prepare BART datasets")

# BART uses a single tokenizer, so the same instance is passed for both the
# encoder (dialogue) and decoder (summary) sides.
bart_dataset_kwargs = dict(
    encoder_tokenizer=bart_tokenizer,
    decoder_tokenizer=bart_tokenizer,
    max_source_len=MAX_SOURCE_LEN,
    max_target_len=MAX_TARGET_LEN,
)
bart_train_dataset = SummaryDataset(train_df, **bart_dataset_kwargs)
bart_val_dataset = SummaryDataset(val_df, **bart_dataset_kwargs)

# Only the training loader shuffles; validation keeps a fixed order.
bart_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0)
bart_train_loader = DataLoader(bart_train_dataset, shuffle=True, **bart_loader_kwargs)
bart_val_loader = DataLoader(bart_val_dataset, shuffle=False, **bart_loader_kwargs)


Prepare BART datasets


In [7]:
heading("Configure BART optimizer")

# AdamW over every BART parameter at the shared learning rate; all other
# optimizer settings are left at the torch defaults.
bart_optimizer = optim.AdamW(
    params=bart_model.parameters(),
    lr=LEARNING_RATE,
)
bart_optimizer


Configure BART optimizer


AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: True
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 3e-05
    maximize: False
    weight_decay: 0.01
)

In [8]:
heading("Train BART")

# Directory handed to the trainer as its checkpoint location.
BART_CKPT_DIR = PROJECT_ROOT.joinpath("models", "bart", "best")

# Gather the trainer arguments in one place, then launch the run.
bart_train_kwargs = dict(
    model=bart_model,
    train_loader=bart_train_loader,
    val_loader=bart_val_loader,
    optimizer=bart_optimizer,
    tokenizer=bart_tokenizer,
    device=device,
    epochs=EPOCHS,
    max_target_len=MAX_TARGET_LEN,
    checkpoint_dir=str(BART_CKPT_DIR),
    patience=PATIENCE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
)
bart_history = train_model(**bart_train_kwargs)

bart_history


Train BART


Epoch 1/5:   0%|          | 0/3683 [00:03<?, ?it/s]

KeyboardInterrupt: 

## 6. Qualitative BART Examples

In [None]:
heading("Qualitative BART samples")

# Preview 5 validation examples with the current BART model.
bart_preview_kwargs = dict(
    df=val_df,
    model=bart_model,
    encoder_tokenizer=bart_tokenizer,
    decoder_tokenizer=bart_tokenizer,
    device=device,
    max_target_len=MAX_TARGET_LEN,
    n=5,
)
qualitative_samples(**bart_preview_kwargs)

## 7. Save Full BART Model

In [None]:
# Persist the in-memory BART model and its tokenizer side by side.
# NOTE(review): this saves whatever weights are in memory after training;
# whether they match the "best" checkpoint depends on train_model — confirm.
SAVE_DIR_BART = PROJECT_ROOT.joinpath("models", "bart", "final")
SAVE_DIR_BART.mkdir(parents=True, exist_ok=True)

for artifact in (bart_model, bart_tokenizer):
    artifact.save_pretrained(SAVE_DIR_BART)

print("Saved BART model to:", SAVE_DIR_BART)

---
# Train T5

T5 is a text-to-text model. For summarization it expects a task prefix like:

`"summarize: {dialogue_text}"`

We apply this prefix to the **source** text only; targets stay as the human summaries.

---

## 8. Train T5

In [None]:
heading("Build T5 model")

# Build the t5-small model/tokenizer pair and move the model to the device.
t5_model, t5_tokenizer = build_t5_model("t5-small")
t5_model = t5_model.to(device)

# Same generation defaults as used for BART: cap output at the target length
# and decode with 4 beams, a 3-gram repetition block and a length penalty.
gen_cfg_t5 = t5_model.generation_config
for option, value in {
    "max_length": MAX_TARGET_LEN,
    "min_length": 5,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
    "num_beams": 4,
}.items():
    setattr(gen_cfg_t5, option, value)

t5_tokenizer

In [None]:
heading("Prepare T5 datasets (with 'summarize:' prefix)")

# Prepend the task prefix to the dialogue column only; summaries are left
# untouched. `assign` returns new frames, so train_df / val_df are unchanged.
train_df_prefixed = train_df.assign(dialogue="summarize: " + train_df["dialogue"])
val_df_prefixed = val_df.assign(dialogue="summarize: " + val_df["dialogue"])

# T5 uses a single tokenizer for both the source and target sides.
t5_dataset_kwargs = dict(
    encoder_tokenizer=t5_tokenizer,
    decoder_tokenizer=t5_tokenizer,
    max_source_len=MAX_SOURCE_LEN,
    max_target_len=MAX_TARGET_LEN,
)
t5_train_dataset = SummaryDataset(train_df_prefixed, **t5_dataset_kwargs)
t5_val_dataset = SummaryDataset(val_df_prefixed, **t5_dataset_kwargs)

# Only the training loader shuffles; validation keeps a fixed order.
t5_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=0)
t5_train_loader = DataLoader(t5_train_dataset, shuffle=True, **t5_loader_kwargs)
t5_val_loader = DataLoader(t5_val_dataset, shuffle=False, **t5_loader_kwargs)

In [None]:
heading("Configure T5 optimizer")

# AdamW over every T5 parameter, reusing the learning rate shared with BART.
# NOTE(review): T5 fine-tuning with AdamW is often reported to prefer a higher
# learning rate (roughly 1e-4 to 3e-4) — confirm against the Hugging Face T5 docs.
t5_optimizer = optim.AdamW(
    params=t5_model.parameters(),
    lr=LEARNING_RATE,
)
t5_optimizer

In [None]:
heading("Train T5")

# Directory handed to the trainer as its checkpoint location.
T5_CKPT_DIR = PROJECT_ROOT.joinpath("models", "t5", "best")

# Gather the trainer arguments in one place, then launch the run.
t5_train_kwargs = dict(
    model=t5_model,
    train_loader=t5_train_loader,
    val_loader=t5_val_loader,
    optimizer=t5_optimizer,
    tokenizer=t5_tokenizer,
    device=device,
    epochs=EPOCHS,
    max_target_len=MAX_TARGET_LEN,
    checkpoint_dir=str(T5_CKPT_DIR),
    patience=PATIENCE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
)
t5_history = train_model(**t5_train_kwargs)

t5_history

## 9. Qualitative T5 Examples

In [None]:
heading("Qualitative T5 samples")

# Preview 5 validation examples. The prefixed frame is used so the model sees
# the same "summarize: " inputs it was trained on.
t5_preview_kwargs = dict(
    df=val_df_prefixed,   # prefixed source text
    model=t5_model,
    encoder_tokenizer=t5_tokenizer,
    decoder_tokenizer=t5_tokenizer,
    device=device,
    max_target_len=MAX_TARGET_LEN,
    n=5,
)
qualitative_samples(**t5_preview_kwargs)

## 10. Save Full T5 Model

In [None]:
# Persist the in-memory T5 model and its tokenizer side by side.
# NOTE(review): this saves whatever weights are in memory after training;
# whether they match the "best" checkpoint depends on train_model — confirm.
SAVE_DIR_T5 = PROJECT_ROOT.joinpath("models", "t5", "final")
SAVE_DIR_T5.mkdir(parents=True, exist_ok=True)

for artifact in (t5_model, t5_tokenizer):
    artifact.save_pretrained(SAVE_DIR_T5)

print("Saved T5 model to:", SAVE_DIR_T5)

---
# 11. Key Takeaways — Experiment 2

**TODO:** Summarize the BART and T5 results here once both training runs have completed.

---