# Project: CNN/DailyMail Summarization & Headline Models
- โมเดล 1: สรุปข้อความยาวให้กระชับ (เทรนจากศูนย์)
- โมเดล 2: สร้างหัวข้อจากข้อความ (ใช้ pretrain)
- Dataset: CNN/DailyMail 287k (train), 13k (val), 11k (test)
- ใช้ GPU + mixed precision + checkpoint every N steps

> รันทีละเซลล์ตามลำดับ ห้ามข้าม epoch ที่ตั้งไว้ระหว่างเทรน

In [4]:
# ✅ Setup & Imports
import os, math, json, random
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertConfig,
    EncoderDecoderModel,
    get_linear_schedule_with_warmup
)
from torch.cuda.amp import GradScaler, autocast

# Seed every RNG this notebook touches: Python's `random`, torch on CPU,
# and torch on all CUDA devices.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Run on the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"üî• Device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")

  from .autonotebook import tqdm as notebook_tqdm


üî• Device: cuda
CUDA available: True


  if not hasattr(np, "object"):


In [5]:
# Config: hyper-parameters and paths shared by every later cell.
cfg = dict(
    model_name='bert-base-uncased',
    max_src_len=512,        # max article length in tokens
    max_tgt_len=150,        # max summary length in tokens
    max_head_len=32,        # max headline length in tokens
    train_batch=4,
    eval_batch=4,
    grad_accum=4,           # micro-batches accumulated per optimizer step
    lr=3e-5,
    weight_decay=0.01,
    num_epochs_sum=8,       # Summarization: 8 epochs (trained from scratch)
    num_epochs_head=5,      # Headline: 5 epochs (uses pretrained weights)
    warmup_ratio=0.05,      # fraction of optimizer steps spent in LR warmup
    ckpt_every=2000,        # NOTE(review): not read by any cell visible here
    save_dir='checkpoints_new',
)
# Make sure the checkpoint directory exists, then echo the settings.
os.makedirs(cfg['save_dir'], exist_ok=True)
print(json.dumps(cfg, indent=2))

{
  "model_name": "bert-base-uncased",
  "max_src_len": 512,
  "max_tgt_len": 150,
  "max_head_len": 32,
  "train_batch": 4,
  "eval_batch": 4,
  "grad_accum": 4,
  "lr": 3e-05,
  "weight_decay": 0.01,
  "num_epochs_sum": 8,
  "num_epochs_head": 5,
  "warmup_ratio": 0.05,
  "ckpt_every": 2000,
  "save_dir": "checkpoints_new"
}


In [6]:
# Load CNN/DailyMail (287k training articles), configuration '3.0.0', through
# HuggingFace `datasets`. Make sure there is enough free disk space first.
dataset = load_dataset(path='cnn_dailymail', name='3.0.0')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [7]:
# Tokenizer: BERT uncased by default. A custom vocabulary can be swapped in
# by enabling the flag below, but only if the tokenizer file actually exists.
use_custom_tokenizer = False
custom_tokenizer_path = 'my_tokenizer_287k.json'

if not (use_custom_tokenizer and os.path.exists(custom_tokenizer_path)):
    # Default path: the tokenizer that matches cfg['model_name'].
    tokenizer = BertTokenizerFast.from_pretrained(cfg['model_name'])
else:
    tokenizer = BertTokenizerFast(tokenizer_file=custom_tokenizer_path)

# Cap the tokenizer's own length limit at the source length used for articles.
tokenizer.model_max_length = cfg['max_src_len']
print(f"Tokenizer vocab size: {len(tokenizer)}")

Tokenizer vocab size: 30522


In [8]:
# 📚 Dataset wrappers
class SummarizationDataset(Dataset):
    """Expose one split of the global `dataset` as (article, summary) pairs.

    Each item is a dict with the raw article text under 'article' and the
    record's 'highlights' field under 'summary'. No tokenisation happens here.
    """

    def __init__(self, split):
        # `dataset` is the module-level object loaded earlier in the notebook.
        self.data = dataset[split]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        article, summary = record['article'], record['highlights']
        return dict(article=article, summary=summary)

class HeadlineDataset(Dataset):
    """Expose one split of the global `dataset` as (article, headline) pairs.

    The data has no separate headline field in this notebook, so the record's
    'highlights' text is reused as a short headline target.
    """

    def __init__(self, split):
        # `dataset` is the module-level object loaded earlier in the notebook.
        self.data = dataset[split]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        article, headline = record['article'], record['highlights']
        return dict(article=article, headline=headline)

def collate_batch(batch, max_src, max_tgt):
    """Tokenise a list of examples into one padded model-ready batch.

    Args:
        batch: list of dicts, each with an 'article' key and a target under
            'summary' or 'headline' (an empty string is used if neither exists).
        max_src: truncation length, in tokens, for the articles.
        max_tgt: truncation length, in tokens, for the targets.

    Returns:
        A dict holding the tokenizer's tensors for the articles plus 'labels',
        with every tensor already moved to the global `device`.
    """
    articles, targets = [], []
    for example in batch:
        articles.append(example['article'])
        targets.append(example.get('summary', example.get('headline', '')))

    tok_kwargs = dict(padding=True, truncation=True, return_tensors='pt')
    model_inputs = tokenizer(articles, max_length=max_src, **tok_kwargs)
    target_ids = tokenizer(targets, max_length=max_tgt, **tok_kwargs)['input_ids']

    # Padding positions become -100 so they are excluded from the loss
    # (presumably the loss's ignore index; confirm against the model's loss).
    model_inputs['labels'] = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

    return {name: tensor.to(device) for name, tensor in model_inputs.items()}

In [9]:
# DataLoaders: one Dataset wrapper per task and split, then one loader each.
train_sum = SummarizationDataset('train')
val_sum = SummarizationDataset('validation')
train_head = HeadlineDataset('train')
val_head = HeadlineDataset('validation')


def _make_loader(data, batch_key, tgt_len_key, shuffle):
    """Build a single-process DataLoader whose batches come from collate_batch.

    `batch_key` and `tgt_len_key` are names of `cfg` entries. The length
    limits are looked up in `cfg` each time a batch is collated, not once at
    construction.
    """
    return DataLoader(
        data,
        batch_size=cfg[batch_key],
        shuffle=shuffle,
        num_workers=0,
        collate_fn=lambda b: collate_batch(b, cfg['max_src_len'], cfg[tgt_len_key]),
    )


sum_loader = _make_loader(train_sum, 'train_batch', 'max_tgt_len', True)
val_sum_loader = _make_loader(val_sum, 'eval_batch', 'max_tgt_len', False)

head_loader = _make_loader(train_head, 'train_batch', 'max_head_len', True)
val_head_loader = _make_loader(val_head, 'eval_batch', 'max_head_len', False)

print(len(train_sum), len(val_sum), len(train_head), len(val_head))

287113 13368 287113 13368


In [10]:
# Build the summarization model from scratch: the architecture is described by
# the configs below and EncoderDecoderModel is constructed from the config
# alone, not from a pretrained checkpoint.
from transformers import EncoderDecoderConfig

# Encoder and decoder share the same BERT configuration (cfg['model_name']).
enc_cfg = BertConfig.from_pretrained(cfg['model_name'])
dec_cfg = BertConfig.from_pretrained(cfg['model_name'])
encdec_cfg = EncoderDecoderConfig.from_encoder_decoder_configs(enc_cfg, dec_cfg)

enc_dec_model_sum = EncoderDecoderModel(encdec_cfg)
# re-init weights (from scratch): apply the model's initialiser to every
# submodule of both halves.
enc_dec_model_sum.encoder.apply(enc_dec_model_sum._init_weights)
enc_dec_model_sum.decoder.apply(enc_dec_model_sum._init_weights)
# Special-token ids taken from the tokenizer: decoding starts at CLS, ends at
# SEP and pads with PAD.
enc_dec_model_sum.config.decoder_start_token_id = tokenizer.cls_token_id
enc_dec_model_sum.config.pad_token_id = tokenizer.pad_token_id
enc_dec_model_sum.config.eos_token_id = tokenizer.sep_token_id
# Mirror the encoder's vocabulary size on the top-level config.
enc_dec_model_sum.config.vocab_size = enc_dec_model_sum.config.encoder.vocab_size
enc_dec_model_sum.to(device)

# Total parameter count, in millions.
print("Summarization params (M):", sum(p.numel() for p in enc_dec_model_sum.parameters())/1e6)

Summarization params (M): 247.363386


In [11]:
# 🔁 Training loop helper

def train_epoch(model, loader, optimizer, scheduler, scaler, epoch_num, total_epochs, max_grad_norm=1.0,
                grad_accum=None):
    """Run one mixed-precision training epoch with gradient accumulation.

    Args:
        model: callable as ``model(**batch)``, returning an object with ``.loss``.
        loader: sized iterable of keyword-argument dicts (must be non-empty).
        optimizer: optimizer over ``model.parameters()``.
        scheduler: learning-rate scheduler, stepped once per optimizer update.
        scaler: gradient scaler providing ``scale``/``unscale_``/``step``/``update``.
        epoch_num: current epoch number, used only in progress messages.
        total_epochs: total epoch count, used only in progress messages.
        max_grad_norm: gradient-norm clipping threshold.
        grad_accum: micro-batches per optimizer update; defaults to
            ``cfg['grad_accum']`` when omitted.

    Returns:
        Mean un-scaled loss per batch over the epoch.
    """
    if grad_accum is None:
        grad_accum = cfg['grad_accum']

    model.train()
    total_loss = 0
    optimizer.zero_grad()
    total_steps = len(loader)

    for step, batch in enumerate(loader, 1):
        with torch.amp.autocast('cuda'):
            outputs = model(**batch)
            # Divide so the accumulated gradient is an average over the window.
            loss = outputs.loss / grad_accum
        scaler.scale(loss).backward()

        # Update at the end of every full window AND on the last batch.
        # Without the second condition, a trailing partial window
        # (len(loader) % grad_accum batches) is back-propagated but never
        # applied, and the scheduler falls one step short per epoch of a
        # ceil(len(loader) / grad_accum) step budget. The partial window's
        # gradient stays divided by grad_accum, so that last update is smaller.
        if step % grad_accum == 0 or step == total_steps:
            # Unscale before clipping so the threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        # Undo the division so the running total is in un-scaled loss units.
        total_loss += loss.item() * grad_accum

        # Progress indicator every 50 steps, and once on the final step.
        if step % 50 == 0 or step == total_steps:
            avg_loss = total_loss / step
            print(f"  Epoch {epoch_num}/{total_epochs} | Step {step}/{total_steps} | Loss: {avg_loss:.4f}")

    return total_loss / len(loader)

def evaluate(model, loader):
    """Return the mean per-batch loss of `model` over `loader`.

    The model is switched to eval mode and every forward pass runs with
    gradient tracking disabled. Each batch is passed as ``model(**batch)`` and
    the result's ``.loss`` is read. `loader` must be non-empty.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for eval_batch in loader:
            running_loss += model(**eval_batch).loss.item()
    n_batches = len(loader)
    return running_loss / n_batches

def save_ckpt(model, name):
    """Write `model` and the global tokenizer to ``cfg['save_dir']/name``.

    Only model and tokenizer files are written through their
    ``save_pretrained`` methods; optimizer, scheduler and scaler state are
    not saved by this function.
    """
    target_dir = os.path.join(cfg['save_dir'], name)
    model.save_pretrained(target_dir)
    print(f"Saved {target_dir}")
    # The tokenizer goes next to the weights so the folder is self-contained.
    tokenizer.save_pretrained(target_dir)

In [None]:
# Train the summarization model for cfg['num_epochs_sum'] epochs.
# Scheduler budget: ceil(batches / grad_accum) optimizer updates per epoch
# (the ceil counts a trailing partial accumulation window), times the epochs.
num_training_steps = math.ceil(len(sum_loader) / cfg['grad_accum']) * cfg['num_epochs_sum']
optimizer = torch.optim.AdamW(enc_dec_model_sum.parameters(), lr=cfg['lr'], weight_decay=cfg['weight_decay'])
# Linear warmup over the first warmup_ratio of the updates, then linear decay.
scheduler = get_linear_schedule_with_warmup(optimizer, int(num_training_steps*cfg['warmup_ratio']), num_training_steps)
scaler = torch.amp.GradScaler('cuda')

for epoch in range(1, cfg['num_epochs_sum'] + 1):
    print(f"\n{'='*60}")
    print(f"üî• Starting Summarization Training: Epoch {epoch}/{cfg['num_epochs_sum']}")
    print(f"{'='*60}")
    train_loss = train_epoch(enc_dec_model_sum, sum_loader, optimizer, scheduler, scaler, epoch, cfg['num_epochs_sum'])
    val_loss = evaluate(enc_dec_model_sum, val_sum_loader)
    print(f"\nüìä [Summarization] Epoch {epoch}/{cfg['num_epochs_sum']} Complete | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    # One checkpoint folder per epoch: summ_epoch_<n> under cfg['save_dir'].
    save_ckpt(enc_dec_model_sum, f"summ_epoch_{epoch}")
# NOTE: the epoch count in this message is hard-coded and does not follow cfg.
print("\n‚úÖ Summarization training finished (8 epochs)")


üî• Starting Summarization Training: Epoch 1/8
  Epoch 1/8 | Step 50/71779 | Loss: 10.5195
  Epoch 1/8 | Step 100/71779 | Loss: 10.5109
  Epoch 1/8 | Step 150/71779 | Loss: 10.4968
  Epoch 1/8 | Step 200/71779 | Loss: 10.4714
  Epoch 1/8 | Step 250/71779 | Loss: 10.4386
  Epoch 1/8 | Step 300/71779 | Loss: 10.3979
  Epoch 1/8 | Step 350/71779 | Loss: 10.3535
  Epoch 1/8 | Step 400/71779 | Loss: 10.3059
  Epoch 1/8 | Step 450/71779 | Loss: 10.2575
  Epoch 1/8 | Step 500/71779 | Loss: 10.2085
  Epoch 1/8 | Step 550/71779 | Loss: 10.1636
  Epoch 1/8 | Step 600/71779 | Loss: 10.1194
  Epoch 1/8 | Step 650/71779 | Loss: 10.0797
  Epoch 1/8 | Step 700/71779 | Loss: 10.0411
  Epoch 1/8 | Step 750/71779 | Loss: 10.0050
  Epoch 1/8 | Step 800/71779 | Loss: 9.9710
  Epoch 1/8 | Step 850/71779 | Loss: 9.9374
  Epoch 1/8 | Step 900/71779 | Loss: 9.9079
  Epoch 1/8 | Step 950/71779 | Loss: 9.8769
  Epoch 1/8 | Step 1000/71779 | Loss: 9.8480
  Epoch 1/8 | Step 1050/71779 | Loss: 9.8198
  Epoch 1/8

In [12]:
# Resume training (e.g. after the machine shut down mid-run).
# WARNING: the Setup, Config, Dataset and Model cells (cells 1-9) must have
# been run before this cell can be used.

# Epoch to restart from (e.g. if checkpoint epoch_6 exists, start at 7).
resume_from_epoch = 7  # change as needed
load_checkpoint = True  # set to False to train from the very beginning

if not (load_checkpoint and resume_from_epoch > 1):
    # Nothing to resume: start over from the first epoch.
    resume_from_epoch = 1
    print(f"üÜï Starting fresh training from Epoch 1")
else:
    # The checkpoint to load is the one written at the end of the previous epoch.
    checkpoint_path = os.path.join(cfg['save_dir'], f"summ_epoch_{resume_from_epoch-1}")
    if not os.path.exists(checkpoint_path):
        print(f"‚ùå Checkpoint not found: {checkpoint_path}")
        print(f"Available checkpoints:")
        if os.path.exists(cfg['save_dir']):
            ckpts = [d for d in os.listdir(cfg['save_dir']) if d.startswith('summ_epoch_')]
            for c in sorted(ckpts):
                print(f"  - {c}")
        # Fall back to epoch 1; the model currently in memory is left as-is.
        resume_from_epoch = 1
    else:
        print(f"üîÑ Loading checkpoint from: {checkpoint_path}")
        enc_dec_model_sum = EncoderDecoderModel.from_pretrained(checkpoint_path)
        enc_dec_model_sum.to(device)
        print(f"‚úÖ Resumed from Epoch {resume_from_epoch-1}, will continue from Epoch {resume_from_epoch}")

üîÑ Loading checkpoint from: checkpoints_new\summ_epoch_6
‚úÖ Resumed from Epoch 6, will continue from Epoch 7


In [13]:
# Continue summarization training from `resume_from_epoch` to the last epoch.
# NOTE: the optimizer, scheduler and scaler are created fresh here, so the
# warmup/decay schedule restarts over the remaining epochs only.
num_training_steps = math.ceil(len(sum_loader) / cfg['grad_accum']) * (cfg['num_epochs_sum'] - resume_from_epoch + 1)
optimizer = torch.optim.AdamW(enc_dec_model_sum.parameters(), lr=cfg['lr'], weight_decay=cfg['weight_decay'])
scheduler = get_linear_schedule_with_warmup(optimizer, int(num_training_steps*cfg['warmup_ratio']), num_training_steps)
scaler = torch.amp.GradScaler('cuda')

for epoch in range(resume_from_epoch, cfg['num_epochs_sum'] + 1):
    print(f"\n{'='*60}")
    print(f"üî• Starting Summarization Training: Epoch {epoch}/{cfg['num_epochs_sum']}")
    print(f"{'='*60}")
    train_loss = train_epoch(enc_dec_model_sum, sum_loader, optimizer, scheduler, scaler, epoch, cfg['num_epochs_sum'])
    val_loss = evaluate(enc_dec_model_sum, val_sum_loader)
    print(f"\nüìä [Summarization] Epoch {epoch}/{cfg['num_epochs_sum']} Complete | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    # One checkpoint folder per epoch: summ_epoch_<n> under cfg['save_dir'].
    save_ckpt(enc_dec_model_sum, f"summ_epoch_{epoch}")
print(f"\n‚úÖ Summarization training finished (Epoch {resume_from_epoch}-{cfg['num_epochs_sum']})")


üî• Starting Summarization Training: Epoch 7/8


  decoder_attention_mask = decoder_input_ids.new_tensor(decoder_input_ids != self.config.pad_token_id)


  Epoch 7/8 | Step 50/71779 | Loss: 3.0508
  Epoch 7/8 | Step 100/71779 | Loss: 3.0690
  Epoch 7/8 | Step 150/71779 | Loss: 3.0641
  Epoch 7/8 | Step 200/71779 | Loss: 3.0580
  Epoch 7/8 | Step 250/71779 | Loss: 3.0630
  Epoch 7/8 | Step 300/71779 | Loss: 3.0656
  Epoch 7/8 | Step 350/71779 | Loss: 3.0530
  Epoch 7/8 | Step 400/71779 | Loss: 3.0505
  Epoch 7/8 | Step 450/71779 | Loss: 3.0454
  Epoch 7/8 | Step 500/71779 | Loss: 3.0488
  Epoch 7/8 | Step 550/71779 | Loss: 3.0559
  Epoch 7/8 | Step 600/71779 | Loss: 3.0479
  Epoch 7/8 | Step 650/71779 | Loss: 3.0475
  Epoch 7/8 | Step 700/71779 | Loss: 3.0448
  Epoch 7/8 | Step 750/71779 | Loss: 3.0481
  Epoch 7/8 | Step 800/71779 | Loss: 3.0472
  Epoch 7/8 | Step 850/71779 | Loss: 3.0502
  Epoch 7/8 | Step 900/71779 | Loss: 3.0475
  Epoch 7/8 | Step 950/71779 | Loss: 3.0493
  Epoch 7/8 | Step 1000/71779 | Loss: 3.0492
  Epoch 7/8 | Step 1050/71779 | Loss: 3.0483
  Epoch 7/8 | Step 1100/71779 | Loss: 3.0481
  Epoch 7/8 | Step 1150/71779 

In [19]:
# Continue training for epochs 9-10 with a reduced learning rate.
# Loads the epoch-8 checkpoint, then trains further at a lower learning rate.

# Load the epoch-8 model.
checkpoint_epoch = 8
checkpoint_path = os.path.join(cfg['save_dir'], f"summ_epoch_{checkpoint_epoch}")

if os.path.exists(checkpoint_path):
    print(f"üìÇ Loading checkpoint from epoch {checkpoint_epoch}")
    enc_dec_model_sum = EncoderDecoderModel.from_pretrained(checkpoint_path)
    enc_dec_model_sum.to(device)
    
    # Halve the learning rate for this fine-tuning phase.
    lower_lr = 1.5e-5  # from 3e-5 -> 1.5e-5
    extra_epochs = 2    # train 2 additional epochs (9-10)
    
    # Fresh optimizer/scheduler/scaler sized for the extra epochs only.
    num_training_steps = math.ceil(len(sum_loader) / cfg['grad_accum']) * extra_epochs
    optimizer = torch.optim.AdamW(enc_dec_model_sum.parameters(), lr=lower_lr, weight_decay=cfg['weight_decay'])
    scheduler = get_linear_schedule_with_warmup(optimizer, int(num_training_steps*cfg['warmup_ratio']), num_training_steps)
    scaler = torch.amp.GradScaler('cuda')
    
    print(f"üîß Training with lower LR: {lower_lr} for epochs {checkpoint_epoch+1}-{checkpoint_epoch+extra_epochs}")
    
    # NOTE: the "/10" total and the "9-10" range in the messages below are
    # hard-coded; they match checkpoint_epoch=8 and extra_epochs=2 only.
    for epoch in range(checkpoint_epoch + 1, checkpoint_epoch + extra_epochs + 1):
        print(f"\n{'='*60}")
        print(f"üî• Fine-tuning Epoch {epoch}/10")
        print(f"{'='*60}")
        train_loss = train_epoch(enc_dec_model_sum, sum_loader, optimizer, scheduler, scaler, epoch, 10)
        val_loss = evaluate(enc_dec_model_sum, val_sum_loader)
        print(f"\nüìä Epoch {epoch}/10 Complete | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        save_ckpt(enc_dec_model_sum, f"summ_epoch_{epoch}")
    
    print("\n‚úÖ Extended training finished (Epochs 9-10)")
else:
    print(f"‚ùå Checkpoint epoch {checkpoint_epoch} not found!")

üìÇ Loading checkpoint from epoch 8
üîß Training with lower LR: 1.5e-05 for epochs 9-10

üî• Fine-tuning Epoch 9/10
  Epoch 9/10 | Step 50/71779 | Loss: 2.7509
  Epoch 9/10 | Step 100/71779 | Loss: 2.7657
  Epoch 9/10 | Step 150/71779 | Loss: 2.7915
  Epoch 9/10 | Step 200/71779 | Loss: 2.7915
  Epoch 9/10 | Step 250/71779 | Loss: 2.7795
  Epoch 9/10 | Step 300/71779 | Loss: 2.7702
  Epoch 9/10 | Step 350/71779 | Loss: 2.7674
  Epoch 9/10 | Step 400/71779 | Loss: 2.7673
  Epoch 9/10 | Step 450/71779 | Loss: 2.7615
  Epoch 9/10 | Step 500/71779 | Loss: 2.7625
  Epoch 9/10 | Step 550/71779 | Loss: 2.7618
  Epoch 9/10 | Step 600/71779 | Loss: 2.7637
  Epoch 9/10 | Step 650/71779 | Loss: 2.7603
  Epoch 9/10 | Step 700/71779 | Loss: 2.7594
  Epoch 9/10 | Step 750/71779 | Loss: 2.7570
  Epoch 9/10 | Step 800/71779 | Loss: 2.7574
  Epoch 9/10 | Step 850/71779 | Loss: 2.7575
  Epoch 9/10 | Step 900/71779 | Loss: 2.7570
  Epoch 9/10 | Step 950/71779 | Loss: 2.7549
  Epoch 9/10 | Step 1000/71

In [20]:
# Load the epoch-10 checkpoint and test it on a few samples.
test_epoch = 10
checkpoint_path = os.path.join(cfg['save_dir'], f"summ_epoch_{test_epoch}")

if os.path.exists(checkpoint_path):
    print(f"üìÇ Loading checkpoint epoch {test_epoch}")
    enc_dec_model_sum = EncoderDecoderModel.from_pretrained(checkpoint_path)
    enc_dec_model_sum.to(device)
    enc_dec_model_sum.eval()
    print(f"‚úÖ Loaded epoch {test_epoch} successfully!")
    
    # Test on 5 samples from the test split.
    print(f"\n{'='*60}")
    print(f"üß™ Testing Epoch {test_epoch} on Multiple Samples")
    print(f"{'='*60}\n")
    
    for idx in [0, 5, 10, 15, 20]:
        sample = dataset['test'][idx]
        text = sample['article']
        reference = sample['highlights']
        
        print(f"\n{'‚îÄ'*60}")
        # The printed sample number is the dataset index plus one.
        print(f"üìù Sample {idx+1}")
        print(f"{'‚îÄ'*60}")
        print(f"Article: {text[:200]}...")
        print(f"\n‚úÖ Reference:\n{reference}")
        
        with torch.no_grad():
            # Same truncation length as in training (cfg['max_src_len']).
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=cfg['max_src_len']).to(device)
            
            # Beam search with repetition controls; the special-token ids are
            # the same CLS/SEP/PAD ids set on the model config at build time.
            sum_ids = enc_dec_model_sum.generate(
                **inputs,
                max_length=cfg['max_tgt_len'],
                min_length=30,
                num_beams=5,
                no_repeat_ngram_size=3,
                repetition_penalty=2.0,
                length_penalty=1.0,
                early_stopping=True,
                decoder_start_token_id=tokenizer.cls_token_id,
                eos_token_id=tokenizer.sep_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
            
            generated = tokenizer.decode(sum_ids[0], skip_special_tokens=True)
            print(f"\nü§ñ Generated (Epoch {test_epoch}):\n{generated}\n")
    
    print(f"\n{'='*60}")
    # NOTE: the epoch number in this message is hard-coded, unlike the others.
    print("‚úÖ Epoch 10 Testing Complete!")
    print(f"{'='*60}")
else:
    print(f"‚ùå Checkpoint not found: {checkpoint_path}")

üìÇ Loading checkpoint epoch 10
‚úÖ Loaded epoch 10 successfully!

üß™ Testing Epoch 10 on Multiple Samples


‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üìù Sample 1
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territor...

‚úÖ Reference:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

ü§ñ Generated (Epoch 10):
new : the palestinian

In [22]:
# Compare epochs 6 vs 8 vs 10 on the same test samples.
#
# Each checkpoint is loaded from disk once and used for every sample; the
# results are then printed grouped by sample. Only one model is resident on
# the device at any time.
test_epochs = [6, 8, 10]
test_samples = [0, 5, 10]

# Tokenise every article once; the encodings are reused for all checkpoints.
encoded_inputs = {
    sample_idx: tokenizer(dataset['test'][sample_idx]['article'], return_tensors='pt', truncation=True,
                          padding=True, max_length=cfg['max_src_len']).to(device)
    for sample_idx in test_samples
}

# generated_by_epoch[epoch][sample_idx] -> decoded summary.
# A value of None marks an epoch whose checkpoint folder does not exist.
generated_by_epoch = {}
for epoch_num in test_epochs:
    checkpoint_path = os.path.join(cfg['save_dir'], f"summ_epoch_{epoch_num}")
    if not os.path.exists(checkpoint_path):
        generated_by_epoch[epoch_num] = None
        continue

    # Load this epoch's model.
    model_temp = EncoderDecoderModel.from_pretrained(checkpoint_path)
    model_temp.to(device)
    model_temp.eval()

    generated_by_epoch[epoch_num] = {}
    with torch.no_grad():
        for sample_idx in test_samples:
            sum_ids = model_temp.generate(
                **encoded_inputs[sample_idx],
                max_length=cfg['max_tgt_len'],
                min_length=30,
                num_beams=5,
                no_repeat_ngram_size=3,
                repetition_penalty=2.0,
                length_penalty=1.0,
                early_stopping=True,
                decoder_start_token_id=tokenizer.cls_token_id,
                eos_token_id=tokenizer.sep_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
            generated_by_epoch[epoch_num][sample_idx] = tokenizer.decode(sum_ids[0], skip_special_tokens=True)

    # Free memory before the next checkpoint is loaded.
    del model_temp
    torch.cuda.empty_cache()

for sample_idx in test_samples:
    sample = dataset['test'][sample_idx]
    text = sample['article']
    reference = sample['highlights']

    print(f"\n{'='*70}")
    # The printed sample number is the dataset index plus one.
    print(f"üì∞ Sample {sample_idx+1}")
    print(f"{'='*70}")
    print(f"Article: {text[:250]}...")
    print(f"\n‚úÖ Reference:\n{reference}")
    print(f"\n{'‚îÄ'*70}")

    for epoch_num in test_epochs:
        epoch_outputs = generated_by_epoch[epoch_num]
        if epoch_outputs is not None:
            generated = epoch_outputs[sample_idx]
            print(f"\nü§ñ Epoch {epoch_num}:\n{generated}")
        else:
            print(f"\n‚ö†Ô∏è Epoch {epoch_num}: Checkpoint not found")

print(f"\n{'='*70}")
print("‚úÖ Comparison Complete!")
print(f"{'='*70}")


üì∞ Sample 1
Article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremo...

‚úÖ Reference:
Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

ü§ñ Epoch 6:
new : the u. s. state department says it will step down as part of a " judicial review " the palestinian authority has signed a formal complaint with the international criminal court. the ruling is expected to take place on june 13.

ü§ñ Epoch 

# 🎓 สรุปผลการเทรน 10 Epochs

## ผลการทดสอบ
- ✅ โมเดลเทรนครบ 10 epochs (8+2 fine-tuning)
- ❌ ยังคงมีปัญหา **hallucination** (สร้างข้อมูลที่ไม่มีในข่าวต้นฉบับ)
- 📊 ROUGE scores อยู่ในระดับต่ำ (เทียบกับ pretrained models)

## สาเหตุของ Hallucination
1. **Dataset ไม่เพียงพอ**: 287k samples น้อยเกินไปสำหรับโมเดล 247M parameters
2. **เทรนจากศูนย์**: ไม่มี pre-training knowledge ด้านภาษา
3. **Seq2seq จาก scratch**: ยากกว่าการ fine-tune pretrained model มาก

## ตัวเลือกในการแก้ไข
### 1. ใช้เป็นงานเรียนรู้ (แนะนำ)
- แสดงให้เห็นความแตกต่างระหว่าง from-scratch vs pretrained
- เข้าใจปัญหาที่เกิดขึ้นเมื่อ data ไม่เพียงพอ

### 2. เทรนต่อ (ไม่แนะนำ)
- Epochs 11-12 อาจไม่ช่วยมากนัก
- ปัญหาคือ dataset size ไม่ใช่ epoch count

### 3. ใช้ Pretrained Model (ขัดวัตถุประสงค์)
- จะได้ผลดีกว่ามาก แต่ไม่ใช่การเรียนรู้การเทรนจากศูนย์

## Headline Model
- ✅ โมเดล headline (pretrained) ทำงานได้ดี
- สามารถใช้งานได้จริง

In [14]:
# Build the headline model on a pretrained base: both the encoder and the
# decoder are loaded from cfg['model_name'].
enc_dec_model_head = EncoderDecoderModel.from_encoder_decoder_pretrained(cfg['model_name'], cfg['model_name'])
# Special-token ids taken from the tokenizer: decoding starts at CLS, ends at
# SEP and pads with PAD.
enc_dec_model_head.config.decoder_start_token_id = tokenizer.cls_token_id
enc_dec_model_head.config.pad_token_id = tokenizer.pad_token_id
enc_dec_model_head.config.eos_token_id = tokenizer.sep_token_id
# Mirror the encoder's vocabulary size on the top-level config.
enc_dec_model_head.config.vocab_size = enc_dec_model_head.config.encoder.vocab_size
enc_dec_model_head.to(device)
# Total parameter count, in millions.
print("Headline params (M):", sum(p.numel() for p in enc_dec_model_head.parameters())/1e6)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Headline params (M): 247.363386


In [15]:
# Train the headline model for cfg['num_epochs_head'] epochs.
# Scheduler budget: ceil(batches / grad_accum) optimizer updates per epoch,
# times the number of epochs. The *_h objects are separate from the
# summarization run's optimizer/scheduler/scaler.
num_training_steps_head = math.ceil(len(head_loader) / cfg['grad_accum']) * cfg['num_epochs_head']
optimizer_h = torch.optim.AdamW(enc_dec_model_head.parameters(), lr=cfg['lr'], weight_decay=cfg['weight_decay'])
scheduler_h = get_linear_schedule_with_warmup(optimizer_h, int(num_training_steps_head*cfg['warmup_ratio']), num_training_steps_head)
scaler_h = torch.amp.GradScaler('cuda')

for epoch in range(1, cfg['num_epochs_head'] + 1):
    print(f"\n{'='*60}")
    print(f"üî• Starting Headline Training: Epoch {epoch}/{cfg['num_epochs_head']}")
    print(f"{'='*60}")
    train_loss = train_epoch(enc_dec_model_head, head_loader, optimizer_h, scheduler_h, scaler_h, epoch, cfg['num_epochs_head'])
    val_loss = evaluate(enc_dec_model_head, val_head_loader)
    print(f"\nüìä [Headline] Epoch {epoch}/{cfg['num_epochs_head']} Complete | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
    # One checkpoint folder per epoch: head_epoch_<n> under cfg['save_dir'].
    save_ckpt(enc_dec_model_head, f"head_epoch_{epoch}")
# NOTE: the epoch count in this message is hard-coded and does not follow cfg.
print("\n‚úÖ Headline training finished (5 epochs)")


üî• Starting Headline Training: Epoch 1/5
  Epoch 1/5 | Step 50/71779 | Loss: 11.0763
  Epoch 1/5 | Step 100/71779 | Loss: 11.1066
  Epoch 1/5 | Step 150/71779 | Loss: 11.0486
  Epoch 1/5 | Step 200/71779 | Loss: 10.9292
  Epoch 1/5 | Step 250/71779 | Loss: 10.8208
  Epoch 1/5 | Step 300/71779 | Loss: 10.6921
  Epoch 1/5 | Step 350/71779 | Loss: 10.5360
  Epoch 1/5 | Step 400/71779 | Loss: 10.3674
  Epoch 1/5 | Step 450/71779 | Loss: 10.2028
  Epoch 1/5 | Step 500/71779 | Loss: 10.0440
  Epoch 1/5 | Step 550/71779 | Loss: 9.8958
  Epoch 1/5 | Step 600/71779 | Loss: 9.7598
  Epoch 1/5 | Step 650/71779 | Loss: 9.6388
  Epoch 1/5 | Step 700/71779 | Loss: 9.5356
  Epoch 1/5 | Step 750/71779 | Loss: 9.4403
  Epoch 1/5 | Step 800/71779 | Loss: 9.3539
  Epoch 1/5 | Step 850/71779 | Loss: 9.2675
  Epoch 1/5 | Step 900/71779 | Loss: 9.1940
  Epoch 1/5 | Step 950/71779 | Loss: 9.1199
  Epoch 1/5 | Step 1000/71779 | Loss: 9.0500
  Epoch 1/5 | Step 1050/71779 | Loss: 8.9882
  Epoch 1/5 | Step 11

In [1]:
# Inference demo with generation settings aimed at reducing hallucination.
# Requires both models to exist in the current session: the cells that build
# or load enc_dec_model_sum and enc_dec_model_head must be run first.
enc_dec_model_sum.eval(); enc_dec_model_head.eval()

sample = dataset['test'][0]
text = sample['article']
reference = sample['highlights']
print("üì∞ Article (first 400 chars):", text[:400], "...")
print("üìù Reference:", reference)

with torch.no_grad():
    # The same encoded article is fed to both models.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=cfg['max_src_len']).to(device)
    
    # Summary with anti-hallucination parameters
    sum_ids = enc_dec_model_sum.generate(
        **inputs,
        max_length=cfg['max_tgt_len'],
        min_length=30,
        num_beams=5,                      # beams raised from 4 to 5
        no_repeat_ngram_size=3,           # raised from 2 to 3 to reduce repetition
        repetition_penalty=2.0,           # discourage repeated words
        length_penalty=1.0,               # balance output length
        early_stopping=True,
        decoder_start_token_id=tokenizer.cls_token_id,
        eos_token_id=tokenizer.sep_token_id,
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Headline generation
    headline_ids = enc_dec_model_head.generate(
        **inputs,
        max_length=cfg['max_head_len'],
        min_length=8,
        num_beams=4,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5,
        decoder_start_token_id=tokenizer.cls_token_id,
        eos_token_id=tokenizer.sep_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

print("\nüìÑ Generated Summary (with anti-hallucination):")
print(tokenizer.decode(sum_ids[0], skip_special_tokens=True))
print("\nüì∞ Generated Headline:")
print(tokenizer.decode(headline_ids[0], skip_special_tokens=True))

NameError: name 'enc_dec_model_sum' is not defined

In [None]:
# Compare multiple samples (to check consistency) using the summarization
# model currently held in enc_dec_model_sum.
print("üî¨ Testing on Multiple Samples\n")

for idx in [0, 5, 10]:  # test 3 samples
    sample = dataset['test'][idx]
    text = sample['article']
    reference = sample['highlights']
    
    print(f"{'='*60}")
    # The printed sample number is the dataset index plus one.
    print(f"üìù Sample {idx+1}")
    print(f"{'='*60}")
    print(f"Article: {text[:200]}...")
    print(f"\n‚úÖ Reference:\n{reference}")
    
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=cfg['max_src_len']).to(device)
        
        # Same generation settings as the single-sample inference demo.
        sum_ids = enc_dec_model_sum.generate(
            **inputs,
            max_length=cfg['max_tgt_len'],
            min_length=30,
            num_beams=5,
            no_repeat_ngram_size=3,
            repetition_penalty=2.0,
            length_penalty=1.0,
            early_stopping=True,
            decoder_start_token_id=tokenizer.cls_token_id,
            eos_token_id=tokenizer.sep_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
        
        generated = tokenizer.decode(sum_ids[0], skip_special_tokens=True)
        print(f"\nü§ñ Generated:\n{generated}\n")

print("‚úÖ Testing complete!")

# ✅ ขั้นตอนการรัน
1) Intro (cell 1)
2) Setup & Imports (cell 2)
3) Config (cell 3)
4) Load dataset (cell 4)
5) Tokenizer (cell 5)
6) Dataset wrappers (cell 6)
7) DataLoaders (cell 7)
8) Summarization model from scratch (cell 8)
9) Train helpers (cell 9)
10) Train Summarization (cell 10)
11) Headline model (pretrain) (cell 11)
12) Train Headline (cell 12)
13) Inference demo (cell 13)

> ปรับจำนวน epoch ได้ แต่ต้องรันครบทุก epoch ที่ตั้งไว้ ไม่ข้ามขั้น