In [11]:
# Imports
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
from collections import Counter
import json

In [12]:
# GPU device selection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [13]:
# Load preprocessed splits
train_df = pd.read_csv('../../data/summarization/processed_train_split.csv')
val_df = pd.read_csv('../../data/summarization/processed_val_split.csv')

save_path = '../../models/summarization/'

In [14]:
# Tokenizer
def tokenize(text):
    return text.split()

In [15]:
# Build vocab
def build_vocab(samples, min_freq=2, max_vocab_size=50000):
    counter = Counter()
    for text in samples:
        counter.update(tokenize(text))
    vocab = [w for w, f in counter.items() if f >= min_freq][:max_vocab_size-2]
    word2idx = {w: i+2 for i, w in enumerate(vocab)}
    word2idx['<PAD>'] = 0
    word2idx['<UNK>'] = 1
    return word2idx

# Joint vocab from both articles and summaries
combined_text = list(train_df['clean_article']) + list(train_df['clean_highlights'])
vocab = build_vocab(combined_text)

# Save vocab
vocab_file = os.path.join(save_path, 'vocab.json')
with open(vocab_file, 'w') as f:
    json.dump(vocab, f)
    
print(f"Vocab saved to: {vocab_file}")
print(f"Vocab size: {len(vocab)}")

article_vocab = vocab
summary_vocab = vocab

Vocab saved to: ../../models/summarization/vocab.json
Vocab size: 50000


In [16]:
# Dataset class
class SummarizationDataset(Dataset):
    def __init__(self, df, article_vocab, summary_vocab, max_article_len=400, max_summary_len=50):
        self.articles = df['clean_article'].values
        self.summaries = df['clean_highlights'].values
        self.article_vocab = article_vocab
        self.summary_vocab = summary_vocab
        self.max_article_len = max_article_len
        self.max_summary_len = max_summary_len

    def encode(self, text, vocab, max_len):
        tokens = tokenize(text)
        ids = [vocab.get(w, vocab['<UNK>']) for w in tokens][:max_len]
        ids += [vocab['<PAD>']] * (max_len - len(ids))
        return ids

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        src = torch.tensor(self.encode(self.articles[idx], self.article_vocab, self.max_article_len), dtype=torch.long)
        tgt = torch.tensor(self.encode(self.summaries[idx], self.summary_vocab, self.max_summary_len), dtype=torch.long)
        return src, tgt

train_data = SummarizationDataset(train_df, vocab, vocab)
val_data = SummarizationDataset(val_df, vocab, vocab)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32)

In [17]:
# Baseline seq2seq model with LSTM
class Seq2SeqBaseline(nn.Module):
    def __init__(self, input_vocab_size, target_vocab_size, emb_dim=128, hidden_dim=256):
        super().__init__()
        self.encoder_emb = nn.Embedding(input_vocab_size, emb_dim, padding_idx=0)
        self.encoder_lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.decoder_emb = nn.Embedding(target_vocab_size, emb_dim, padding_idx=0)
        self.decoder_lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, target_vocab_size)

    def forward(self, src, tgt):
        _, (hidden, cell) = self.encoder_lstm(self.encoder_emb(src))
        decoder_outputs, _ = self.decoder_lstm(self.decoder_emb(tgt), (hidden, cell))
        logits = self.fc_out(decoder_outputs)
        return logits

In [18]:
# Model and training setup
input_vocab_size = len(article_vocab)
target_vocab_size = len(summary_vocab)
model = Seq2SeqBaseline(input_vocab_size, target_vocab_size)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [20]:
# Training loop
model = model.to(device)

for epoch in range(5):
    # Training
    model.train()
    train_loss = 0
    
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        output = output.reshape(-1, output.shape[-1])
        target = tgt[:, 1:].reshape(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            output = output.reshape(-1, output.shape[-1])
            target = tgt[:, 1:].reshape(-1)
            loss = criterion(output, target)
            val_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1} — Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

Epoch 1: 100%|██████████████████████████████████████████████████████████████| 1515/1515 [02:44<00:00,  9.23it/s]


Epoch 1 Completed — Avg Loss: 6.5584


Epoch 2: 100%|██████████████████████████████████████████████████████████████| 1515/1515 [02:45<00:00,  9.13it/s]


Epoch 2 Completed — Avg Loss: 5.8737


Epoch 3: 100%|██████████████████████████████████████████████████████████████| 1515/1515 [02:49<00:00,  8.91it/s]


Epoch 3 Completed — Avg Loss: 5.5072


Epoch 4: 100%|██████████████████████████████████████████████████████████████| 1515/1515 [02:48<00:00,  8.97it/s]


Epoch 4 Completed — Avg Loss: 5.2372


Epoch 5: 100%|██████████████████████████████████████████████████████████████| 1515/1515 [02:51<00:00,  8.85it/s]

Epoch 5 Completed — Avg Loss: 5.0265





In [21]:
# Save
torch.save(model.state_dict(), os.path.join(save_path, 'best_summarization_model.pt'))

print("Model saved.")

Model saved.
