In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/final-6-hours/gpt_model.pth
/kaggle/input/final-6-hours/__huggingface_repos__.json
/kaggle/input/final-6-hours/tokens/merges.txt
/kaggle/input/final-6-hours/tokens/tokenizer.json
/kaggle/input/final-6-hours/tokens/vocab.json
/kaggle/input/final-6-hours/tokens/tokenizer_config.json
/kaggle/input/final-6-hours/tokens/special_tokens_map.json
/kaggle/input/final-6-hours/wandb/run-20250328_175100-2w3mmvq6/logs/debug.log
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/run-ord7qkck.wandb
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/logs/debug.log
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/logs/debug-internal.log
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/files/wandb-summary.json
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/files/config.yaml
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/files/output.log
/kaggle/input/final-6-hours/wandb/run-20250328_173924-ord7qkck/files/re

In [2]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

# Config (adjust based on your needs)
d_model = 512
n_heads = 4  # Better divisibility with 512
n_layers = 2  # Increased from 3 for better capacity
context_length = 256
dropout = 0.1

print('before the loop!!!')

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        assert d_model % n_heads == 0

        # Combined QKV projection (more efficient)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.tril(torch.ones(context_length, context_length)))

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x).split(d_model, dim=2)
        # Process Q, K, V
        q, k, v = [y.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) for y in qkv]
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        
        # Apply causal mask
        att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.dropout(self.proj(y))

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Pre-LN architecture (original GPT-2 style)
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = nn.Embedding(context_length, d_model)  # Learned positional embeddings
        self.blocks = nn.Sequential(*[GPTBlock(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        
        # GPT-2 style initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.wte(idx)
        pos_emb = self.wpe(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx



print("Transformer block done")

before the loop!!!
Transformer block done


In [3]:
!ls /kaggle/input/


final-6-hours


In [4]:
!ls /kaggle/input/final-6-hours/

gpt_model.pth  __huggingface_repos__.json  tokens  wandb


In [5]:
checkpoint_path = "/kaggle/input/final-6-hours/gpt_model.pth"

In [6]:
# Define vocab_size (same as in previous notebook)
vocab_size = 50257  

# Re-initialize the model
model = GPT(vocab_size)

# Load pre-trained weights
model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))

# Set model to evaluation mode
model.eval()

print("Model loaded successfully!")

  model.load_state_dict(torch.load(checkpoint_path, map_location=torch.device('cpu')))


Model loaded successfully!


In [7]:
model.eval() 

GPT(
  (wte): Embedding(50257, 512)
  (wpe): Embedding(256, 512)
  (blocks): Sequential(
    (0): GPTBlock(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv): Linear(in_features=512, out_features=1536, bias=True)
        (proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ffn): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=2048, out_features=512, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): GPTBlock(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv): Linear(in_features=512, out_features=1536, bias=True)
        (proj): Linear(in_features=512,

In [8]:
prompt = "i am sarayu"

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Change this if you used a different tokenizer
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
print("Tokenisation")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenisation


In [10]:
output_ids = model.generate(input_ids, max_new_tokens=100)  # Use max_new_tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

i am sarayu melodies ( except Blythti ) . He is used with jant ( feels like <unk> at <unk> Ustrach ( Even Reichbelchterna ; C item of Odenumii ) . Also in Calford is the seat of an independent man while with his father , Major Ben Orie Thompson , who is having an organ @-@ payingam upon his father 's birth as an emperor , who is at Atherton ... but at no


In [11]:
print("Pre processing dataset")
from datasets import load_dataset

# Load CNN/Daily Mail dataset
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as PAD token

# Define preprocessing function
def preprocess_function(examples):
    inputs = ["Summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    
    # Setup the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=256, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
processed_dataset = cnn_dataset.map(preprocess_function, batched=True)


Pre processing dataset


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [12]:
# Print first example from the training set
print(processed_dataset["train"][0])

# Print first 5 examples
for i in range(5):
    print(f"Example {i+1}:")
    print(processed_dataset["train"][i])


{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [13]:
from torch.utils.data import DataLoader, TensorDataset

print("Converting to TensorDataset- training")
train_dataset = TensorDataset(
    torch.tensor(processed_dataset["train"]["input_ids"]),
    torch.tensor(processed_dataset["train"]["attention_mask"]),
    torch.tensor(processed_dataset["train"]["labels"])
)


Converting to TensorDataset- training


In [14]:
print("Converting to TensorDataset- validation")
val_dataset = TensorDataset(
    torch.tensor(processed_dataset["validation"]["input_ids"]),
    torch.tensor(processed_dataset["validation"]["attention_mask"]),
    torch.tensor(processed_dataset["validation"]["labels"])
)

Converting to TensorDataset- validation


In [15]:
print("Dataloaders")
batch_size = 8  # Smaller batch size due to longer sequences

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last= True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last= True)

Dataloaders


In [16]:
print("wandb")
import wandb
wandb.login(key="c60ac5d8bfe31d20abafbd61d966eca35fe34b65")
wandb.init(project="gpt-summarization", config={
    "learning_rate": 2e-5,
    "num_epochs": 3,
    "warmup_steps": 500
})


wandb


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msarayu-m[0m ([33msarayu-m-manipal-institue-of-technology[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.19.1
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250330_073528-xrb3veed[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvibrant-breeze-7[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/sarayu-m-manipal-institue-of-technology/gpt-summarization[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/sarayu-m-manipal-institue-of-technology/gpt-summarization/runs/xrb3veed[0m


In [17]:
print("Before training")
from transformers import get_linear_schedule_with_warmup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Training configuration

model = model.to(device)
learning_rate = 2e-5  # Lower learning rate for fine-tuning
num_epochs = 2
warmup_steps = 500
total_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

print("Entering training loop")
for epoch in range(num_epochs):
    print(f"Starting epoch {epoch+1}/{num_epochs}")
    model.train()
    total_loss = 0
    print("Entering epoch")
    for step, batch in enumerate(train_dataloader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        logits, loss = model(input_ids, targets=labels)
        
        # Apply attention mask to loss
        loss = loss * attention_mask.sum(dim=1) / attention_mask.sum()
        loss = loss.mean()
        
        # Backward pass and optimization step
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
        
    
    avg_train_loss = total_loss / len(train_dataloader)
    print("Training done")
    # Validation loop
    print("Entering validation loop")
    model.eval()
    total_val_loss = 0
    print("Entering epoch")
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            
            _, loss = model(input_ids, targets=labels)
            loss = loss * attention_mask.sum(dim=1) / attention_mask.sum()
            total_val_loss += loss.mean().item()
    
    avg_val_loss = total_val_loss / len(val_dataloader)
    print("Validation done")
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Average training loss: {avg_train_loss:.4f}")
    print(f"Average validation loss: {avg_val_loss:.4f}")
    
    # Log metrics to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "learning_rate": scheduler.get_last_lr()[0]
    })

# Save the fine-tuned model
torch.save(model.state_dict(), "gpt_summarization_model.pth")
wandb.save("gpt_summarization_model.pth")

# Finish wandb run
wandb.finish()
print("Training done")

Before training
Entering training loop
Starting epoch 1/2
Entering epoch
Training done
Entering validation loop
Entering epoch
Validation done
Epoch 1/2
Average training loss: 0.2590
Average validation loss: 0.2802
Starting epoch 2/2
Entering epoch
Training done
Entering validation loop
Entering epoch
Validation done
Epoch 2/2
Average training loss: 0.2487
Average validation loss: 0.2764


[34m[1mwandb[0m: uploading gpt_summarization_model.pth; uploading output.log; uploading config.yaml
[34m[1mwandb[0m: uploading gpt_summarization_model.pth
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         epoch ▁█
[34m[1mwandb[0m: learning_rate █▁
[34m[1mwandb[0m:    train_loss █▁
[34m[1mwandb[0m:      val_loss █▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:         epoch 2
[34m[1mwandb[0m: learning_rate 0
[34m[1mwandb[0m:    train_loss 0.24871
[34m[1mwandb[0m:      val_loss 0.27642
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mvibrant-breeze-7[0m at: [34m[4mhttps://wandb.ai/sarayu-m-manipal-institue-of-technology/gpt-summarization/runs/xrb3veed[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/sarayu-m-manipal-institue-of-technology/gpt-summarization[0m
[34m[1mwandb

Training done


In [18]:
input_text = cnn_dataset["test"][0]["article"]
input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)

generated_ids = model.generate(input_ids, max_new_tokens=128)
generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Generated Summary:")
print(generated_summary)

print("Reference Summary:")
print(cnn_dataset["test"][0]["highlights"])


Generated Summary:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wedne

In [19]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(generated_summary, cnn_dataset["test"][0]["highlights"])

print("ROUGE Scores:")
print(scores)

ModuleNotFoundError: No module named 'rouge_score'