In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=6151f4af874ac2816b890ec62ee469f8c4bc3abd6c9b79c8cefa9232c0897947
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [3]:
import torch
import math
import torch.nn as nn
import torch.nn.functional as F

# Config (adjust based on your needs)
d_model = 512
n_heads = 4  # Better divisibility with 512
n_layers = 2  # Increased from 3 for better capacity
context_length = 256
dropout = 0.1

print('before the loop!!!')

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        assert d_model % n_heads == 0

        # Combined QKV projection (more efficient)
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask', torch.tril(torch.ones(context_length, context_length)))

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x).split(d_model, dim=2)
        # Process Q, K, V
        q, k, v = [y.view(B, T, self.n_heads, self.head_dim).transpose(1, 2) for y in qkv]
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        
        # Apply causal mask
        att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        y = (att @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.dropout(self.proj(y))

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = MultiHeadAttention(d_model, n_heads)
        self.ln2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Pre-LN architecture (original GPT-2 style)
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model)
        self.wpe = nn.Embedding(context_length, d_model)  # Learned positional embeddings
        self.blocks = nn.Sequential(*[GPTBlock(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        
        # GPT-2 style initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        tok_emb = self.wte(idx)
        pos_emb = self.wpe(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.context_length:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx



print("Transformer block done")

before the loop!!!
Transformer block done


In [6]:
checkpoint_path = "/kaggle/input/Fine Tuning/gpt_summarization_model.pth"

In [12]:
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [13]:
vocab_size = 50257  
model = GPT(vocab_size)
model.eval()

GPT(
  (wte): Embedding(50257, 512)
  (wpe): Embedding(256, 512)
  (blocks): Sequential(
    (0): GPTBlock(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv): Linear(in_features=512, out_features=1536, bias=True)
        (proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (ffn): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=2048, out_features=512, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): GPTBlock(
      (ln1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv): Linear(in_features=512, out_features=1536, bias=True)
        (proj): Linear(in_features=512,

In [14]:
print("Pre processing dataset")
from datasets import load_dataset

# Load CNN/Daily Mail dataset
cnn_dataset = load_dataset("cnn_dailymail", "3.0.0")

# Add padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as PAD token

# Define preprocessing function
def preprocess_function(examples):
    inputs = ["Summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    
    # Setup the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["highlights"], max_length=256, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
processed_dataset = cnn_dataset.map(preprocess_function, batched=True)


Pre processing dataset


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]



Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [17]:
input_text = cnn_dataset["test"][0]["article"]

inputs = tokenizer(input_text, max_length=256, truncation=True, padding="max_length", return_tensors="pt")
input_ids = inputs.input_ids

output_ids = model.generate(input_ids, max_new_tokens=150)  # Using your generate function
generated_summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("\n🔹 Generated Summary:")
print(generated_summary)



🔹 Generated Summary:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at We

In [19]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(generated_summary, cnn_dataset["test"][0]["highlights"])

print("\n🔹 ROUGE Scores:")
print(scores)


🔹 ROUGE Scores:
{'rouge1': Score(precision=0.8823529411764706, recall=0.0949367088607595, fmeasure=0.17142857142857143), 'rouge2': Score(precision=0.5454545454545454, recall=0.05714285714285714, fmeasure=0.10344827586206896), 'rougeL': Score(precision=0.6764705882352942, recall=0.07278481012658228, fmeasure=0.13142857142857145)}


In [20]:
dataset = load_dataset("wikitext", "wikitext-103-v1")  # or "wikitext-2-v1"

# Initialize tokenizer (e.g., GPT-2)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Required for GPT-2
tokenizer.padding_side = "right"

# Filter empty text entries
#dataset = dataset.filter(lambda x: x["text"] is not None and len(x["text"]) > 0)

# Remove empty or whitespace-only entries
dataset = dataset.filter(
    lambda x: x["text"] is not None and len(x["text"].strip()) > 0
)

print("Filtered text:", dataset["train"][1]["text"])
print()
# Should output meaningful text like:
# " = Valkyria Chronicles III =  Senjō no Valkyria 3 : Unrecorded Chronicles ..."

# Define tokenization function
def tokenize_function(examples):
    return tokenizer(
        examples["text"],  # Tokenize the "text" column
        truncation=True,
        max_length=256,    # Match GPT-2's context window
        padding="max_length",
    )

# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],  # Only remove the "text" column (others don't exist)
)

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

Filtered text:  Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 




Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [31]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT(vocab_size)
model.to(device)
model.eval()

def collate_fn(batch):
    return {key: torch.tensor([item[key] for item in batch]) for key in batch[0]}

tokenizer = AutoTokenizer.from_pretrained("gpt2") 
test_dataset = tokenized_dataset["test"]
test_loader = DataLoader(test_dataset, batch_size=16,collate_fn=collate_fn)

In [33]:
total_loss = 0.0
total_tokens = 0

model.eval()

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)

        # Compute loss
        logits, loss = model(input_ids, targets=input_ids)  # No attention_mask

        # Accumulate loss and token count
        total_loss += loss.item() * input_ids.shape[1]  # Multiply by sequence length
        total_tokens += input_ids.shape[1]

# Compute perplexity
avg_loss = total_loss / total_tokens
perplexity = torch.exp(torch.tensor(avg_loss)).item()

print(f"Perplexity: {perplexity:.4f}")

Perplexity: 58174.4180
