In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install --upgrade --quiet transformers datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [2]:
from datasets import load_dataset

# Fetch the raw WikiText-2 corpus; the result is a DatasetDict keyed by split name.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [3]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the pretrained "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2 does not have a pad token by default, so reuse the EOS token as padding
# (this assigns an existing token; no new token is added to the vocabulary).
tokenizer.pad_token = tokenizer.eos_token

In [4]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch.

    Every entry is truncated and padded to exactly 128 tokens, so all rows
    produced by this function have the same length.

    Args:
        examples: mapping with a "text" entry holding the strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encode_kwargs = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(examples["text"], **encode_kwargs)

# Tokenize every split in batches and drop the raw "text" column so that only
# the tokenizer's outputs remain.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)
print(tokenized_datasets)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})


In [5]:
### Model selection

In [6]:
from transformers import GPT2LMHeadModel

# Load the pretrained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Keep the embedding matrix sized to the tokenizer's vocabulary.
# NOTE(review): presumably a no-op here, since the tokenizer only reuses EOS as
# its pad token and no tokens are added in the visible cells — confirm.
model.resize_token_embeddings(len(tokenizer))
# Align the model's padding token with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

2025-07-09 16:20:58.856747: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752078058.875792     138 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752078058.882513     138 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [7]:
from transformers import TrainingArguments, Trainer

# Pick the training and validation splits out of the tokenized DatasetDict.
train_dataset, eval_dataset = (
    tokenized_datasets[split] for split in ("train", "validation")
)

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments


def _has_lm_targets(example):
    """Return True when a row holds at least two real (non-padding) tokens."""
    return sum(example["attention_mask"]) > 1


# A causal LM predicts token t+1 from token t, so a row needs at least two real
# tokens to contribute a target. Rows that are entirely padding (e.g. blank
# lines in the corpus) would otherwise form batches with no valid targets and
# produce a NaN loss. New names are used so the original splits stay intact.
train_dataset_lm = train_dataset.filter(_has_lm_targets)
eval_dataset_lm = eval_dataset.filter(_has_lm_targets)

# The tokenized rows only carry `input_ids` and `attention_mask`. Without a
# `labels` entry the model returns no loss and the Trainer cannot train or
# report `eval_loss`. With mlm=False this collator copies `input_ids` into
# `labels` and masks padding positions with -100 so they are ignored.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    logging_steps=10,
    # Disable external experiment trackers so "Run All" never blocks on an
    # interactive login prompt.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_lm,
    eval_dataset=eval_dataset_lm,
    data_collator=data_collator,
)

trainer.train()



<IPython.core.display.Javascript object>

In [None]:
# Write the fine-tuned model and its tokenizer into the same output directory.
output_dir = "./gpt2-finetuned-wikitext2"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

In [None]:
import math

# Run evaluation with the trainer and read the reported loss.
# NOTE(review): "eval_loss" is only present when the evaluation batches contain
# labels — confirm the Trainer was built with a collator that provides them.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_loss)
print(f"Perplexity: {perplexity:.2f}")

In [None]:
import torch

def compute_top_k_accuracy(model, dataset, k=5, batch_size=8):
    """Return the top-k next-token accuracy of ``model`` over ``dataset``.

    For every position ``t`` whose token and successor are both real
    (attention mask == 1), the logits at ``t`` are checked for whether the
    token at ``t + 1`` is among the ``k`` highest-scoring entries. Padding
    positions are never scored, so fixed-length padded rows do not inflate
    or distort the metric.

    Args:
        model: callable module taking ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a ``logits``
            attribute of shape (batch, sequence, vocab).
        dataset: indexable collection of rows, each with equal-length
            ``"input_ids"`` and ``"attention_mask"`` sequences (lists, arrays
            or tensors).
        k: number of highest-scoring candidates that count as a hit.
        batch_size: number of rows evaluated per forward pass.

    Returns:
        Fraction of scored positions that were hits, as a float in [0, 1].
        Returns 0.0 when there is nothing to score (empty dataset or no row
        with two consecutive real tokens).

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    def _collate(rows):
        # Stack explicitly: the default collate turns list-valued rows into a
        # list of per-position tensors instead of one (batch, seq) tensor.
        return {
            key: torch.stack(
                [torch.as_tensor(row[key], dtype=torch.long) for row in rows]
            )
            for key in ("input_ids", "attention_mask")
        }

    # Run on whatever device the model lives on (it may have been moved to an
    # accelerator during training); fall back to CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, collate_fn=_collate
    )
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Logits at position t predict the token at position t + 1.
            pred_logits = logits[:, :-1, :]
            targets = input_ids[:, 1:]
            # Score a position only if both the context token and the target are real.
            valid = attention_mask[:, :-1].bool() & attention_mask[:, 1:].bool()

            top_k = pred_logits.topk(min(k, pred_logits.size(-1)), dim=-1).indices
            hits = (top_k == targets.unsqueeze(-1)).any(dim=-1)

            correct += int((hits & valid).sum())
            total += int(valid.sum())

    return correct / total if total else 0.0

# Report top-5 accuracy on the validation split.
top5_acc = compute_top_k_accuracy(model=model, dataset=eval_dataset, k=5)
print(f"Top-5 Accuracy: {top5_acc:.2%}")