In [None]:
!pip install --quiet transformers tokenizers datasets evaluate torch

In [None]:
import transformers
# Show which transformers release is active in this kernel.
installed_version = transformers.__version__
print(installed_version)

In [None]:
#Clean broken cache
!rm -rf ~/.cache/huggingface/datasets/wikitext
!rm -rf /content/hf_cache

#Downgrade fsspec to avoid glob error
!pip install -U "fsspec<2023.9.0" datasets transformers --quiet

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Download (or reuse from the local cache directory) the raw WikiText-2 data.
dataset = load_dataset(
    "wikitext",
    "wikitext-2-raw-v1",
    cache_dir="/content/hf_cache",
)

# Load the tokenizer that matches the checkpoint and reuse its
# end-of-sequence token as the padding token.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def clean_and_tokenize(example):
    """Tokenize one dataset row, yielding empty features for blank text.

    Args:
        example: A row with a "text" string field.

    Returns:
        A mapping with "input_ids" and "attention_mask". Both are empty lists
        when the text is blank, so such rows can be filtered out afterwards.
    """
    text = example["text"]
    if not text.strip():
        return {"input_ids": [], "attention_mask": []}
    # No padding here: rows are later concatenated and cut into fixed-size
    # blocks. Padding each row to the block length would turn that grouping
    # into a no-op and fill the training labels with pad (= EOS) tokens that
    # the model would then learn to predict.
    return tokenizer(
        text,
        truncation=True,
        max_length=128,
        return_attention_mask=True,
    )

# Tokenize row by row, replacing the "text" column with token features, then
# discard the rows that came back with no tokens.
tokenized_datasets = dataset.map(
    clean_and_tokenize,
    batched=False,
    remove_columns=["text"],
).filter(lambda x: len(x["input_ids"]) > 0)

# Sanity check: inspect the first tokenized training row.
print(tokenized_datasets["train"][0])


In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
from itertools import chain

# Group the tokenized data into fixed-length chunks for training, then
# initialize the model.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-row token lists.
            Must contain an "input_ids" column.

    Returns:
        A mapping with the same columns, each a list of ``block_size``-long
        chunks, plus a "labels" column holding a copy of the "input_ids"
        chunks. Tokens left over after the last full block are dropped.
    """
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list on every addition and is quadratic in the batch size.
    concatenated = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Round down to a whole number of blocks.
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    # Copy each chunk so "labels" never aliases the "input_ids" lists.
    result["labels"] = [chunk[:] for chunk in result["input_ids"]]
    return result

# Re-chunk every split into fixed-length blocks, one batch of rows at a time.
lm_dataset = tokenized_datasets.map(function=group_texts, batched=True)

# Load the pretrained GPT-2 weights and size the embedding matrix to the
# tokenizer's vocabulary.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path="gpt2")
model.resize_token_embeddings(new_num_tokens=len(tokenizer))


In [None]:
# training the model
from transformers import Trainer, TrainingArguments

import torch

# fp16 mixed precision needs a GPU; requesting it unconditionally makes
# TrainingArguments raise on CPU-only machines, so enable it only when CUDA
# is available.
use_fp16 = torch.cuda.is_available()

# Evaluate and checkpoint once per epoch, keeping only the latest checkpoint.
training_args = TrainingArguments(
    output_dir="./gpt2-wikitext2",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
    report_to="none",
    fp16=use_fp16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    processing_class=tokenizer,
)

trainer.train()

In [None]:
# Evaluating perplexity and top k accuracy
import math
import torch
import numpy as np
from evaluate import load

# Reference point only: this scores the *pretrained* "gpt2" hub checkpoint on a
# single prompt. It does not measure the model trained in this notebook.
perplexity = load("perplexity")
results = perplexity.compute(model_id='gpt2', predictions=["The capital of France is"])
print("Pretrained gpt2 perplexity (single prompt):", results)

# Perplexity of the fine-tuned model: exp of the mean loss on the validation split.
eval_metrics = trainer.evaluate()
try:
    finetuned_perplexity = math.exp(eval_metrics["eval_loss"])
except OverflowError:
    # A very large loss overflows exp(); report it as infinite perplexity.
    finetuned_perplexity = float("inf")
print("Fine-tuned perplexity (validation):", finetuned_perplexity)

def compute_top_k_accuracy(logits, labels, k=5, ignore_index=-100):
    """Return the fraction of positions whose label is among the top-k predictions.

    Args:
        logits: Tensor of scores whose last dimension indexes the classes.
        labels: Integer tensor of target class ids with the shape of ``logits``
            minus its last dimension.
        k: Number of highest-scoring classes to accept as a hit. Values larger
            than the number of classes are clamped to it.
        ignore_index: Label value marking positions to leave out of the average.

    Returns:
        The top-k accuracy as a Python float in [0, 1]; 0.0 when there is no
        position to score.
    """
    scored = labels.ne(ignore_index)
    if not scored.any():
        # Nothing to average over; avoid the NaN produced by mean() of an empty tensor.
        return 0.0
    # torch.topk raises when k exceeds the size of the selected dimension.
    k = min(k, logits.size(-1))
    top_k_preds = torch.topk(logits, k, dim=-1).indices
    hits = top_k_preds.eq(labels.unsqueeze(-1)).any(dim=-1)
    return hits[scored].float().mean().item()

# Switch the model to evaluation mode before scoring.
model.eval()

# Build the tensors on whichever device currently holds the model weights.
device = next(model.parameters()).device

# Spot check on the first validation block only.
sample = lm_dataset["validation"][0]
inputs = torch.tensor([sample["input_ids"]], device=device)
labels = torch.tensor([sample["labels"]], device=device)

with torch.no_grad():
    outputs = model(inputs)

# Shift by one so the logits at position t are compared with the token at t + 1.
logits = outputs.logits[:, :-1, :]
labels = labels[:, 1:]
top_k_acc = compute_top_k_accuracy(logits[0], labels[0])
print(f"Top-K Accuracy: {top_k_acc * 100:.3f}% ")

In [None]:
# Write the trained model and the tokenizer into the same directory.
FINAL_MODEL_DIR = "./gpt2-wikitext2-final"
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)

In [None]:
# reloading model to test
from transformers import GPT2LMHeadModel

# Load the model back from the saved directory to confirm it round-trips.
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path="./gpt2-wikitext2-final",
)
# Put the reloaded model in evaluation mode.
model.eval()

In [None]:
from transformers import pipeline, AutoTokenizer

from transformers import set_seed

# Sampling is stochastic; fix the seed so the generated text is reproducible.
set_seed(42)

# Reload the model and tokenizer from the saved directory.
tokenizer = AutoTokenizer.from_pretrained("./gpt2-wikitext2-final")
model = GPT2LMHeadModel.from_pretrained("./gpt2-wikitext2-final")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Single prompt, sampling from the 20 most likely tokens at each step.
prompt = "The theory of relativity states that"
outputs = generator(prompt, max_new_tokens=100, do_sample=True, top_k=20)

print(outputs[0]["generated_text"])

prompts = [
    "The theory of relativity states that",
    "Quantum computers are expected to",
    "Artificial intelligence can help in",
    "The capital of India is",
]

# Wider candidate pool (top_k=40) combined with temperature=0.8. A temperature
# below 1 sharpens the distribution, shifting probability towards the more
# likely tokens.
for prompt in prompts:
    completion = generator(
        prompt,
        max_new_tokens=50,
        do_sample=True,
        top_k=40,
        temperature=0.8,
    )
    print("Prompt:", prompt)
    # The pipeline returns a list of dicts; print the text, not the raw container.
    print("GPT-2 :", completion[0]["generated_text"])
