In [None]:
from datasets import load_dataset, DatasetDict

# Load the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
# Later cells index the result by its "train", "validation" and "test" splits.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [None]:
# Inspect the loaded dataset object before tokenizing it.
print(dataset)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the pretrained "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without a pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize the "text" column batch-wise, then drop the raw text so only the
# tokenizer's output columns remain.
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
).remove_columns(["text"])
print(tokenized_dataset)

In [None]:
from itertools import chain

# Number of tokens per training example produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-split it into fixed-size blocks.

    Args:
        examples: Batched columns as a mapping of column name to a list of
            lists. It must contain an "input_ids" column, whose concatenated
            length decides how many full blocks are produced.

    Returns:
        dict: The same keys as ``examples``, each mapped to a list of chunks of
        exactly ``block_size`` items, plus a "labels" key holding a shallow
        copy of the "input_ids" chunk list. Items left over after the last
        full block are dropped, so a batch shorter than ``block_size`` yields
        empty lists.
    """
    # chain.from_iterable flattens in linear time; sum(rows, []) re-copies the
    # growing accumulator for every row and is quadratic in the batch size.
    concatenated = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Round down to a whole number of blocks; the remainder is discarded.
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    # Labels start out as the inputs themselves.
    result["labels"] = result["input_ids"].copy()
    return result

# Apply group_texts batch by batch. Each batch is concatenated and re-split on
# its own, so the tokens left over at the end of every batch are dropped.
lm_dataset = tokenized_dataset.map(group_texts, batched=True)

In [None]:
# Inspect the block-grouped dataset that will be fed to the Trainer.
print(lm_dataset)

In [None]:
from transformers import GPT2LMHeadModel

# Start from the pretrained "gpt2" checkpoint with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Size the embedding matrix to the tokenizer's length.
# NOTE(review): the visible cells only alias pad_token to eos_token and add no
# new tokens, so this is presumably a no-op — confirm.
model.resize_token_embeddings(len(tokenizer))

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./gpt2-finetuned-wikitext",
    logging_dir="./logs",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same per-epoch schedule, and ask for the
    # best model to be loaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# mlm=False: collate for language modeling without masked-LM masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Train on the "train" split and evaluate on "validation" during training.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
)
trainer.train()


In [None]:
import math

# Evaluate on the held-out "test" split and report perplexity as exp of the
# returned "eval_loss" (assumes that loss is a mean per-token cross-entropy in
# nats — confirm).
eval_results = trainer.evaluate(lm_dataset['test'])
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")


In [None]:
import torch

def compute_top_k_accuracy(model, tokenizer, dataset, k=5):
    """Fraction of examples whose final token is among the model's top-k guesses.

    For each example the model receives every token except the last one, and
    the logits at the final input position are checked against the held-out
    last token. Only that single position per example is scored.

    Args:
        model: Module that is called with a ``(1, seq_len)`` tensor of token
            ids and returns an object whose ``.logits`` can be indexed as
            ``[batch, position]``. It is switched to eval mode and moved to
            CUDA when available, otherwise to CPU.
        tokenizer: Unused; kept so existing callers keep working.
        dataset: Iterable of mappings with an "input_ids" sequence of ints.
        k: How many highest-scoring tokens count as a hit.

    Returns:
        float: hits / scored examples, or 0.0 when no example could be scored
        (empty dataset, or every example shorter than two tokens).
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    correct = 0
    total = 0
    # One no-grad context for the whole pass instead of re-entering per example.
    with torch.no_grad():
        for example in dataset:
            token_ids = example["input_ids"]
            # At least one context token and one target token are required.
            if len(token_ids) < 2:
                continue
            inputs = torch.tensor([token_ids[:-1]]).to(device)
            target = token_ids[-1]
            logits = model(inputs).logits[0, -1]  # logits at the last input position
            top_k = torch.topk(logits, k).indices
            if bool((top_k == target).any()):
                correct += 1
            total += 1
    # Avoid ZeroDivisionError when nothing was scored.
    return correct / total if total else 0.0

# Score the fine-tuned model on the "validation" split with the default k=5,
# which matches the "Top-5" label printed below.
top_k_acc = compute_top_k_accuracy(model, tokenizer, lm_dataset["validation"])
print(f"Top-5 Accuracy: {top_k_acc:.2%}")

In [None]:
# Save final artifacts
# The model and the tokenizer are written to the same local directory.
trainer.save_model("my-finetuned-gpt2")
tokenizer.save_pretrained("my-finetuned-gpt2")

# Push to Hub
from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload from the directory just written. This rebinds the notebook-level
# `model` and `tokenizer` names to the reloaded objects.
model = AutoModelForCausalLM.from_pretrained("my-finetuned-gpt2")
tokenizer = AutoTokenizer.from_pretrained("my-finetuned-gpt2")

# NOTE(review): pushing presumably requires being authenticated with write
# access to this repository — confirm before running.
model.push_to_hub("koushik-25/my-finetuned-gpt2")
tokenizer.push_to_hub("koushik-25/my-finetuned-gpt2")


In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the same repository id that the
# save-and-push cell uploads to.
generator = pipeline("text-generation", model="koushik-25/my-finetuned-gpt2")


In [None]:
def text_generator(generator, max_length, input_text):
    """Run ``generator`` on a prompt and return the first generated text.

    Args:
        generator: Callable invoked as ``generator(input_text, max_length=...)``.
            It must return an indexable whose first element has a
            'generated_text' key.
        max_length: Forwarded unchanged as the ``max_length`` keyword.
        input_text: Prompt passed as the first positional argument.

    Returns:
        The 'generated_text' value of the first returned candidate.
    """
    candidates = generator(input_text, max_length=max_length)
    first_candidate = candidates[0]
    return first_candidate['generated_text']

In [None]:
# Smoke-test the pipeline: generate from a short prompt with max_length=30.
input_text = "Once upon a time,"
output = text_generator(generator, 30, input_text)
print(output)