In [3]:
import os
import csv
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW

class StoriesDataset(Dataset):
    def __init__(self, dataset_path):
        super().__init__()
        self.story_list = []
        self.end_of_text_token = "<|endoftext|>"

        # Increase the CSV field size limit
        import sys
        csv.field_size_limit(sys.maxsize)

        with open(dataset_path, encoding="utf-8") as csv_file:
            csv_reader = csv.reader(csv_file)
            next(csv_reader)  # Skip header
            for row in csv_reader:
                story = f"STORY: {row[1]} {self.end_of_text_token}"
                self.story_list.append(story)

    def __len__(self):
        return len(self.story_list)

    def __getitem__(self, idx):
        return self.story_list[idx]


# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Load the dataset
data_path = "/kaggle/input/1002-short-stories-from-project-guttenberg/stories.csv"  # Update with your file path
dataset = StoriesDataset(data_path)
data_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tuning
EPOCHS = 10
SAVE_PATH = "gpt2_finetuned.pt"

model.train()
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    for batch_idx, batch in enumerate(data_loader):
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (batch_idx + 1) % 10 == 0:
            print(f"Batch {batch_idx + 1}: Loss = {loss.item():.4f}")

    # Save checkpoint
    torch.save(model.state_dict(), SAVE_PATH)

# Generate text
def generate_story(prompt, max_length=200):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_beams=5,
            temperature=0.7,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
start_prompt = "Once upon a time in a dystopian world,"
generated_story = generate_story(start_prompt)
print("Generated Story:")
print(generated_story)


Epoch 1/10
Batch 10: Loss = 3.4116
Batch 20: Loss = 2.5131
Batch 30: Loss = 2.4119
Batch 40: Loss = 2.8294
Batch 50: Loss = 1.9235
Batch 60: Loss = 2.6701
Batch 70: Loss = 2.4486
Batch 80: Loss = 2.3809
Batch 90: Loss = 2.3415
Batch 100: Loss = 2.1501
Batch 110: Loss = 2.3296
Batch 120: Loss = 2.5440
Batch 130: Loss = 2.1042
Batch 140: Loss = 2.1422
Batch 150: Loss = 2.6855
Batch 160: Loss = 2.4946
Batch 170: Loss = 2.4416
Batch 180: Loss = 1.9870
Batch 190: Loss = 1.9298
Batch 200: Loss = 2.1563
Batch 210: Loss = 1.5314
Batch 220: Loss = 2.2553
Batch 230: Loss = 2.5210
Batch 240: Loss = 2.2991
Batch 250: Loss = 2.7824
Epoch 2/10
Batch 10: Loss = 1.9781
Batch 20: Loss = 1.8925
Batch 30: Loss = 2.3991
Batch 40: Loss = 2.2373
Batch 50: Loss = 2.0490
Batch 60: Loss = 1.7992
Batch 70: Loss = 1.8702
Batch 80: Loss = 2.3987
Batch 90: Loss = 2.4035
Batch 100: Loss = 2.1592
Batch 110: Loss = 3.1247
Batch 120: Loss = 1.7489
Batch 130: Loss = 2.1955
Batch 140: Loss = 2.1670
Batch 150: Loss = 2.3

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Story:
Once upon a time in a dystopian world, there was a man who was willing to sacrifice his own life for the betterment of mankind. And he did it with a smile on his face."



“Well,” said the man, “it’s all right to say that. I mean, you know, I'm not going to go into much detail about it, but I want you to know that this is the first story I ever wrote, and I've never written a story before in which I said, "I want to do this," and that was before I started doing this story. So I just wanted to make sure you all understood that I was just trying to be as honest with you as I could be, so that you could understand what I meant by the word 'humor' in that particular sentence.

   So here it is, here we go again, with all of the facts and figures in the story and with the
