In [1]:
!pip install transformers datasets torch


Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
import torch
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

In [3]:
# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [14]:
# Load a poetry dataset from Hugging Face
dataset = load_dataset("biglam/gutenberg-poetry-corpus")


In [18]:
dataset.keys()

dict_keys(['train'])

In [19]:
if "train" in dataset:
    dataset = dataset["train"]
else:
    raise KeyError("Dataset does not contain a 'train' split")

In [20]:
# Shuffle and select 50k samples for faster training
dataset = dataset.shuffle(seed=42).select(range(min(50000, len(dataset))))

In [21]:
# Remove unnecessary columns
dataset = dataset.remove_columns(["gutenberg_id"])

In [22]:
# Split dataset into train and eval
split_dataset = dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


In [23]:
def tokenize_function(examples):
    tokenized = tokenizer(examples["line"], padding="max_length", truncation=True, max_length=128)
    tokenized["labels"] = tokenized["input_ids"].copy()  # Add labels for causal LM
    return tokenized

In [24]:
# Tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/45000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [25]:
# In cell 29, include 'labels' in the columns:

tokenized_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [26]:
# Load GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [27]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-poetry",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=200,
)



In [28]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

In [29]:
# Train the model
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnjnavy9852[0m ([33mnjnavy9852-navneet-and-avneet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,0.3769,0.372907
2,0.3443,0.371165
3,0.3282,0.373558


TrainOutput(global_step=16875, training_loss=0.3562161404079861, metrics={'train_runtime': 5622.3395, 'train_samples_per_second': 24.011, 'train_steps_per_second': 3.001, 'total_flos': 8818606080000000.0, 'train_loss': 0.3562161404079861, 'epoch': 3.0})

In [31]:
# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/gpt2-poetry")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/gpt2-poetry")

print("Fine-tuning complete! Model saved at ./gpt2-poetry")

Fine-tuning complete! Model saved at ./gpt2-poetry


In [41]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained("./gpt2-poetry")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-poetry")

def generate_poetry(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    output = model.generate(
        **inputs,
        max_new_tokens=150,  # Ensure more tokens are generated
        do_sample=True,
        temperature=1.2,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.05,  # Less strict repetition penalty
        pad_token_id=tokenizer.eos_token_id  # Explicitly set pad token ID
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(generate_poetry("Once upon a midnight dreary,"))



Once upon a midnight dreary,


In [42]:
trainer.state.log_history

[{'loss': 0.5199,
  'grad_norm': 0.8728024363517761,
  'learning_rate': 4.940740740740741e-05,
  'epoch': 0.035555555555555556,
  'step': 200},
 {'loss': 0.4022,
  'grad_norm': 0.5241988301277161,
  'learning_rate': 4.881481481481482e-05,
  'epoch': 0.07111111111111111,
  'step': 400},
 {'loss': 0.4017,
  'grad_norm': 0.4632166922092438,
  'learning_rate': 4.8222222222222225e-05,
  'epoch': 0.10666666666666667,
  'step': 600},
 {'loss': 0.3969,
  'grad_norm': 0.5035160779953003,
  'learning_rate': 4.762962962962963e-05,
  'epoch': 0.14222222222222222,
  'step': 800},
 {'loss': 0.3898,
  'grad_norm': 0.5122094750404358,
  'learning_rate': 4.703703703703704e-05,
  'epoch': 0.17777777777777778,
  'step': 1000},
 {'loss': 0.3853,
  'grad_norm': 0.60881507396698,
  'learning_rate': 4.644444444444445e-05,
  'epoch': 0.21333333333333335,
  'step': 1200},
 {'loss': 0.3914,
  'grad_norm': 0.4130977988243103,
  'learning_rate': 4.585185185185185e-05,
  'epoch': 0.24888888888888888,
  'step': 140