In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# Step 1: Load the GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set the padding token to eos_token (or add a new one)
tokenizer.pad_token = tokenizer.eos_token
# Alternatively: tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer)) if using add_special_tokens

# Step 2: Create a Lightning DataModule
class TextDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, test_dataset, tokenizer, batch_size=2, max_length=512):
        super().__init__()
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_length = max_length

    def setup(self, stage=None):
        # Tokenize the dataset
        def tokenize_function(examples):
            input_encodings = self.tokenizer(
                examples['input'], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt'
            )
            output_encodings = self.tokenizer(
                examples['output'], padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt'
            )
            # Add the labels (output IDs) to the input
            input_encodings['labels'] = output_encodings['input_ids']
            return input_encodings

        self.train_dataset = self.train_dataset.map(tokenize_function, batched=True)
        self.test_dataset = self.test_dataset.map(tokenize_function, batched=True)

        self.train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        self.test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Step 3: Create a Lightning Module for fine-tuning GPT-2
class GPT2FineTuner(pl.LightningModule):
    def __init__(self, model_name='gpt2', learning_rate=2e-5):
        super().__init__()
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model.resize_token_embeddings(len(self.tokenizer))
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return outputs

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        val_loss = outputs.loss
        self.log('val_loss', val_loss, prog_bar=True, logger=True)
        return val_loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

# Step 4: Load the dataset
dataset = load_dataset('csv', data_files='fine_tuning_dataset.csv')
train_test_split = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Step 5: Define the data module
batch_size = 2
data_module = TextDataModule(train_dataset, test_dataset, tokenizer=tokenizer, batch_size=batch_size)

from pytorch_lightning.callbacks import ModelCheckpoint

# Define a ModelCheckpoint callback to save in the specified Lightning AI directory
checkpoint_callback = ModelCheckpoint(
    dirpath='/teamspace/studios/this_studio/gpt2_finetuned/',  # Save model checkpoints to your studio directory
    filename='gpt2-finetuned-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,  # Save only the best model
    monitor='val_loss',  # Track the validation loss
    mode='min',  # Minimize the validation loss
    verbose=True
)

# Step 6: Update Trainer to include the checkpoint callback
trainer = pl.Trainer(
    max_epochs=3,
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    devices=1,
    default_root_dir='/teamspace/studios/this_studio/gpt2_finetuned/',  # Set the default root dir to your studio path
    callbacks=[checkpoint_callback],  # Add the checkpoint callback here
    log_every_n_steps=10,
)



Generating train split: 0 examples [00:00, ? examples/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [2]:
# Step 7: Train the model
model = GPT2FineTuner()
trainer.fit(model, datamodule=data_module)

You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Map:   0%|          | 0/8762 [00:00<?, ? examples/s]

Map:   0%|          | 0/2191 [00:00<?, ? examples/s]

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /teamspace/studios/this_studio/gpt2_finetuned exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params | Mode
-------------------------------------------------
0 | model | GPT2LMHeadModel | 124 M  | eval
-------------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
497.759   Total estimated model params size (MB)
0         Modules in train mode
164       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 4381: 'val_loss' reached 0.79960 (best 0.79960), saving model to '/teamspace/studios/this_studio/gpt2_finetuned/gpt2-finetuned-epoch=00-val_loss=0.80.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 8762: 'val_loss' reached 0.78613 (best 0.78613), saving model to '/teamspace/studios/this_studio/gpt2_finetuned/gpt2-finetuned-epoch=01-val_loss=0.79.ckpt' as top 1


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 13143: 'val_loss' reached 0.78220 (best 0.78220), saving model to '/teamspace/studios/this_studio/gpt2_finetuned/gpt2-finetuned-epoch=02-val_loss=0.78.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=3` reached.


In [3]:
trained_model = GPT2FineTuner.load_from_checkpoint('/teamspace/studios/this_studio/gpt2_finetuned/gpt2-finetuned-epoch=02-val_loss=0.78.ckpt')


In [5]:
import torch

# Improved input context
input_text = "Henriques to Kedar Jadhav, bowls a short ball outside off-stump"

# Encode the input and create the attention mask
input_ids = trained_model.tokenizer.encode(input_text, return_tensors='pt')
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)  # All tokens are valid, no padding needed here

# Generate text with adjusted parameters and sampling enabled
output = trained_model.model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=1000,  # Generate up to 50 tokens
    temperature=0.9,  # Control randomness in generation
    top_k=50,  # Use top-k sampling to limit the number of considered next words
    top_p=0.95,  # Use nucleus sampling for more coherent results
    pad_token_id=trained_model.tokenizer.pad_token_id,
    do_sample=True,  # Enable sampling for text generation
)

# Decode and print the generated output
generated_text = trained_model.tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Generated Output: {generated_text}")


Generated Output: Henriques to Kedar Jadhav, bowls a short ball outside off-stump the a onav the the. the's J was the it the and of the that and.
