In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.model_selection import train_test_split
from utils import ShakespeareDataset

In [2]:
# --- Output locations for the fine-tuned artifacts --------------------------
model_path = './tuned_model'
tokenizer_path = './tokenizer'

# --- Training hyperparameters -----------------------------------------------
batch_size = 16   # samples per DataLoader batch
epochs = 10       # number of passes over the training split
# NOTE(review): the training cell passes lr=5e-5 directly to AdamW, so this
# value is not the one currently in effect -- reconcile the two.
learning_rate = 5e-4

In [3]:
# Checkpoint identifier to fine-tune. The LM-head model and its tokenizer are
# both loaded from this same identifier.
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [4]:
# Use the end-of-sequence token as the padding token.
# NOTE(review): presumably needed so ShakespeareDataset can pad sequences to a
# common length -- confirm against utils.ShakespeareDataset.
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Build the dataset from the 'PlayerLine' column of the Shakespeare CSV,
# handing it the tokenizer configured above.
my_dataset = ShakespeareDataset(
    path='data/Shakespeare_data.csv',
    col='PlayerLine',
    tokenizer=tokenizer,
)

In [6]:
# Hold out 20% of the samples for validation; the fixed seed makes the split
# reproducible across runs.
train_data, val_data = train_test_split(
    my_dataset,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Batch the training split. Reshuffle on every epoch so the model does not see
# the batches in the same fixed order each pass; the seeded generator keeps the
# shuffling order reproducible when this cell is re-run.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

In [None]:
def _causal_lm_loss(batch):
    """Run one forward pass on ``batch`` and return the language-modelling loss.

    Args:
        batch: mapping with 'input_ids' and 'attention_mask' tensors, as
            yielded by the DataLoaders in this notebook.

    Returns:
        The scalar loss tensor reported by the model for this batch.

    The labels are the input ids with every position whose attention mask is 0
    replaced by -100, the label index the Hugging Face loss ignores. Because
    this notebook sets pad_token to eos_token, padding cannot be identified by
    token id, so the attention mask is used instead.
    NOTE(review): assumes ShakespeareDataset marks padding with 0 in
    'attention_mask' -- confirm against utils.ShakespeareDataset.
    """
    # Move the batch to wherever the model lives (a no-op when both are on CPU).
    input_ids = batch['input_ids'].to(model.device)
    attention_mask = batch['attention_mask'].to(model.device)
    # masked_fill returns a new tensor, so input_ids itself is left untouched.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss


# NOTE(review): the config cell defines learning_rate = 5e-4, but this cell has
# been training with 5e-5; the value in effect is kept here -- reconcile.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Validate in batches, the same way training is batched, instead of feeding the
# held-out samples to the model one at a time.
val_dataloader = DataLoader(val_data, batch_size=batch_size)

for epoch in range(epochs):
    # ---- Training ----
    model.train()
    total_loss = 0.0
    for batch in train_dataloader:
        # Clear gradients first so an interrupted previous run of this cell
        # cannot leak stale gradients into the first step.
        optimizer.zero_grad()

        loss = _causal_lm_loss(batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Average Loss: {average_loss:.4f}")

    # ---- Validation ----
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            total_val_loss += _causal_lm_loss(batch).item()

    # Mean of the per-batch losses, mirroring how the training average is computed.
    average_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Validation Average Loss: {average_val_loss:.4f}")

In [None]:
# Persist the fine-tuned model and the tokenizer to the directories set in the
# config cell (model_path and tokenizer_path).
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)