Training a GPT-2 Model. This notebook provides step-by-step code cells to train a GPT-2 model on your own dataset. Make sure all necessary packages are installed and import correctly before running the cells.

Let's define the hyperparameters for training our model.

Define all the helper functions necessary for the training.

Let's start by loading and processing the data.

Now, let's tokenize the data.

Prepare the dataset and the data loader.

Now we can train our model.

Finally, we save our trained model.

In [None]:
import os
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

In [None]:
# Training hyperparameters (module-level constants read by the cells below).
BATCH_SIZE = 8  # number of examples per DataLoader batch
EPOCHS = 1  # number of passes over the dataloader made by train()
# NOTE(review): 1e-3 looks high for fine-tuning a pretrained model
# (values around 5e-5 are more usual) — confirm before a long run.
LR = 0.001  # learning rate handed to the optimizer in prepare_model()
EPS = 1e-8  # epsilon handed to the optimizer in prepare_model()

In [None]:

def load_and_process_data(data_path, encoding='utf-8'):
    """Read the whole training corpus from a text file.

    Args:
        data_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(data_path, 'r', encoding=encoding) as file:
        return file.read()

def tokenize_data(tokenizer, text):
    """Encode ``text`` into token ids using ``tokenizer``.

    Args:
        tokenizer: Object exposing ``encode(text, return_tensors=...)``.
        text: The raw string to encode.

    Returns:
        Whatever ``tokenizer.encode`` returns when asked for ``'pt'``
        tensors; the value is passed through unchanged.
    """
    return tokenizer.encode(text, return_tensors='pt')


def prepare_model(device, lr, eps):
    """Load the pretrained 'gpt2' model and build its optimizer.

    Args:
        device: Device the model is moved to.
        lr: Learning rate for the optimizer.
        eps: Epsilon (numerical-stability term) for the optimizer.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model = model.to(device)
    # Use PyTorch's AdamW: the AdamW class exported by transformers is
    # deprecated in favour of it. weight_decay=0.0 keeps the default of the
    # transformers implementation (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=lr, eps=eps, weight_decay=0.0
    )
    return model, optimizer

def perform_training_step(device, model, optimizer, batch):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        device: Device the batch tensors are moved to before the forward pass.
        model: Model called as ``model(**inputs)``. When ``labels`` are
            supplied, its first output must be the scalar loss.
        optimizer: Optimizer stepped after the backward pass.
        batch: One of
            * a mapping of keyword name to tensor,
            * a tuple/list whose first element is the ``input_ids`` tensor
              (the shape of batch a ``TensorDataset`` produces), or
            * a bare ``input_ids`` tensor.

    Returns:
        The loss of this batch as a Python float.
    """
    # Normalise every supported batch layout to keyword arguments on `device`.
    if hasattr(batch, 'items'):
        inputs = {k: v.to(device) for k, v in batch.items()}
    elif isinstance(batch, (list, tuple)):
        inputs = {'input_ids': batch[0].to(device)}
    else:
        inputs = {'input_ids': batch.to(device)}

    # Without labels the model returns logits first, not a loss, and
    # backward() would fail. For language modelling the targets are the
    # inputs themselves.
    if 'labels' not in inputs and 'input_ids' in inputs:
        inputs['labels'] = inputs['input_ids']

    model.zero_grad()
    outputs = model(**inputs)
    loss = outputs[0]  # first output is the loss because labels were passed
    loss.backward()
    optimizer.step()
    return loss.item()

def train(device, model, optimizer, dataloader, epochs=None):
    """Train ``model`` over ``dataloader`` for a number of epochs.

    Args:
        device: Device forwarded to ``perform_training_step``.
        model: Model to train; switched to training mode first.
        optimizer: Optimizer forwarded to ``perform_training_step``.
        dataloader: Iterable yielding the batches of one epoch.
        epochs: Number of passes over ``dataloader``. When ``None`` (the
            default) the module-level ``EPOCHS`` constant is used.

    Returns:
        None. The loss of every 100th batch (starting with the first batch of
        each epoch) is printed.
    """
    if epochs is None:
        epochs = EPOCHS
    model.train()
    for _epoch in range(epochs):
        for idx, batch in enumerate(dataloader):
            loss = perform_training_step(device, model, optimizer, batch)
            if idx % 100 == 0:
                print(f'Current loss: {loss}')

def save_model(model, model_dir):
    """Persist ``model`` by delegating to its ``save_pretrained`` method.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        model_dir: Target directory passed through to ``save_pretrained``.
    """
    persist = model.save_pretrained
    persist(model_dir)



In [None]:
# Path of the plain-text training corpus; the file is read fully into memory
# as one string.
data_path = './data.txt'
data = load_and_process_data(data_path)

In [None]:

# Load the pretrained 'gpt2' tokenizer and encode the whole corpus in one call.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenized_data = tokenize_data(tokenizer, data)


In [None]:

sequence_length = 128  # You can change this value depending on your needs

# tokenized_data carries a leading batch axis of size 1; work on the flat
# sequence of token ids.
token_ids = tokenized_data[0]

# Cut the corpus into consecutive, non-overlapping chunks of exactly
# `sequence_length` tokens. A shorter trailing remainder is dropped so that
# every example has the same length and batches can be stacked.
data_sequences = [
    token_ids[start:start + sequence_length]
    for start in range(0, len(token_ids) - sequence_length + 1, sequence_length)
]
if not data_sequences:
    raise ValueError(
        f'Need at least {sequence_length} tokens to build one training '
        f'sequence, got {len(token_ids)}.'
    )

# One dict per example so each batch can be passed to the model as keyword
# arguments. For causal language modelling the labels are the input ids
# themselves (GPT2LMHeadModel shifts them internally).
dataset = [{'input_ids': seq, 'labels': seq} for seq in data_sequences]
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE)


In [None]:

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the model and optimizer from the LR / EPS constants, then run training.
model, optimizer = prepare_model(device, LR, EPS)
train(device, model, optimizer, dataloader)


In [None]:
# Directory handed to save_model(), which forwards it to model.save_pretrained().
model_dir = './model/'
save_model(model, model_dir)