In [1]:
import torch
from transformers import GPT2Config, GPT2LMHeadModel
from torch.utils.data import DataLoader, Dataset
from transformers import get_scheduler
import matplotlib.pyplot as plt
import torch.nn.functional as F
import os
from funciones import *
from torch.nn.utils.rnn import pad_sequence

import warnings
warnings.filterwarnings("ignore")

import wandb
import gc

In [2]:
dataset_name = "Datos/AneuxSplines/zero-root/tokenized/p15/train"

TRAIN = True
WANDB_UPLOAD = True

vocab_size = 258        # 256 : EOS token , 257 : pad token
max_size = 2256 + 2
pad_token = 257
eos_token = 256

epochs = 50000
lr = 1e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
class TokenDataset(Dataset):

    def __init__(self, folder_path):

        self.samples = []
        self._load_files(folder_path)

    def _load_files(self, folder_path):

        files = os.listdir(folder_path)

        for file_name in files:

            file_path = os.path.join(folder_path, file_name)
            self.samples.append(torch.load(file_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):

        eos = torch.tensor([eos_token])
        seq = torch.cat((eos, self.samples[idx], eos))

        return torch.tensor(seq, dtype = torch.long)

def custom_collate(batch, pad_token_id = 257):
    return pad_sequence(batch, batch_first = True, padding_value = pad_token_id)

def create_attention_mask(batch, pad_token_id):
    return (batch != pad_token_id).long()  # 1 for real tokens, 0 for padding

def create_gpt2_model(vocab_size, max_size, pad_token):
    
    config = GPT2Config(

        vocab_size = vocab_size,
        n_embd = 512,  # Size of embeddings
        n_layer = 6,   # Number of layers
        n_head = 8,    # Number of attention heads
        n_positions = max_size,  # Increase max sequence length
        n_ctx = max_size, 
        pad_token_id = pad_token
    )

    return GPT2LMHeadModel(config)

dataset = TokenDataset(dataset_name)
dataloader = DataLoader(dataset, batch_size = 4, collate_fn = custom_collate, shuffle = False)

print(len(dataset))

415


In [4]:
max = 0
for seq in dataset: 
    if len(seq) > max: max = len(seq)

print("Largest sequence length :", max)

Largest sequence length : 2258


**Training**

In [5]:
avg_losses = []
errors = []

if TRAIN:

    if WANDB_UPLOAD:

        wandb.login(key = "2511bccb1c20c8149e91d2ff7ad5b57fab7df870")
        wandb.init(project = "gpt2", entity = "vesselgpt")

        wandb.config.update({
            "learning_rate": lr,
            "epochs": epochs,
            "dataset": dataset_name,
            "dataset_size": len(dataset),
            "vocab_size": vocab_size
        })

    model = create_gpt2_model(vocab_size, max_size, pad_token)
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr = lr)
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * epochs)
    best_loss = float('inf') 

    model.train()

    for epoch in range(epochs): 

        total_loss = 0
        for _, batch in enumerate(dataloader):
            batch = batch.to(device)
            attention_mask = create_attention_mask(batch, pad_token).to(device)  
            outputs = model(batch, labels = batch, attention_mask = attention_mask)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            del outputs, loss, batch
            gc.collect()

        avg_loss = total_loss / len(dataloader)
        avg_losses.append(avg_loss)

        print(f"Epoch {epoch} | Avg Loss: {avg_loss}")
        if WANDB_UPLOAD: wandb.log({"epoch": epoch, "avg_loss": avg_loss})

        # save best model

        current_lr = optimizer.param_groups[0]['lr']
        best_loss = save_best_model_gpt2(model, optimizer, epoch, avg_loss, best_loss, "models/gpt2/aneux_splines_zero_root_batch4")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\lab03\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mpaufeldman[0m ([33mvesselgpt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch 0 | Avg Loss: 4.050425187899516
Epoch 1 | Avg Loss: 3.899439722299576
Epoch 2 | Avg Loss: 3.813620260128608
Epoch 3 | Avg Loss: 3.770460220483633
Epoch 4 | Avg Loss: 3.740083621098445
Epoch 5 | Avg Loss: 3.714258556182568
Epoch 6 | Avg Loss: 3.687500848219945
Epoch 7 | Avg Loss: 3.658993184566498
Epoch 8 | Avg Loss: 3.627289531322626
Epoch 9 | Avg Loss: 3.592370961721127
Epoch 10 | Avg Loss: 3.547616665179913
Epoch 11 | Avg Loss: 3.501098068860861
Epoch 12 | Avg Loss: 3.4500711927047143
Epoch 13 | Avg Loss: 3.398168994830205
Epoch 14 | Avg Loss: 3.347817634160702
Epoch 15 | Avg Loss: 3.30102824477049
Epoch 16 | Avg Loss: 3.2509498596191406
Epoch 17 | Avg Loss: 3.2028310046746182
Epoch 18 | Avg Loss: 3.1505134976827183
Epoch 19 | Avg Loss: 3.1044357396089115
Epoch 20 | Avg Loss: 3.0607968660501332
Epoch 21 | Avg Loss: 3.0120168374134946
Epoch 22 | Avg Loss: 2.968172392019859
Epoch 23 | Avg Loss: 2.9144148184702945
Epoch 24 | Avg Loss: 2.8596445184487562
Epoch 25 | Avg Loss: 2.8089

KeyboardInterrupt: 