A cute little demo showing the simplest usage of minGPT. Configured to run fine on a MacBook Air in about a minute.

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import random
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from rca_llm.utils import set_seed
from rca_llm.trainer import Trainer
# from src.HFmodelAdapter import HFmodelAdapter # make that work
set_seed(3407)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pickle
import pandas as pd

class RCADataset(Dataset):
    """
    Question/answer dataset about IT incidents, built from ITSM tickets.

    Input: Bonjour l'équipe IT, mon projecteur pose problème. Il n'affiche aucun signal chaque fois que ...
    Output: Merci pour votre signalement. Le problème était lié à une panne serveur...

    Each sample is fed to the transformer as the concatenation of input and
    output, with context markers added at the beginning and between the two:

    description du ticket itsm: Bonjour l'équipe IT, mon projecteur pose problème. Il n'affiche aucun signal chaque fois que ...
    Réponse de l'équipe IT pour la résolution du ticket: Merci pour votre signalement. Le problème était lié à une panne serveur...

    Args:
        df: DataFrame with 'ticket_description' and 'ticket_resolution' columns.
        split: "train" or "test".
        tokenizer: callable returning a mapping with an "input_ids" list; must
            expose `eos_token`, `eos_token_id` and `pad_token_id`.
        block_size: length of every returned tensor (sequences are truncated
            or left-padded to this size).
        test_frac: fraction of rows assigned to the test split.
        test_cap: optional upper bound on the number of test rows.
        split_seed: seed of the private RNG used for the train/test permutation.
            Instances built with the same `df` length, `test_frac`, `test_cap`
            and `split_seed` share the same permutation, so their "train" and
            "test" index sets are disjoint and together cover every row.
    """

    def __init__(self, df, split, tokenizer, block_size=1024, test_frac=0.2, test_cap=None, split_seed=3407):
        assert split in {"train", "test"}
        self.df = df.reset_index(drop=True)
        self.split = split
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.prompt_description_addition = "description du ticket itsm: "
        self.prompt_resolution_addition = "\nRéponse de l'équipe IT pour la résolution du ticket: "

        N = len(self.df)
        # Use a dedicated, seeded generator: drawing from the global RNG would give
        # the "train" and "test" instances two *different* permutations, letting
        # the same rows land in both splits.
        split_rng = torch.Generator().manual_seed(split_seed)
        perm = torch.randperm(N, generator=split_rng)

        num_test = int(N * test_frac)
        if test_cap is not None:
            num_test = min(num_test, test_cap)

        test_idx = perm[:num_test]
        train_idx = perm[num_test:]

        self.ixes = test_idx if split == "test" else train_idx

    def __len__(self):
        """Number of rows in this split."""
        return self.ixes.numel()

    def get_block_size(self):
        """Return `block_size - 1`."""
        # -1 because the last token does not ever plug back for prediction
        # NOTE(review): __getitem__ returns tensors of length `block_size`, not
        # `block_size - 1` — confirm which value the consumer of this method expects.
        return self.block_size - 1

    def __getitem__(self, i):
        """
        Build one (x, y) training pair for the dataloader.

        Tokenizes prompt and answer, concatenates, truncates to `block_size`,
        left-pads to `block_size`, and returns two long tensors of that length:
        x holds the token ids; y is a copy of x with prompt and padding
        positions set to -100 so that only answer tokens act as targets.
        """
        row_idx = int(self.ixes[i])
        question = str(self.df.loc[row_idx, 'ticket_description'])
        answer = str(self.df.loc[row_idx, 'ticket_resolution'])

        prompt = self.prompt_description_addition + question + self.prompt_resolution_addition
        # enforce EOS at the end of the answer if available; "" is only a fallback
        # for a tokenizer without eos_token (presumably ruled out upstream — verify)
        eos = self.tokenizer.eos_token or ""
        answer = answer + eos

        # tokenize without auto special tokens so we fully control the sequence
        prompt_token_ids = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
        answer_token_ids = self.tokenizer(answer, add_special_tokens=False)["input_ids"]

        if len(prompt_token_ids) >= self.block_size:
            # prompt alone fills the window: keep as much prompt as fits, drop the answer.
            # Every label of such a sample ends up as -100 (no supervised token).
            print(f"Warning: prompt length {len(prompt_token_ids)} >= block_size {self.block_size}. Truncating prompt.")
            prompt_token_ids = prompt_token_ids[:self.block_size]
            answer_token_ids = []
        else:
            full_length = len(prompt_token_ids) + len(answer_token_ids)
            if full_length > self.block_size:
                # keep the full prompt, then as much of the answer as fits
                print(f"Warning: full sequence length {full_length} > block_size {self.block_size}. Truncating answer.")
                free_contextual_window_space = self.block_size - len(prompt_token_ids)
                answer_token_ids = answer_token_ids[:free_contextual_window_space]

        full_sequence_token_ids = prompt_token_ids + answer_token_ids
        x = torch.tensor(full_sequence_token_ids, dtype=torch.long)

        # y: ignore prompt tokens; learn on answer tokens
        y = x.clone()
        y[:len(prompt_token_ids)] = -100

        # left-pad to block_size if needed; fall back to EOS when no pad token is set
        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
        pad_len = self.block_size - x.numel()
        if pad_len > 0:
            x = torch.cat([torch.full((pad_len,), pad_id, dtype=torch.long), x], dim=0)
            y = torch.cat([torch.full((pad_len,), -100, dtype=torch.long), y], dim=0)

        return x, y

In [None]:
# Select the pretrained checkpoint name and the compute device (GPU when available).
model_type = 'gpt2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'


# Load the pretrained Hugging Face causal-LM weights for `model_type`.
hf_model = GPT2LMHeadModel.from_pretrained(model_type)
#model = HFmodelAdapter(hf_model, model_type) make this work
# NOTE(review): `model` is not assigned in this cell while the adapter line above
# stays commented out, so the next line raises NameError unless `model` is
# defined elsewhere — TODO confirm and restore the adapter.
model.to(device)

In [3]:
# print an example instance of the dataset
# NOTE(review): `SortDataset` is not defined in the visible part of this notebook;
# the dataset class defined here is RCADataset(df, split, tokenizer, ...).
# The `df` and `tokenizer` objects it needs are not created in the visible cells
# either — TODO confirm where they come from and update these two lines.
train_dataset = SortDataset('train')
test_dataset = SortDataset('test')
x, y = train_dataset[0]
# show each input token id next to its training target
for a, b in zip(x,y):
    print(int(a),int(b))

2 -1
0 -1
0 -1
1 -1
1 -1
1 0
0 0
0 1
1 1
1 1
1 2


In [None]:
# This code initializes the model object by wrapping the HF model in a class called HFAdapter, which makes it compatible with the training loop defined for minGPT

# from transformers import GPT2LMHeadModel
# hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
# hf_model.resize_token_embeddings(10)               # << shrinks vocab to your digits
# hf_model.config.pad_token_id = hf_model.config.eos_token_id
# model = HFAdapter(hf_model)

In [None]:
# Start from the Trainer's default configuration and override a few fields.
train_config = Trainer.get_default_config()
# NOTE(review): the rationale below looks inherited from a small-model demo; the
# model loaded above is the pretrained 'gpt2' checkpoint — confirm 5e-4 is intended.
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
# Relies on `model` and `train_dataset` created in earlier cells.
trainer = Trainer(train_config, model, train_dataset)


In [None]:
def batch_end_callback(trainer):
    """Print iteration time (ms) and training loss on every 100th iteration.

    Reads `iter_num`, `iter_dt` and `loss` from `trainer`; does nothing on
    iterations that are not a multiple of 100.
    """
    step = trainer.iter_num
    if step % 100 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {step}: train loss {loss_value:.5f}")
# hook the logging function into the trainer's 'on_batch_end' event
trainer.set_callback('on_batch_end', batch_end_callback)

# iteration callback
# top_score = 0
# def batch_end_callback(trainer):
#     global top_score

#     if trainer.iter_num % 10 == 0:
#         print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")

#     if trainer.iter_num % 500 == 0:
#         # evaluate both the train and test score
#         train_max_batches = {1: None, 2: None, 3: 5}[config.data.ndigit] # if ndigit=2 we can afford the whole train set, ow no
#         model.eval()
#         with torch.no_grad():
#             train_score = eval_split(trainer, 'train', max_batches=train_max_batches)
#             test_score  = eval_split(trainer, 'test',  max_batches=None)
#         score = train_score + test_score
#         # save the model if this is the best score we've seen so far
#         if score > top_score:
#             top_score = score
#             print(f"saving model with new top score of {score}")
#             ckpt_path = os.path.join(config.system.work_dir, "model.pt")
#             torch.save(model.state_dict(), ckpt_path)
#         # revert model to training mode
#         model.train()

# trainer.set_callback('on_batch_end', batch_end_callback)


# launch training with the configuration and callback set up above
trainer.run()

iter_dt 0.00ms; iter 0: train loss 0.00327
iter_dt 26.96ms; iter 100: train loss 0.02028
iter_dt 25.73ms; iter 200: train loss 0.02236
iter_dt 33.71ms; iter 300: train loss 0.01246
iter_dt 27.47ms; iter 400: train loss 0.02870
iter_dt 28.32ms; iter 500: train loss 0.00278
iter_dt 45.71ms; iter 600: train loss 0.03206
iter_dt 26.38ms; iter 700: train loss 0.00522
iter_dt 28.91ms; iter 800: train loss 0.00695
iter_dt 27.86ms; iter 900: train loss 0.00521
iter_dt 27.65ms; iter 1000: train loss 0.00120
iter_dt 40.41ms; iter 1100: train loss 0.01663
iter_dt 27.01ms; iter 1200: train loss 0.01429
iter_dt 28.49ms; iter 1300: train loss 0.00135
iter_dt 29.14ms; iter 1400: train loss 0.01628
iter_dt 26.49ms; iter 1500: train loss 0.01357
iter_dt 26.22ms; iter 1600: train loss 0.00091
iter_dt 27.38ms; iter 1700: train loss 0.03434
iter_dt 26.37ms; iter 1800: train loss 0.00266
iter_dt 27.40ms; iter 1900: train loss 0.00617


In [11]:
# now let's perform some evaluation: put the model in eval mode first
# (the trailing ';' keeps the notebook from echoing the returned object)
model.eval();

In [None]:
def eval_split(trainer, split, max_batches):
    """Score greedy completions of the global `model` on one dataset split.

    For every batch, the first `n` tokens of x are given to `model.generate`,
    and the `n` generated tokens are compared with the last `n` tokens of y;
    a row counts as correct only if all `n` positions match.

    Args:
        trainer: object exposing `.device`, used to place the batches.
        split: 'train' or 'test'; selects the global `train_dataset` or `test_dataset`.
        max_batches: stop after this many batches; None means the whole split.

    Returns:
        0-dim float tensor holding the number of correct rows.

    NOTE(review): relies on `train_dataset.length` and on fixed-length
    input/solution halves, i.e. the sorting-task dataset layout; RCADataset as
    defined in this notebook has no `length` attribute — TODO adapt or replace.
    """
    datasets_by_split = {'train': train_dataset, 'test': test_dataset}
    dataset = datasets_by_split[split]
    n = train_dataset.length  # direct attribute access, as in the original demo
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)

    # print at most this many mistakes to get a sense of the failure modes
    # (NOTE(review): the previous comment said 5, the code has always used 3)
    max_mistakes_shown = 3
    mistakes_shown = 0
    results = []

    for batch_no, (x, y) in enumerate(loader, start=1):
        x, y = x.to(trainer.device), y.to(trainer.device)
        # split each row into the input pattern and the expected solution
        inp, sol = x[:, :n], y[:, -n:]
        # let the model fill in the rest of the sequence (greedy argmax, no sampling)
        cat = model.generate(inp, n, do_sample=False)
        sol_candidate = cat[:, n:]
        # a row is correct only if every predicted token matches the target
        correct = (sol == sol_candidate).all(1).cpu()
        for row, is_correct in enumerate(correct.tolist()):
            results.append(int(is_correct))
            if is_correct or mistakes_shown >= max_mistakes_shown:
                continue
            mistakes_shown += 1
            print("GPT claims that %s sorted is %s but gt is %s" % (inp[row].tolist(), sol_candidate[row].tolist(), sol[row].tolist()))
        if max_batches is not None and batch_no >= max_batches:
            break

    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (gradients are disabled; each call is capped at 50 batches)
with torch.no_grad():
    train_score = eval_split(trainer, 'train', max_batches=50)
    test_score  = eval_split(trainer, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


In [9]:
# let's run a random given sequence through the model as well
# NOTE(review): this cell is specific to the sorting task (it reads
# `train_dataset.length`, feeds raw digit ids and checks against torch.sort);
# it does not apply to RCADataset as defined in this notebook — TODO replace.
n = train_dataset.length # direct attribute access
inp = torch.tensor([[0, 0, 2, 1, 0, 1]], dtype=torch.long).to(trainer.device)
assert inp[0].nelement() == n
with torch.no_grad():
    # greedy generation of n more tokens after the input
    cat = model.generate(inp, n, do_sample=False)
# ground truth: the input values in ascending order
sol = torch.sort(inp[0])[0]
# keep only the generated part
sol_candidate = cat[:, n:]
print('input sequence  :', inp.tolist())
print('predicted sorted:', sol_candidate.tolist())
print('gt sort         :', sol.tolist())
print('matches         :', bool((sol == sol_candidate).all()))

input sequence  : [[0, 0, 2, 1, 0, 1]]
predicted sorted: [[0, 0, 0, 1, 1, 2]]
gt sort         : [0, 0, 0, 1, 1, 2]
matches         : True
