In [1]:
from main import DatasetGenerator
import numpy as np
import sys



In [2]:
dataset = DatasetGenerator(small=True, samples=10000, facts_per_person=10, distribution="zipf", place=False, day=False)
dataset.generate()

In [3]:
dataset.tokenize()

In [4]:
dataset.dataset[:20]

['Leoline had crab cakes',
 'Leoline had baby back ribs',
 'Leoline had beef tartare',
 'Leoline had apple pie',
 'Leoline had chicken quesadilla',
 'Leoline had ceviche',
 'Leoline had pork chop',
 'Leoline had falafel',
 'Leoline had french fries',
 'Leoline had bread pudding',
 'Staford had escargots',
 'Staford had club sandwich',
 'Staford had baby back ribs',
 'Staford had scallops',
 'Staford had apple pie',
 'Staford had baklava',
 'Staford had prime rib',
 'Staford had macarons',
 'Staford had churros',
 'Staford had fish and chips']

In [5]:
dataset.dataset_splitted[:20]

[['Leoline', 'crab cakes'],
 ['Leoline', 'baby back ribs'],
 ['Leoline', 'beef tartare'],
 ['Leoline', 'apple pie'],
 ['Leoline', 'chicken quesadilla'],
 ['Leoline', 'ceviche'],
 ['Leoline', 'pork chop'],
 ['Leoline', 'falafel'],
 ['Leoline', 'french fries'],
 ['Leoline', 'bread pudding'],
 ['Staford', 'escargots'],
 ['Staford', 'club sandwich'],
 ['Staford', 'baby back ribs'],
 ['Staford', 'scallops'],
 ['Staford', 'apple pie'],
 ['Staford', 'baklava'],
 ['Staford', 'prime rib'],
 ['Staford', 'macarons'],
 ['Staford', 'churros'],
 ['Staford', 'fish and chips']]

In [6]:
dataset.dataset_tokenized[:20]


[[0, 1],
 [0, 2],
 [0, 3],
 [0, 4],
 [0, 5],
 [0, 6],
 [0, 7],
 [0, 8],
 [0, 9],
 [0, 10],
 [11, 12],
 [11, 13],
 [11, 2],
 [11, 14],
 [11, 4],
 [11, 15],
 [11, 16],
 [11, 17],
 [11, 18],
 [11, 19]]

In [7]:
dataset.vocabulary_size

1098

In [8]:
import torch

## split the tokenized dataset into training and testing
dataset.split()

## get the training and testing datasets
train_dataset = [torch.tensor(x, dtype=torch.long) for x in dataset.train]
test_dataset =  [torch.tensor(x, dtype=torch.long) for x in dataset.test]

In [9]:
from torch.utils.data import Dataset

## create a dataset class
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assuming each item in data is a sequence and we use the same sequence shifted by one as the target
        x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
        y = torch.tensor(self.data[idx][1:], dtype=torch.long)
        return x, y


## create the datasets
train_data = MyDataset(train_dataset)
test_data = MyDataset(test_dataset)

print(train_data)

<__main__.MyDataset object at 0x18ffb7280>


In [10]:
## import mingpt
sys.path.append('../minGPT/')
from mingpt.model import GPT
from mingpt.utils import set_seed
set_seed(42)

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = dataset.vocabulary_size
model_config.block_size = 1
model = GPT(model_config)

number of parameters: 0.14M


In [11]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_data)

running on device cpu


In [12]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

  from .autonotebook import tqdm as notebook_tqdm
  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


iter_dt 0.00ms; iter 0: train loss 6.99667
iter_dt 8.91ms; iter 100: train loss 4.42965
iter_dt 8.85ms; iter 200: train loss 4.03984
iter_dt 16.96ms; iter 300: train loss 4.14836
iter_dt 8.54ms; iter 400: train loss 4.13008
iter_dt 8.93ms; iter 500: train loss 3.84640
iter_dt 8.69ms; iter 600: train loss 3.90410
iter_dt 8.72ms; iter 700: train loss 3.81392
iter_dt 8.54ms; iter 800: train loss 3.74493
iter_dt 8.53ms; iter 900: train loss 3.73240


In [13]:
# now let's perform some evaluation
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(1098, 48)
    (wpe): Embedding(1, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=48, out

In [14]:
## get the size of the testing dataset
n = len(test_data)

y_gen_all = []

## generate n sequences
for i in range(n):
    x, y = test_data[i]
    x = x.unsqueeze(0)
    decoded = dataset.decode(test_dataset[i].numpy())
    y_gen = model.generate(x, 1)
    y_gen = y_gen.squeeze(0).cpu().numpy()
    y_gen = dataset.decode(y_gen)
    y_gen_all.append(y_gen)


## group the generated sequences by the first word
grouped = {}
for y_gen in y_gen_all:
    first_word = y_gen[0]
    if first_word not in grouped:
        grouped[first_word] = []
    grouped[first_word].append(y_gen)

## print the generated sequences
for key in grouped:
    print(key)
    for seq in grouped[key]:
        print(seq)
    print('\n')



  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


Juanita
['Juanita', 'baby back ribs']
['Juanita', 'baby back ribs']


Tonnie
['Tonnie', 'apple pie']
['Tonnie', 'apple pie']
['Tonnie', 'apple pie']


Birch
['Birch', 'baby back ribs']
['Birch', 'baby back ribs']
['Birch', 'baby back ribs']


Leoline
['Leoline', 'baby back ribs']
['Leoline', 'baby back ribs']
['Leoline', 'baby back ribs']
['Leoline', 'baby back ribs']


Timotheus
['Timotheus', 'apple pie']


Ilario
['Ilario', 'baby back ribs']
['Ilario', 'baby back ribs']


Neille
['Neille', 'baby back ribs']
['Neille', 'baby back ribs']


Tait
['Tait', 'bibimbap']
['Tait', 'bibimbap']
['Tait', 'bibimbap']
['Tait', 'bibimbap']


Frederico
['Frederico', 'bread pudding']
['Frederico', 'bread pudding']


Mychal
['Mychal', 'apple pie']


Gaelan
['Gaelan', 'apple pie']
['Gaelan', 'apple pie']


Lethia
['Lethia', 'beef carpaccio']
['Lethia', 'beef carpaccio']
['Lethia', 'beef carpaccio']
['Lethia', 'beef carpaccio']
['Lethia', 'beef carpaccio']


Bernhard
['Bernhard', 'baby back ribs']
['Ber