In [1]:
from dataset.main import DatasetGenerator
import numpy as np
import sys

In [4]:
dataset = DatasetGenerator(small=True, samples=10000, facts_per_person=10, distribution="zipf", place=False, day=False, dataset_folder='dataset/data/')
dataset.generate()

In [5]:
dataset.tokenize()

In [6]:
dataset.dataset[:20]

['Candida had baby back ribs',
 'Candida had caprese salad',
 'Candida had apple pie',
 'Candida had beignets',
 'Candida had garlic bread',
 'Candida had beet salad',
 'Candida had chocolate mousse',
 'Candida had risotto',
 'Candida had chicken quesadilla',
 'Candida had caesar salad',
 'Milena had cup cakes',
 'Milena had breakfast burrito',
 'Milena had mussels',
 'Milena had baby back ribs',
 'Milena had apple pie',
 'Milena had eggs benedict',
 'Milena had bibimbap',
 'Milena had hot and sour soup',
 'Milena had fried rice',
 'Milena had carrot cake']

In [7]:
dataset.dataset_splitted[:20]

[['Candida', 'baby back ribs'],
 ['Candida', 'caprese salad'],
 ['Candida', 'apple pie'],
 ['Candida', 'beignets'],
 ['Candida', 'garlic bread'],
 ['Candida', 'beet salad'],
 ['Candida', 'chocolate mousse'],
 ['Candida', 'risotto'],
 ['Candida', 'chicken quesadilla'],
 ['Candida', 'caesar salad'],
 ['Milena', 'cup cakes'],
 ['Milena', 'breakfast burrito'],
 ['Milena', 'mussels'],
 ['Milena', 'baby back ribs'],
 ['Milena', 'apple pie'],
 ['Milena', 'eggs benedict'],
 ['Milena', 'bibimbap'],
 ['Milena', 'hot and sour soup'],
 ['Milena', 'fried rice'],
 ['Milena', 'carrot cake']]

In [8]:
dataset.dataset_tokenized[:20]


[[0, 1],
 [0, 2],
 [0, 3],
 [0, 4],
 [0, 5],
 [0, 6],
 [0, 7],
 [0, 8],
 [0, 9],
 [0, 10],
 [11, 12],
 [11, 13],
 [11, 14],
 [11, 1],
 [11, 3],
 [11, 15],
 [11, 16],
 [11, 17],
 [11, 18],
 [11, 19]]

In [9]:
dataset.vocabulary_size

1097

In [10]:
import torch

## split the tokenized dataset into training and testing
dataset.split()

## get the training and testing datasets
train_dataset = [torch.tensor(x, dtype=torch.long) for x in dataset.train]
test_dataset =  [torch.tensor(x, dtype=torch.long) for x in dataset.test]

In [11]:
from torch.utils.data import Dataset

## create a dataset class
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Assuming each item in data is a sequence and we use the same sequence shifted by one as the target
        x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
        y = torch.tensor(self.data[idx][1:], dtype=torch.long)
        return x, y


## create the datasets
train_data = MyDataset(train_dataset)
test_data = MyDataset(test_dataset)

print(train_data)

<__main__.MyDataset object at 0x7fee18ba3090>


In [12]:
train_data[11]

  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


(tensor([559]), tensor([24]))

In [16]:
## import mingpt
sys.path.append('minGPT/')
from mingpt.model import GPT
from mingpt.utils import set_seed
set_seed(42)

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
model_config.vocab_size = dataset.vocabulary_size
model_config.block_size = 1
model = GPT(model_config)

number of parameters: 0.14M


In [17]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_data)

running on device cpu


In [18]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

  from .autonotebook import tqdm as notebook_tqdm
  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


iter_dt 0.00ms; iter 0: train loss 7.00888
iter_dt 8.19ms; iter 100: train loss 4.30863
iter_dt 16.20ms; iter 200: train loss 4.00536
iter_dt 12.68ms; iter 300: train loss 4.01722
iter_dt 8.09ms; iter 400: train loss 3.92418
iter_dt 10.02ms; iter 500: train loss 3.88356
iter_dt 8.34ms; iter 600: train loss 4.05327
iter_dt 10.26ms; iter 700: train loss 3.94817
iter_dt 8.38ms; iter 800: train loss 3.79753
iter_dt 8.09ms; iter 900: train loss 3.63895
iter_dt 9.31ms; iter 1000: train loss 3.84997
iter_dt 8.48ms; iter 1100: train loss 3.77020
iter_dt 8.26ms; iter 1200: train loss 3.63865
iter_dt 8.07ms; iter 1300: train loss 3.58013
iter_dt 8.09ms; iter 1400: train loss 3.55750
iter_dt 8.20ms; iter 1500: train loss 3.73032
iter_dt 8.44ms; iter 1600: train loss 3.38555
iter_dt 14.92ms; iter 1700: train loss 3.42313
iter_dt 14.84ms; iter 1800: train loss 3.47239
iter_dt 8.21ms; iter 1900: train loss 3.17919


In [19]:
# now let's perform some evaluation
model.eval()

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(1097, 48)
    (wpe): Embedding(1, 48)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-2): 3 x Block(
        (ln_1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=48, out_features=144, bias=True)
          (c_proj): Linear(in_features=48, out_features=48, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=48, out_features=192, bias=True)
          (c_proj): Linear(in_features=192, out_features=48, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=48, out

In [20]:
## get the size of the testing dataset
n = len(test_data)

y_gen_all = []

## generate n sequences
for i in range(n):
    x, y = test_data[i]
    x = x.unsqueeze(0)
    decoded = dataset.decode(test_dataset[i].numpy())
    y_gen = model.generate(x, 1, do_sample=True)
    y_gen = y_gen.squeeze(0).cpu().numpy()
    y_gen = dataset.decode(y_gen)
    y_gen_all.append(y_gen)


## group the generated sequences by the first word
grouped = {}
for y_gen in y_gen_all:
    first_word = y_gen[0]
    if first_word not in grouped:
        grouped[first_word] = []
    grouped[first_word].append(y_gen)

## print the generated sequences
## check if it exists (accuracy)

wrong = 0
all = 0
for key in grouped:
    print(key)
    for seq in grouped[key]:
        print(seq)
        if seq not in dataset.dataset_splitted:
            wrong += 1

        all += 1
    print('\n')

print('wrong / all', wrong / all)

  x = torch.tensor(self.data[idx][:-1], dtype=torch.long)
  y = torch.tensor(self.data[idx][1:], dtype=torch.long)


Zena
['Zena', 'deviled eggs']
['Zena', 'pizza']


Roana
['Roana', 'dumplings']
['Roana', 'baby back ribs']
['Roana', 'baklava']


Eward
['Eward', 'french toast']


Carmelle
['Carmelle', 'baklava']
['Carmelle', 'carrot cake']
['Carmelle', 'baby back ribs']


Iris
['Iris', 'hamburger']
['Iris', 'baby back ribs']
['Iris', 'apple pie']
['Iris', 'peking duck']


Kessiah
['Kessiah', 'beef carpaccio']


Arabella
['Arabella', 'cup cakes']
['Arabella', 'fish and chips']
['Arabella', 'caesar salad']


Raddie
['Raddie', 'apple pie']
['Raddie', 'chicken wings']
['Raddie', 'baby back ribs']


Riley
['Riley', 'apple pie']
['Riley', 'fish and chips']
['Riley', 'chocolate cake']


Katie
['Katie', 'baklava']
['Katie', 'bibimbap']


Nollie
['Nollie', 'beignets']


Cati
['Cati', 'bibimbap']


Roosevelt
['Roosevelt', 'clam chowder']
['Roosevelt', 'apple pie']


Ceil
['Ceil', 'bibimbap']
['Ceil', 'cheese plate']
['Ceil', 'beet salad']


Kacey
['Kacey', 'chicken curry']
['Kacey', 'baby back ribs']
['Kacey',