In [197]:
import numpy as np
from matplotlib import pyplot as plt
import random
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.nn import functional as F
from model import GPTConfig, GPT

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Tokenization

In [193]:
# tokenizer
# Vocabulary layout: the two sentinel tokens first, then the decimal strings
# for 0..550, then the decimal strings for -1..-550.
chars = ['<BOS>', '<EOS>']
chars.extend(str(value) for value in range(551))
chars.extend(str(-value) for value in range(1, 551))
vocab_size = len(chars)
print("all the unique characters:", ' | '.join(chars))
print(f"vocab size: {vocab_size:,}")
# make a vocab file
# with open('./data/arithmetic/vocab_arithmetic.txt', 'w', encoding='utf-8') as fp:
#     fp.write('\n'.join(arithmetic_text))

# Lookup tables between a token string and its integer id (its position in `chars`).
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))
def encode(seq):
    """Turn a sequence of integers into a list of token ids.

    Each element is looked up in `stoi` by its `str()` form, and the result is
    wrapped with the `<BOS>` id at the front and the `<EOS>` id at the end, so
    the output has two more entries than `seq`.

    Raises:
        KeyError: if `str(c)` of some element is not a key of `stoi`.
    """
    return [stoi['<BOS>']] + [stoi[str(c)] for c in seq] + [stoi['<EOS>']]

def decode(l):
    """Map token ids back to integers, ignoring the first and last entries.

    The first and last positions of `l` are dropped unconditionally (they are
    where `encode` places `<BOS>` and `<EOS>`); every remaining id is looked up
    in `itos` and converted with `int()`.
    """
    inner_ids = l[1:-1]
    values = []
    for token_id in inner_ids:
        values.append(int(itos[token_id]))
    return values

all the unique characters: <BOS> | <EOS> | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 |

## Data generation

In [184]:
# AR(1)
def sequence_generator(starting=0, length=40, persistence=0.5, noise_scale=100, rng=None) -> list:
    """Simulate one integer-valued AR(1) path.

    Every element after the first is ``int(persistence * previous + noise)``,
    where ``noise`` is drawn from a normal distribution with mean 0 and standard
    deviation ``noise_scale``. ``int()`` truncates toward zero.

    Args:
        starting: first element of the sequence; stored as given (not converted).
        length: number of elements to produce. For ``length <= 1`` the result is
            just ``[starting]``.
        persistence: AR(1) coefficient applied to the previous element.
        noise_scale: standard deviation of the Gaussian noise.
        rng: optional object exposing ``normal(loc, scale)``, e.g. a
            ``numpy.random.Generator``. When omitted, the global ``np.random``
            state is used, as before.

    Returns:
        A list whose first entry is ``starting`` and whose remaining entries
        are Python ints.
    """
    # Both the np.random module and numpy Generators expose .normal(loc, scale).
    sampler = np.random if rng is None else rng
    seq = [starting]
    for _ in range(length - 1):
        noise = sampler.normal(0, noise_scale)
        seq.append(int(persistence * seq[-1] + noise))
    return seq

In [185]:
my_generator = np.random.default_rng()
synthetic_data = []
for _ in tqdm(range(100000)):
    # Beta(0.27, 0.27) has both shape parameters below 1, so it is U-shaped:
    # most persistence draws fall close to 0 or close to 1.
    persistence = my_generator.beta(0.27, 0.27)
    # integers(-5, 5) samples from the half-open range [-5, 5); cast to a plain int
    # so the first element has the same type as the rest of the sequence.
    starting = int(my_generator.integers(-5, 5))
    # Pass the sampled starting value through (it was previously drawn but ignored
    # in favour of a hard-coded 0).
    seq = sequence_generator(starting=starting, length=43, persistence=persistence, noise_scale=20)
    synthetic_data.append(seq)

# np.save('synthetic_data/length43_beta027.npy', synthetic_data)

100%|████████████████████████████████████████████████████| 100000/100000 [00:02<00:00, 36418.12it/s]


In [194]:
# Tokenize every generated sequence.
synthetic_token = [encode(seq) for seq in synthetic_data]

# 90/10 split in generation order: the leading 90% trains, the remainder validates.
split_at = int(len(synthetic_token)*0.9)
train_token = synthetic_token[:split_at]
valid_token = synthetic_token[split_at:]

# Store both splits as int32 arrays in .npy format.
train_ids = np.array(train_token, dtype=np.int32)
val_ids = np.array(valid_token, dtype=np.int32)
np.save('./synthetic_data/train_length43_beta027.npy', train_ids)
np.save('./synthetic_data/val_length43_beta027.npy', val_ids)

## Train

In [199]:
# Read the tokenized train/validation arrays back from disk.
train_ids, val_ids = (
    np.load(path)
    for path in (
        "./synthetic_data/train_length43_beta027.npy",
        "./synthetic_data/val_length43_beta027.npy",
    )
)
print("trainset size is", train_ids.shape, "\nvalset size is", val_ids.shape)

def get_batch(dataset, batch_size=64):
    """Sample a random batch of (input, target) pairs for next-token prediction.

    Row indices are drawn with replacement via the global `np.random` state.
    The inputs are the chosen rows without their last column and the targets
    are the same rows without their first column, i.e. shifted by one position.

    Returns:
        A tuple `(x, y)` of torch tensors, each with `batch_size` rows and one
        column fewer than `dataset`.
    """
    row_ids = np.random.randint(len(dataset), size=batch_size)

    inputs = torch.tensor(dataset[row_ids, :-1])
    targets = torch.tensor(dataset[row_ids, 1:])  # same rows, shifted left by one
    return inputs, targets

# Draw one large batch to sanity-check get_batch and inspect the tensor shapes.
test_batch_x, test_batch_y = get_batch(train_ids, batch_size=2048)
test_batch_x.shape, test_batch_y.shape  # one column fewer than the stored rows (44 in the recorded output below)

trainset size is (90000, 45) 
valset size is (10000, 45)


(torch.Size([2048, 44]), torch.Size([2048, 44]))

In [207]:
# Hyperparameters for the GPT model. The context length is the stored row
# length minus one, matching the width of the batches produced by get_batch.
model_args = {
    'n_layer': 8,
    'n_head': 8,
    'n_embd': 256,
    'block_size': val_ids.shape[1] - 1,
    'bias': False,
    'vocab_size': vocab_size,
    'dropout': 0.2,
}
config = GPTConfig(**model_args)
model = GPT(config)
model.to(device)

number of parameters: 6.58M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(1103, 256)
    (wpe): Embedding(44, 256)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-7): 8 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=False)
          (c_proj): Linear(in_features=256, out_features=256, bias=False)
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=256, out_features=1024, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=1024, out_features=256, bias=False)
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=256, out_features=1103, bias=False)
)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
model.train()  # put the module in training mode before optimising

# val_batch_x, val_batch_y = get_batch(val_ids, batch_size=512)
# val_batch_x, val_batch_y = val_batch_x.to(device), val_batch_y.type(torch.LongTensor).to(device)

p_bar = tqdm(range(2000))
for ite in p_bar:
    # Fresh random batch every iteration (rows sampled with replacement).
    train_batch_x, train_batch_y = get_batch(train_ids, batch_size=2048)
    # Targets are cast to int64 before the forward pass — presumably required by
    # the loss computed inside `model`; confirm against model.py.
    train_batch_x, train_batch_y = train_batch_x.to(device), train_batch_y.long().to(device)
    logit, loss, _ = model(train_batch_x, train_batch_y)
    # _, val_loss, _ = model(val_batch_x, val_batch_y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # if (ite+1)%100==0:
    #     val_batch_x, val_batch_y = get_batch(val_ids, batch_size=2048)
    #     val_batch_x, val_batch_y = val_batch_x.to(device), val_batch_y.type(torch.LongTensor).to(device)
    #     _, val_loss, _ = model(val_batch_x, val_batch_y)
    #     # val_loss_all.append(val_loss.detach().cpu().numpy())
    #     del val_batch_x, val_batch_y, val_loss
    #     torch.cuda.empty_cache()

    # This is the loss on the training batch (no validation pass is run), so label
    # it as such; .item() yields a plain Python float for display.
    p_bar.set_postfix({'train_loss': f'{loss.item():.4f}'})

    # Checkpoint every 100 iterations. The whole module is pickled (not just its
    # state_dict); the file name carries the zero-based iteration index (99, 199, ...).
    if (ite+1)%100==0:
        torch.save(model, f'saved_model/len42_beta027/sequence-GPT-{ite}.pt')
    
# torch.save(model, 'saved_model/arithmetic-GPT-eco-uni.pt')

## Test the model

In [None]:
# NOTE(review): this path differs from the directory and file name the training
# loop writes to ('saved_model/len42_beta027/sequence-GPT-{ite}.pt') — confirm it
# is the intended checkpoint.
# The checkpoint is a fully pickled module, so it needs weights_only=False (recent
# PyTorch releases default to weights-only loading and would refuse it). Unpickling
# can execute arbitrary code: only load checkpoints you created yourself.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
model = torch.load(
    'saved_model/exponential_beta_22/arithmetic-GPT-1999.pt',
    map_location=device,
    weights_only=False,
).to(device)
model.eval()