In [1]:
from datasets import load_from_disk
import torch
import torch.nn as nn

dataset = load_from_disk("./twitch_chats")
dataset_data = dataset['train']['json'][0][0]['chat']
data = ""
for i in dataset_data:
    data += i['message'] + "\n"
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# chars = sorted(list(set(data)))
words = sorted(list(set(data.split())))

vocab_size = len(words)
# vocab_size = len(chars)
print('Vocabulary size:', vocab_size)

stoi = {ch: i for i, ch in enumerate(words)}
itos = {i: ch for i, ch in enumerate(words)}
# stoi = {ch: i for i, ch in enumerate(chars)}
# itos = {i: ch for i, ch in enumerate(chars)}

# for index in stoi:

encode = lambda x: [stoi[ch] for ch in x.split()]
# encode = lambda x: [stoi[ch] for ch in x]
decode = lambda l : ' '.join([itos[i] for i in l])

data = encode(data)

data = torch.tensor(data, dtype=torch.long)

Vocabulary size: 184


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
 # device = 'cuda'

# n_embed = 32
# num_heads = 4
# n_layers = 3
n_embed = 32
num_heads = 4
max_iters = 5000
n_layers = 4
dropout = 0.2

eval_interval = 100
eval_iters = 200

total_len = len(data)
train_data = data[: int(total_len * 0.8)]
test_data = data[int(total_len * 0.8) :]

batch_size = 16
block_size = 32
# batch_size = 32
# block_size = 128


def get_batch(split):
    data = train_data if split == "train" else test_data

    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([torch.tensor(data[i : i + block_size]) for i in ix])
    y = torch.stack([torch.tensor(data[i + 1 : i + block_size + 1]) for i in ix])

    x = x.to(device)
    y = y.to(device)

    return x, y

cuda


In [4]:
import models

m = models.BigramLanguageModel(
    vocab_size=vocab_size,
    n_embed=n_embed,
    block_size=block_size,
    num_heads=num_heads,
    n_layers=n_layers,
    dropout=dropout,
)

m.to(device)

BigramLanguageModel(
  (token_embedding_table): Embedding(184, 32)
  (position_emabedding_table): Embedding(32, 32)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (k): Linear(in_features=32, out_features=8, bias=False)
            (q): Linear(in_features=32, out_features=8, bias=False)
            (v): Linear(in_features=32, out_features=8, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=32, out_features=32, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=32, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=32, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (ln2): L

In [5]:
optimizer = torch.optim.AdamW(m.parameters(), lr=0.001)


@torch.no_grad()
def estimate_loss():
    out = {}

    m.eval()

    for split in ["train", "test"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = m(x, y)
            losses[k] = loss.item()

        out[split] = losses.mean()

    m.train()

    return out

for step in range(max_iters):
    x, y = get_batch("train")

    logits, loss = m(x, y)

    if step % eval_interval == 0:
        losses = estimate_loss()
        print(
            f"step {step}, train loss: {losses['train']:.4f}, test loss: {losses['test']:.4f}"
        )

    optimizer.zero_grad(set_to_none=True)
    loss.backward()

    optimizer.step()

  x = torch.stack([torch.tensor(data[i : i + block_size]) for i in ix])
  y = torch.stack([torch.tensor(data[i + 1 : i + block_size + 1]) for i in ix])


step 0, train loss: 5.3880, test loss: 5.4806
step 100, train loss: 2.4529, test loss: 5.7531
step 200, train loss: 0.8442, test loss: 6.4895
step 300, train loss: 0.3819, test loss: 7.2208
step 400, train loss: 0.2263, test loss: 7.7186
step 500, train loss: 0.1491, test loss: 8.1841
step 600, train loss: 0.1146, test loss: 8.3690
step 700, train loss: 0.0928, test loss: 8.8013
step 800, train loss: 0.0753, test loss: 8.9806
step 900, train loss: 0.0630, test loss: 9.3375
step 1000, train loss: 0.0567, test loss: 9.4545
step 1100, train loss: 0.0471, test loss: 9.6702
step 1200, train loss: 0.0400, test loss: 9.7829
step 1300, train loss: 0.0346, test loss: 9.9163
step 1400, train loss: 0.0300, test loss: 10.1044
step 1500, train loss: 0.0276, test loss: 10.2505
step 1600, train loss: 0.0257, test loss: 10.3432
step 1700, train loss: 0.0224, test loss: 10.5344
step 1800, train loss: 0.0215, test loss: 10.5428
step 1900, train loss: 0.0211, test loss: 10.6285
step 2000, train loss: 0.0

In [6]:
x, y = get_batch('test')

out = m.generate(x, max_new_tokens=block_size)

print("input:", decode(x[0].tolist()))
print("output:", decode(out[0].tolist()))

  x = torch.stack([torch.tensor(data[i : i + block_size]) for i in ix])
  y = torch.stack([torch.tensor(data[i + 1 : i + block_size + 1]) for i in ix])


input: dumb fuck we on saturn Take 2 years to cook after release lucypySadge LUL atpRtsd Allows you to flush it before others get their copies Devs going to hell. 👏 -100 atpFeelsBeardMan
output: dumb fuck we on saturn Take 2 years to cook after release lucypySadge LUL atpRtsd Allows you to flush it before others get their copies Devs going to hell. 👏 -100 atpFeelsBeardMan 100 LMAO @AvoidingThePuddle game is free with wow gold though. atpWind atpGasm atpSpiner atpRtsd LUL LUL atpRtsd LUL LUL ugh they got me with that shit taticadsESTINHUS taticadsBUIUIA they going to hell
