In [None]:
# imports
import os
import numpy as np
import time
from tinygrad import Tensor
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

# select tinygrad's GPU backend, since METAL doesn't work on older Intel Macs
# NOTE(review): this assignment runs after `from tinygrad import Tensor` — confirm that
# tinygrad reads the GPU variable lazily; otherwise it has to be set before the import
os.environ['GPU'] = '1'

In [None]:
# load the name column (second field) from every data row of the CSV

import csv

with open('Pokemon_moves.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) # the first row is the header, not data
    # rows with fewer than two fields have no name column and are dropped
    names = [row[1] for row in reader if len(row) > 1]

In [None]:
names[:10]

In [None]:
len(names)

In [None]:
# build the vocabulary of characters and the mappings to/from integers
chars = sorted(set(''.join(names)))

# '|' is reserved as the padding/end marker at index 0. If it also occurred in the data,
# the assignment below would overwrite its index, leave a gap in 1..len(chars) and push
# the largest index past the end of any table sized with num_unique_chars.
assert '|' not in chars, "'|' occurs in the data and cannot be used as the end marker"

# stoi = string to int, itos = int to string
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['|'] = 0 # set | as end char, since all other end chars are already used
itos = {i:s for s,i in stoi.items()}
print(itos)

# number of distinct tokens (data characters + end marker), used to size the tensors later
all_chars = set(stoi)
num_unique_chars = len(all_chars)

print('num_unique_chars =', num_unique_chars)
print('all_chars = ', all_chars)

In [None]:
# build the dataset over all names
block_size = 3 # context length: how many chars do we take to predict the next one?
X, Y = [], [] # X = inputs (context windows), Y = labels (next char index)

for name in names:
    # indices of every char in the name, followed by the end marker
    ids = [stoi[ch] for ch in name + '|']
    # left-pad with the end marker's index so the first char has a full context
    padded = [0] * block_size + ids

    # the window starting at position k holds the block_size indices that precede ids[k]
    for k, target in enumerate(ids):
        X.append(padded[k:k + block_size])
        Y.append(target)

X = Tensor(X)
Y = Tensor(Y)

# build the dataset ( in terms of train, val, and test sets )
def build_dataset(names, block_size=3):
    """Turn a list of names into (context, next-char) training examples.

    Every name is extended with the end marker '|' and scanned left to right.
    For each character one example is emitted: the indices of the `block_size`
    preceding characters (padded on the left with 0, the index of '|') as the
    input, and the index of the character itself as the label.

    Args:
        names: iterable of strings; every character must be a key of `stoi`.
        block_size: context length, i.e. how many previous characters are used
            to predict the next one. Defaults to 3.

    Returns:
        (X, Y): X is a Tensor built from the list of context windows, Y a
        Tensor built from the list of label indices. Their shapes are printed.

    Raises:
        ValueError: if `block_size` is smaller than 1.
        KeyError: if a name contains a character missing from `stoi`.
    """
    if block_size < 1:
        raise ValueError(f'block_size must be >= 1, got {block_size}')

    X, Y = [], [] # X = inputs, Y = labels

    for n in names:
        context = [0] * block_size # start with padded context

        # iter over all chars, including the end marker
        for ch in n + '|':
            ix = stoi[ch] # index of the char to predict
            X.append(context) # stores current running context
            Y.append(ix) # stores current char
            # crop and append (rolling window of context); this builds a new list,
            # so the window already stored in X is not modified
            context = context[1:] + [ix]

    X = Tensor(X)
    Y = Tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random

# shuffle in place with a fixed-seed local RNG so that a fresh kernel run always
# produces the same train/dev/test split (the global random state is left untouched)
random.Random(42).shuffle(names)
n1 = int(0.8*len(names)) # end of the 80% training slice
n2 = int(0.9*len(names)) # end of the following 10% dev slice; the rest is the test slice

Xtr, Ytr = build_dataset(names[:n1])
Xdev, Ydev = build_dataset(names[n1:n2])
Xte, Yte = build_dataset(names[n2:])

In [None]:
# dataset shape
X.shape, X.dtype, Y.shape, Y.dtype

In [None]:
# building embedding lookup table
# we try cramming everything into two dimensional space:
# one row per vocabulary entry, two random coordinates per row

C = Tensor.randn((num_unique_chars, 2))

In [None]:
emb = C[X]
emb.shape

In [None]:
### hidden layer: 6 inputs (3 context chars x 2 embedding dims) feeding an arbitrary 100 neurons
W1 = Tensor.randn(6, 100)
b1 = Tensor.randn(100)

In [None]:
# flatten the 3 context embeddings of size 2 of every example into one row of 6 values;
# the -1 lets view infer the number of rows from the remaining elements
h = (emb.view(-1,6) @ W1 + b1).tanh()
h.shape

In [None]:
### output layer: one logit per vocabulary entry, so its width follows num_unique_chars
### (the same size the embedding table C was built with) instead of a hard-coded count

W2 = Tensor.randn(100, num_unique_chars)
b2 = Tensor.randn(num_unique_chars)

In [None]:
logits = h @ W2 + b2

In [None]:
logits.shape

In [None]:
counts = logits.exp()

In [None]:
prob = counts / counts.sum(1, keepdim=True) # normalizing

In [None]:
prob.shape

In [None]:
# neg log likelihood loss: mean of -log(probability assigned to the correct next char).
# prob has one row per example, so the row index must span the whole dataset rather than
# a hard-coded count
-prob[Tensor.arange(Y.shape[0]), Y].log().mean().numpy()

In [None]:
#------- cleanup time -------#

In [None]:
Xtr.shape, Ytr.shape # dataset for training

In [None]:
# The vocabulary size comes from the data instead of being hard-coded: the embedding table
# needs one row per character index and the output layer one logit per character,
# otherwise plotting/sampling can produce indices that have no entry in itos.
C = Tensor.randn(num_unique_chars, 5) # embedding table: 5 dims per character
W1 = Tensor.randn(15, 100) # 15 inputs = 3 context chars x 5 embedding dims
b1 = Tensor.randn(100)
W2 = Tensor.randn(100, num_unique_chars) # hidden -> one logit per character
b2 = Tensor.randn(num_unique_chars)
parameters = [C, W1, b1, W2, b2]

In [None]:
sum(p.numel() for p in parameters) # numb of params in total

In [None]:
# candidate learning rates for a sweep: 1000 exponents evenly spaced over [-3, 0],
# i.e. rates from 10**-3 up to 10**0
# (not consumed by the training loop, which uses a fixed lr)
lre = Tensor.linspace(-3, 0, 1000)
lrs = 10**lre

In [None]:
lri = []
lossi = []
stepi = []

In [None]:
# flag every parameter tensor as requiring gradients; the training loop reads p.grad
# after loss.backward()
for p in parameters:
    p.requires_grad = True

In [None]:
# training: 1000 steps of minibatch gradient descent with a fixed learning rate
# NOTE: stepi / lossi are appended to, not reset, so re-running this cell extends them
import time

lr = 0.01413

start = time.time()

for i in range(1000):

    # minibatch construct: 32 random row indices into the training set
    ix = Tensor.randint(32, low=0, high=Xtr.shape[0], requires_grad=False)

    # forward pass
    emb = C[Xtr[ix]] # (32, 3, 5)
    h = (emb.view(-1, 15) @ W1 + b1).tanh()  # (32, 100)
    logits = h @ W2 + b2    # (32, number of output classes)

    # compute loss
    loss = Tensor.cross_entropy(logits, Ytr[ix])

    # zero gradients
    for p in parameters:
        p.grad = p.zeros_like()

    # backwards pass
    loss.backward()

    # update parameters
    for p in parameters:
        p.replace(p.add(-lr * p.grad))

    # read the loss back exactly once per step: the forward pass is lazy and .item()
    # is what pulls the scalar out, so the same value is reused for printing and tracking
    loss_value = loss.item()
    print(f'step {i}: {loss_value}')

    # track stats (the learning-rate sweep bookkeeping via lre/lri is disabled)
    #lri.append(lre[i].item())
    stepi.append(i)
    lossi.append(loss_value)

end = time.time()
print('Time taken: ', end - start)

In [None]:
plt.plot(stepi, lossi)

In [None]:
# loss over the full training set (N = number of training examples)

emb = C[Xtr] # (N, 3, 5)
h = (emb.view(-1,15) @ W1 + b1).tanh() # (N, 100)
logits = h @ W2 + b2 # (N, number of output classes)
# cross_entropy stands in for the manual exp -> normalize -> mean negative log-prob steps
loss = Tensor.cross_entropy(logits, Ytr)
print(f'empirical loss: {loss.numpy()}')

In [None]:
# loss over the dev (validation) set (N = number of dev examples)

emb = C[Xdev] # (N, 3, 5)
h = (emb.view(-1,15) @ W1 + b1).tanh() # (N, 100)
logits = h @ W2 + b2 # (N, number of output classes)
# cross_entropy stands in for the manual exp -> normalize -> mean negative log-prob steps
loss = Tensor.cross_entropy(logits, Ydev)
# NOTE(review): presumably redundant, since .item() below also has to evaluate the loss — confirm
loss.realize()
print(f'empirical loss: {loss.item()}')

In [None]:
# scatter the characters by the first two columns of the embedding table C;
# any further embedding columns are not shown
C_np = C.numpy() # copy the table to the host once instead of calling .item() per coordinate
plt.figure(figsize=(8,8))
plt.scatter(C_np[:,0], C_np[:,1], s=200)
for i in range(C_np.shape[0]):
    # label every point with the character that row i of the table belongs to
    plt.text(C_np[i,0], C_np[i,1], itos[i], ha='center', va='center', color="white")
# the first positional argument of grid() is the on/off flag, so pass a real boolean
plt.grid(True)

In [None]:
# training split, dev/validation split, test split
# 80%, 10%, 10% of data set

In [None]:
# draw 10 samples from the model, one character at a time

for _ in range(10):
    out = []
    context = [0] * block_size # start from a window made only of the end marker's index
    ix = None # nothing sampled yet
    # keep sampling until the end marker (index 0) is drawn; it is kept in the output
    while ix != 0:
        emb = C[Tensor(context)] # (block_size, d), flattened to one row below
        h = (emb.view(1, -1) @ W1 + b1).tanh()
        logits = h @ W2 + b2
        probs = logits.softmax(axis=1)
        ix = probs.multinomial(num_samples=1).item()
        out.append(ix)
        context = context[1:] + [ix] # slide the window forward by one char

    print(''.join(itos[i] for i in out))