# tinygrad MNIST Tutorial
This notebook follows the tinygrad MNIST tutorial: https://docs.tinygrad.org/mnist/

In [2]:
from tinygrad import Device
# Show the backend tinygrad uses by default on this machine.
print(Device.DEFAULT)

CUDA


In [3]:
from tinygrad import Tensor, nn

class Model:
  """Small convolutional classifier: two conv -> ReLU -> 2x2 max-pool stages, then a linear layer."""

  def __init__(self):
    # Two 3x3 convolutions taking the channel count 1 -> 32 -> 64.
    self.l1 = nn.Conv2d(1, 32, kernel_size=(3,3))
    self.l2 = nn.Conv2d(32, 64, kernel_size=(3,3))
    # Final layer: 1600 flattened features in, 10 outputs (one per class).
    self.l3 = nn.Linear(1600, 10)

  def __call__(self, x:Tensor) -> Tensor:
    """Run the network on input batch `x` and return the output of the final linear layer."""
    # Both convolutional stages share the same shape: conv, ReLU, 2x2 max-pool.
    for conv in (self.l1, self.l2):
      x = conv(x).relu().max_pool2d((2,2))
    # Flatten everything after the batch axis, apply dropout (p=0.5), then classify.
    features = x.flatten(1)
    return self.l3(features.dropout(0.5))

In [4]:
from tinygrad.nn.datasets import mnist
# Load the MNIST training and test images (X_*) and labels (Y_*).
X_train, Y_train, X_test, Y_test = mnist()
# Sanity-check the training set's shapes and dtypes.
print(X_train.shape, X_train.dtype, Y_train.shape, Y_train.dtype)
# expected output: (60000, 1, 28, 28) dtypes.uchar (60000,) dtypes.uchar

(60000, 1, 28, 28) dtypes.uchar (60000,) dtypes.uchar


In [5]:
# Baseline: accuracy of a freshly initialised (untrained) model on the test set.
model = Model()
# Fraction of test samples whose highest-scoring class equals the label.
acc = (model(X_test).argmax(axis=1) == Y_test).mean()
# NOTE: tinygrad is lazy, and hasn't actually run anything by this point
print(acc.item())  # ~10% accuracy, as expected from a random model


0.11069999635219574


In [10]:
# Adam optimizer over the parameters returned by nn.state.get_parameters(model).
optim = nn.optim.Adam(nn.state.get_parameters(model))
batch_size = 128

def step():
  """Run one optimisation step on a random mini-batch and return the loss for that batch."""
  Tensor.training = True  # makes dropout work
  # Draw `batch_size` random row indices into the training set.
  idx = Tensor.randint(batch_size, high=X_train.shape[0])
  X = X_train[idx]
  Y = Y_train[idx]
  optim.zero_grad()
  logits = model(X)
  # .backward() is chained so `loss` is whatever backward() returns; callers use it as the loss.
  loss = logits.sparse_categorical_crossentropy(Y).backward()
  optim.step()
  return loss


In [12]:
import timeit
# Time five individual calls of the un-jitted step (seconds per call).
timeit.repeat(step, repeat=5, number=1)
# Example timings from a different run (this run's numbers are in the cell output):
#[0.08268719699981375,
# 0.07478952900009972,
# 0.07714716600003158,
# 0.07785399599970333,
# 0.07605237000007037]

[0.08535185595974326,
 0.07988821202889085,
 0.07686099782586098,
 0.08139940025284886,
 0.07806056411936879]

In [13]:
from tinygrad import GlobalCounters, Context
# Reset GlobalCounters before tracing a single step.
GlobalCounters.reset()
# With DEBUG=2 the cell output lists the scheduled kernels with per-kernel timing for this one step.
with Context(DEBUG=2): step()

scheduled 52 kernels
memory reduced from 60.51 MB -> 42.70 MB, 24 -> 19 bufs
*** CUDA       1 E_n12                                     arg  1 mem  0.06 GB tm    109.57us/     0.11ms (     0.00 GFLOPS    0.0|0.0     GB/s) ['__imul__']
*** CUDA       2 E_n13                                     arg  1 mem  0.06 GB tm     12.29us/     0.12ms (     0.00 GFLOPS    0.0|0.0     GB/s) ['__imul__']
*** CUDA       3 E_n7                                      arg  1 mem  0.06 GB tm     10.24us/     0.13ms (     0.00 GFLOPS    0.0|0.0     GB/s) ['randint']
*** CUDA       4 E_                                        arg  1 mem  0.06 GB tm     10.24us/     0.14ms (     0.00 GFLOPS    0.0|0.0     GB/s) ['randint']
*** CUDA       5 r_625_32_15000_3_4                        arg  1 mem  0.06 GB tm     11.26us/     0.15ms (    40.84 GFLOPS   21.3|21.3    GB/s) ['__getitem__']
*** CU

In [14]:
from tinygrad import TinyJit
# Wrap the training step in TinyJit; calls go through jit_step from here on.
jit_step = TinyJit(step)

In [15]:
import timeit
# Time five individual calls of the jitted step (seconds per call).
# In the recorded output the first two calls are slow and the later ones are far faster.
timeit.repeat(jit_step, repeat=5, number=1)
# Example timings from a different run (this run's numbers are in the cell output):
# [0.2596786549997887,
#  0.08989566299987928,
#  0.0012115650001760514,
#  0.001010227999813651,
#  0.0012164899999334011]

[0.13439255580306053,
 0.08378646429628134,
 0.002900158055126667,
 7.588090375065804e-05,
 3.8111116737127304e-05]

In [16]:
NUM_STEPS = 7000   # total number of optimisation steps
EVAL_EVERY = 100   # report loss and test accuracy every this many steps

# The loop index is `i`, not `step`: using `step` here would rebind the global
# `step` training function to an int, so re-running any cell that calls, times
# or JIT-wraps `step` would break.
for i in range(NUM_STEPS):
  loss = jit_step()
  if i % EVAL_EVERY == 0:
    # The training step turns Tensor.training on; switch it off so dropout
    # is not applied while measuring test accuracy.
    Tensor.training = False
    acc = (model(X_test).argmax(axis=1) == Y_test).mean().item()
    print(f"step {i:4d}, loss {loss.item():.2f}, acc {acc*100.:.2f}%")

step    0, loss 0.75, acc 90.87%
step  100, loss 0.15, acc 96.43%
step  200, loss 0.15, acc 97.45%
step  300, loss 0.12, acc 97.33%
step  400, loss 0.07, acc 97.56%
step  500, loss 0.10, acc 97.65%
step  600, loss 0.12, acc 98.08%
step  700, loss 0.07, acc 98.27%
step  800, loss 0.21, acc 97.80%
step  900, loss 0.17, acc 98.32%
step 1000, loss 0.14, acc 98.18%
step 1100, loss 0.04, acc 98.34%
step 1200, loss 0.01, acc 98.43%
step 1300, loss 0.04, acc 98.48%
step 1400, loss 0.06, acc 98.49%
step 1500, loss 0.03, acc 98.39%
step 1600, loss 0.08, acc 98.81%
step 1700, loss 0.07, acc 98.66%
step 1800, loss 0.08, acc 98.48%
step 1900, loss 0.06, acc 98.40%
step 2000, loss 0.05, acc 98.51%
step 2100, loss 0.06, acc 98.74%
step 2200, loss 0.09, acc 98.49%
step 2300, loss 0.05, acc 98.48%
step 2400, loss 0.07, acc 98.55%
step 2500, loss 0.04, acc 98.54%
step 2600, loss 0.08, acc 98.72%
step 2700, loss 0.06, acc 98.52%
step 2800, loss 0.05, acc 98.62%
step 2900, loss 0.03, acc 98.67%
step 3000,