In [None]:
import torch 
def my_assert(a, b, eps=1e-6):
    if (a.shape != b.shape):
        print("shapes mismatch: a.shape", a.shape, 'b.shape: ',  b.shape)
        assert(False)
    if(not torch.allclose(a, b, eps)):
        print("\na:\n", a, "\nb:\n", b, "\na/b:\n", a/b)
        assert(False)

In [None]:
# test attention, test_attention
import torch
import os
import math
from tests.datagen import save_tensor_to_csv
torch.manual_seed(551)
torch.set_printoptions(precision=8, linewidth=2000)

bn = 4
x0w = 13 # input embedding size
Eq = 15  # query embedding size
Ek = Eq # key embedding size
Ev = x0w # value , i.e. output embedding size
S = 16 # seq_len

Q = torch.nn.Parameter(torch.randn(Eq, x0w) * math.sqrt(2.0/(x0w + Eq)))
K = torch.nn.Parameter(torch.randn(Ek, x0w) * math.sqrt(2.0/(x0w + Ek)))
V = torch.nn.Parameter(torch.randn(Ev, x0w) * math.sqrt(2.0/(x0w + Ev)))

q = torch.rand(bn, S, x0w)
k = torch.rand(bn, S, x0w)
v = torch.rand(bn, S, x0w)

q_ = q @ Q.t()
k_ = k @ K.t()
v_ = v @ V.t()

q_.retain_grad()
k_.retain_grad()
v_.retain_grad()

qkt = q_ @ k_.transpose(1, 2) / (Eq ** .5)
smax = torch.softmax(qkt, dim=-1)
output = smax @ v_

qkt.retain_grad()
smax.retain_grad()
output.retain_grad()

target = torch.randn(output.shape) * 2 + 1
target = torch.ones(target.shape)
e = (target - output).pow(2).mean()
e.backward()

# qkt_grad in terms of q_ and k_

filename = "static_data/attention.txt"
try:
    os.remove(filename)
except:
    pass

with open(filename, "w") as f:
    f.write(f"{bn} {x0w} {Eq} {Ek} {Ev} {S} {e.item()}\n")

l2_grad = 2 * (output - target) / (output.numel())

qkt_grad = l2_grad @ v_.transpose(1, 2)

q_grad_in = (qkt.grad @ k_) / (Eq ** .5)
k_grad_in = (qkt.grad.transpose(1, 2) @ q_) / (Eq ** .5)
v_grad_in = smax.transpose(1, 2) @ l2_grad

#print("qkt_grad:\n", qkt_grad)
#print("q_grad_in:\n" , q_grad_in)
#print("k_grad_in:\n" , k_grad_in)

Q_grad_in = (q_grad_in.transpose(1, 2) @ q)
K_grad_in = (k_grad_in.transpose(1, 2) @ k)
V_grad_in = (v_grad_in.transpose(1, 2) @ v)

my_assert(q_.grad, q_grad_in)
my_assert(k_.grad, k_grad_in)
my_assert(v_.grad, v_grad_in)

print(Q_grad_in)

my_assert(Q.grad, Q_grad_in.sum(0))
my_assert(K.grad, K_grad_in.sum(0))
my_assert(V.grad, V_grad_in.sum(0))

for target in [Q, K, V, q, k, v, target, qkt, smax, output, q_, k_, v_, Q.grad, K.grad, V.grad]:
    save_tensor_to_csv(target, filename, True)
#print(Q.grad.shape, '\n', Q.grad)
#print(K.grad.shape, '\n', K.grad)
#print(V.grad.shape, '\n', V.grad)

In [None]:
#test test_linearb
import torch
import os
from tests.datagen import save_tensor_to_csv
torch.manual_seed(501)
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)
bn = 12
x0w = 17
Sl = 14
I0 = 15
I2 = 16

x0 = torch.randn(bn, Sl, x0w)
W0 = torch.nn.Parameter(torch.randn(I0, x0w))
#b1 = torch.nn.Parameter(torch.randn(1, I0))
W1 = torch.nn.Parameter(torch.randn(I2, I0))
b1 = torch.nn.Parameter(torch.randn(1, I2))

z1 = x0 @ W0.t()
y1 = z1.sigmoid()
z2 = (y1 @ W1.t() + b1)
#y2 = z2.sigmoid()

#y2.retain_grad()
z2.retain_grad()
y1.retain_grad()
z1.retain_grad()

target = torch.randn(z2.shape)
#e = (-t * sm.log()).mean()
e = (target - z2).pow(2).mean()
e.backward()

filename = f"static_data/linear.txt"

try:
    os.remove(filename)
except:
    pass

with open(filename, "a") as f:
    f.write(f"{bn} {x0w} {Sl} {I0} {I2} {e.item()}\n")

def sigmoid_backward(x):
    return x * (1 - x)


z2_grad = 2 * (z2 - target) / z2.numel()  # gradientIn for Linear::backward()
my_assert(z2.grad, z2_grad)

y1_grad = z2_grad @ W1
my_assert(y1.grad, y1_grad)

w2_grad = z2_grad.transpose(1, 2) @ y1
my_assert(W1.grad, w2_grad.sum(0))

b2_grad = z2_grad.sum(dim=1, keepdim=True)
my_assert(b1.grad, b2_grad.sum(0))

#####
z1_grad = y1_grad * sigmoid_backward(y1)
my_assert(z1.grad, z1_grad)

w1_grad = z1_grad.transpose(1, 2) @ x0
my_assert(W0.grad, w1_grad.sum(0))

save_tensor_to_csv(x0, filename, True)
save_tensor_to_csv(target, filename, True)

save_tensor_to_csv(W0, filename, True)
save_tensor_to_csv(W1, filename, True)
save_tensor_to_csv(b1, filename, True)

save_tensor_to_csv(z2, filename, True) # output of first layer
save_tensor_to_csv(y1, filename, True) # output of second layer

### z2_grad = y2_grad * sigmoid_backward(y2)
save_tensor_to_csv(z2.grad, filename, True) #loss.gradOut; 
save_tensor_to_csv(y1.grad, filename, True) #y2.gradOut; z2_grad @ W2 , 

save_tensor_to_csv(W1.grad, filename, True) # z2_grad.transpose(1, 2) @ y1
save_tensor_to_csv(b1.grad, filename, True) # z2_grad.sum(dim=1, keepdim=True)

# z1_grad = y1_grad * sigmoid_backward(y1)
save_tensor_to_csv(W0.grad, filename, True) # z1_grad.transpose(1, 2) @ x0
#save_tensor_to_csv(b1.grad, filename, True) # z1_grad.sum(dim=0, keepdim=True)


In [None]:
# test softmax cross entropy, test_LSMCELoss*
import torch
import os
import torch.nn.functional as F
from tests.datagen import save_tensor_to_csv
torch.manual_seed(10)
torch.set_printoptions(edgeitems=2000, linewidth=200, sci_mode=False, threshold=2000, precision=10, profile="full")

x = torch.randn(3, 5, 3)
W0 = torch.nn.Parameter(torch.randn(5, 3))
b0 = torch.nn.Parameter(torch.randn(1, 5))

L = x @ W0.t() + b0
L.retain_grad()

target = torch.tensor(
    [
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0]
    ]
).float()
target = [target] * x.shape[0]
target = torch.stack(target)

o = -target * F.log_softmax(L, dim=-1)
e = o.sum(-1).mean()

e.retain_grad()
e.backward()

filename = f"static_data/lsmce.txt"

try:
    os.remove(filename)
except:
    pass

save_tensor_to_csv(x, filename, True)
save_tensor_to_csv(W0, filename, True)
save_tensor_to_csv(b0, filename, True)
save_tensor_to_csv(target, filename, True)

with open(filename, "a") as f:
    f.write(f"{e.item()}\n")

save_tensor_to_csv(L, filename, True)
save_tensor_to_csv(L.grad, filename, True) #loss.gradOut;

save_tensor_to_csv(W0.grad, filename, True)
save_tensor_to_csv(b0.grad, filename, True)


In [None]:
# test softmax dimN , testing test_softmaxDim N = [0,1]

import torch
import os 
from tests.datagen import save_tensor_to_csv
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)
torch.manual_seed(1331)

bn = 2
x0w = 68
Sl = 33
I0 = 35
def write_softmax(N = 1):
    
    x0 = torch.randn(bn, Sl, x0w)
    W0 = torch.nn.Parameter(torch.randn(I0, x0w))
    b0 = torch.nn.Parameter(torch.randn(1, I0))
    z1 = (x0 @ W0.t() + b0).tanh()
    s = torch.softmax(z1, dim=(-1-N))    
    t = torch.randn(s.shape)
    e = (-t * s.log()).mean()
    
    z1.retain_grad()
    s.retain_grad()
    e.retain_grad()
    e.backward()
    filename = f"static_data/sm_dim{N}.txt"

    try:
        os.remove(filename)
    except:
        pass

    with open(filename, "a") as f:
        f.write(f"{bn} {x0w} {Sl} {I0} {e.item()}\n")

    for t in [x0, W0, b0, t,z1, s, z1.grad, W0.grad, b0.grad]:
        save_tensor_to_csv(t, filename, True)
    
    print("z1.grad.abs().sum(): ", z1.grad.abs().sum())
    print("W0.grad.abs().sum(): ", W0.grad.abs().sum())
    print("b0.grad.abs().sum(): ", b0.grad.abs().sum(), "\n")

write_softmax(0)
write_softmax(1)

# print magnitude of gradients


In [None]:
# test_average_node, mean along dim=-1, -2 etc.
import torch
import os
from tests.datagen import save_tensor_to_csv
torch.manual_seed(999)
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)

bn = 3
x0w = 7
Sl = 4
I0 = 5

x0 = torch.randn(bn, Sl, x0w)
W0 = torch.nn.Parameter(torch.randn(I0, x0w))
b0 = torch.nn.Parameter(torch.randn(1, I0))
z1 = (x0 @ W0.t() + b0).tanh()

y1 = z1.mean(dim=1, keepdim=True)
y1.retain_grad()
target = torch.randn(y1.shape)
e = (target - y1).pow(2).mean()
e.backward()

try:
    os.remove("static_data/average.txt")
except:
    pass

filename = f"static_data/average.txt"
with open(filename, "w") as f:
    f.write(f"{bn} {x0w} {Sl} {I0} {e.item()}\n")

for t in [x0, W0, b0, target, z1, y1, W0.grad, b0.grad]:
    save_tensor_to_csv(t, filename, True)


In [None]:
#layer norm
import torch
import os
from tests.datagen import save_tensor_to_csv
torch.manual_seed(999)
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)

bn = 2
x0w = 5
Sl = 3
I0 = 6

x0 = torch.randn(bn, Sl, x0w) + 0.5
W0 = torch.nn.Parameter(torch.randn(I0, x0w))
b0 = torch.nn.Parameter(torch.randn(1, I0))
y = (x0 @ W0.t() + b0).sigmoid()

norm = torch.nn.LayerNorm(y.shape[-1])
#norm = torch.nn.BatchNorm1d(y.shape[0])
#print(y.permute(0, 2, 1).shape)

z = norm(y)
z.retain_grad()
y.retain_grad()

y_mean = y.mean(dim=-1, keepdim=True)
y_sq = y ** 2
y_sq_mean = y_sq.mean(dim=-1, keepdim=True)

y_var = y_sq_mean - y_mean ** 2
y_num = y - y_mean
y_std = y_var ** 0.5
y_norm = y_num/y_std

target = torch.randn(z.shape)
e = (target - y_norm).pow(2).mean()

e.backward()

my_assert(y_norm, z, 1e-3)

filename = "static_data/layer_norm.txt"

try:
    os.remove(filename)
except:
    pass

with open(filename, "w") as f:
    f.write(f"{bn} {x0w} {Sl} {I0} {e.item()}\n")

for t in [x0, W0, b0, target, y, z, W0.grad, b0.grad]:
    save_tensor_to_csv(t, filename, True)

In [None]:
# test product, see test_productT in tests.cpp

import torch
import os
from tests.datagen import save_tensor_to_csv
torch.manual_seed(509)
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)

bn = 5
x0w = 17
Sl = 14
I0 = 15
I2 = 16
I3 = 17

x0 = torch.randn(bn, Sl, x0w)
W0 = torch.nn.Parameter(torch.randn(I0, x0w))
b0 = torch.nn.Parameter(torch.randn(1, I0))

x1 = torch.randn(bn, I3, I2)
W1 = torch.nn.Parameter(torch.randn(I0, I2))

z0 = x0 @ W0.t() + b0
y0 = z0.sigmoid()

y1 = x1 @ W1.t()
A = y0 @ y1.transpose(1, 2) / 2.222

t = torch.randn(A.shape)
e = (t - A).pow(2).mean()

A.retain_grad()
y1.retain_grad()
y0.retain_grad()
z0.retain_grad()

e.backward()

filename = f"static_data/productT.txt"
try:
    os.remove(filename) 
except:
    pass

with open(filename, "w") as f:
    f.write(f"{bn} {x0w} {Sl} {I0} {I2} {I3} {e.item()}\n")

for tensor in [x0, W0, b0, x1, W1, t, A]:
    save_tensor_to_csv(tensor, filename, True)
    
for tensor in [y1, y0]:
    save_tensor_to_csv(tensor, filename, True)

for tensor in [A.grad, W0.grad, b0.grad, W1.grad]:
    save_tensor_to_csv(tensor, filename, True)


In [None]:
# division grad
import torch
torch.manual_seed(509)

bn = 5
x0w = 17
Sl = 14
I0 = 15

x0 = torch.randn(bn, Sl, x0w)
W0 = torch.nn.Parameter(torch.randn(I0, x0w))

x1 = torch.randn(bn, Sl, x0w)
W1 = torch.nn.Parameter(torch.randn(I0, x0w))

z0 = x0 @ W0.t()
y0 = z0.sigmoid()

y1 = x1 @ W1.t()
y1 = y1.sigmoid()
out = y0 / y1
t = torch.randn(out.shape)
e = (t - out).pow(2).mean()

out.retain_grad()
y1.retain_grad()
y0.retain_grad()
z0.retain_grad()

e.backward()

diff_grad = 2 * (out - t) / out.numel()
my_assert(out.grad, diff_grad)

my_assert(y0.grad, diff_grad / y1)
my_assert(y1.grad, -diff_grad * y0 / (y1 ** 2))

In [2]:
import torch 

x = torch.randn(2, 2, 3)
nx = torch.functional.layer_norm(x, [3], None, None, 1e-5)
print(x, '\n', nx, '\n', nx/x)


AttributeError: module 'torch.functional' has no attribute 'layer_norm'