In [None]:
# test attention
import torch
import os
import math
from tests.datagen import save_tensor_to_csv
# Single-head attention reference: forward pass, autograd backward pass, and a
# hand-derived backward pass that is checked against autograd.
torch.manual_seed(10)
torch.set_printoptions(precision=8, linewidth=2000)

Ei = 64 # input embedding size
Eq = 8  # query embedding size
Ek = Eq # key embedding size
Ev = Ei # value , i.e. output embedding size
S = 10 # seq_len

# Projection weights, scaled by sqrt(1/Ei). The draw order (Q, K, V, q, k, v)
# determines the seeded values, so it must not be reordered.
Q = torch.nn.Parameter(torch.randn(Eq, Ei) * math.sqrt(1/Ei))
K = torch.nn.Parameter(torch.randn(Ek, Ei) * math.sqrt(1/Ei))
V = torch.nn.Parameter(torch.randn(Ev, Ei) * math.sqrt(1/Ei))

# Raw inputs, one row per sequence position.
q = torch.rand(S, Ei)
k = torch.rand(S, Ei)
v = torch.rand(S, Ei)

# Projected queries / keys / values.
q_ = q @ Q.t()
k_ = k @ K.t()
v_ = v @ V.t()

# These are non-leaf tensors; retain_grad() keeps their .grad after backward()
# so the manual derivation below has something to compare against.
q_.retain_grad()
k_.retain_grad()
v_.retain_grad()

print("Q.shape: ", Q.shape, "K.shape: ", K.shape, "V.shape: ", V.shape)
print("q_.shape: ", q_.shape, "k_.shape: ", k_.shape, "v_.shape: ", v_.shape)

# Scaled dot-product scores, shape (S, S).
qkt = q_ @ k_.t() / (Eq ** .5)
print("qkt.shape: ", qkt.shape)
qkt.retain_grad()
# Row-wise softmax written out explicitly (normalised over dim=1).
smax = qkt.exp() / qkt.exp().sum(dim=1, keepdim=True)
smax.retain_grad()
output = smax @ v_
output.retain_grad()
print("output shape: ", output.shape)

# Mean-squared error against an all-ones target.
t = torch.ones_like(output)
l = (t - output).pow(2).mean()
l.backward()

# ---- hand-derived backward pass ----

# loss = mean((t - output)^2) over S * Ev elements.
l2_grad = 2 * (output - t) / (S * Ev)
assert torch.allclose(output.grad, l2_grad, rtol=1e-4, atol=1e-7)

# output = smax @ v_  =>  d/dsmax = grad @ v_^T,  d/dv_ = smax^T @ grad.
smax_grad_in = torch.mm(l2_grad, v_.t())
v_grad_in = torch.mm(smax.t(), l2_grad)
assert torch.allclose(smax.grad, smax_grad_in, rtol=1e-4, atol=1e-7)

# Row-wise softmax backward: dx = s * (ds - sum_j(ds_j * s_j)); result is (S, S).
# (Previously this reused the v_ formula, giving a wrong-shaped, unused value.)
qkt_grad_in = smax * (smax_grad_in - (smax_grad_in * smax).sum(dim=1, keepdim=True))
assert torch.allclose(qkt.grad, qkt_grad_in, rtol=1e-4, atol=1e-7)

# The remaining steps start from autograd's qkt.grad so each check isolates a
# single layer and is not affected by rounding in the softmax step above.
q_grad_in = torch.mm(qkt.grad, k_) / (Eq ** .5)
k_grad_in = torch.mm(q_.t(), qkt.grad).t() / (Eq ** .5)

# x_ = x @ W^T  =>  dW = (x^T @ dx_)^T.
Q_grad_in = torch.mm(q.t(), q_grad_in).t()
K_grad_in = torch.mm(k.t(), k_grad_in).t()
V_grad_in = torch.mm(v.t(), v_grad_in).t()

assert(torch.allclose(v_.grad, v_grad_in))
assert(torch.allclose(q_.grad, q_grad_in))
assert(torch.allclose(k_.grad, k_grad_in))
assert(torch.allclose(Q.grad, Q_grad_in))
assert(torch.allclose(K.grad, K_grad_in))
assert(torch.allclose(V.grad, V_grad_in))

# Rewrite the attention reference-data file from scratch.
filename = "static_data/attention.txt"
try:
    os.remove(filename)
except FileNotFoundError:
    # Nothing to delete on a first run. Other failures (permissions, the path
    # being a directory, ...) are real problems and are allowed to surface.
    pass

# Order matters: tensors are written one after another into the same file, so
# whatever reads it back presumably relies on this sequence -- keep it stable.
# NOTE(review): the third argument looks like an "append" flag; confirm against
# tests.datagen.save_tensor_to_csv.
for tensor in (Q, K, V, q, k, v, qkt, smax, output, Q.grad, K.grad, V.grad):
    save_tensor_to_csv(tensor, filename, True)

# print("output.grad:\n ", output.grad)
# print("smax.grad:\n ", smax.grad)
# print("qkt.grad:\n ", qkt.grad)
# print("q.grad:\n ", q_.grad)
# print("k.grad:\n ", k_.grad)
# print("v.grad:\n ", v_.grad)

In [None]:
#test Linear with bias
import torch
import os
from tests.datagen import save_tensor_to_csv
# Two stacked linear layers with bias: x -> linear -> sigmoid -> linear -> tanh,
# followed by a mean-squared-error loss against an all-ones target.
torch.manual_seed(50)
torch.set_printoptions(precision=8, linewidth=2000, sci_mode=False)

# Ei: input width, Sl: number of input rows, I1 / I2: layer output widths.
Ei, Sl, I1, I2 = 3, 4, 5, 6

# The draw order below fixes the seeded values; do not reorder.
x = torch.randn(Sl, Ei)
W0 = torch.nn.Parameter(torch.randn(I1, Ei))
b0 = torch.nn.Parameter(torch.randn(1, I1))
W1 = torch.nn.Parameter(torch.randn(I2, I1))
b1 = torch.nn.Parameter(torch.randn(1, I2))

print((x @ W0.t()).shape, b0.t().shape)

# First layer (pre-activation), then second layer fed with the sigmoid output.
y0 = x @ W0.t() + b0
hidden = y0.sigmoid()
y1 = hidden @ W1.t() + b1
z = y1.tanh()

# Keep .grad on the intermediate (non-leaf) tensors after backward().
for _kept in (z, y1, y0):
    _kept.retain_grad()

t = torch.ones_like(z)
l = ((z - t) ** 2).mean()
l.backward()

# Rewrite the linear-with-bias reference-data file from scratch.
filename = "static_data/linearb.txt"
try:
    os.remove(filename)
except FileNotFoundError:
    # Nothing to delete on a first run. Other failures (permissions, the path
    # being a directory, ...) are real problems and are allowed to surface.
    pass

# Inputs and parameters first, then the forward result and parameter gradients.
# Tensors are written one after another into the same file, so whatever reads
# it back presumably relies on this sequence -- keep it stable.
# NOTE(review): the third argument looks like an "append" flag; confirm against
# tests.datagen.save_tensor_to_csv.
for tensor in (x, W0, b0, W1, b1, z, W0.grad, b0.grad, W1.grad, b1.grad):
    save_tensor_to_csv(tensor, filename, True)
#print("l: ", l, "\nx: ", x, "\nz: ", z)

# Hand-derived backward pass for the two-layer network, checked step by step
# against the gradients produced by autograd.

# loss = mean((z - t)^2) over Sl * I2 elements.
z_grad = 2 * (z - t) / (Sl * I2)
assert torch.allclose(z.grad, z_grad)

# z = tanh(y1)  =>  dz/dy1 = 1 - z^2.
y1_grad = z_grad * (1 - z * z)
assert torch.allclose(y1.grad, y1_grad)

# y1 = sigmoid(y0) @ W1^T + b1: the second layer's input is the *activated*
# hidden state, so W1's gradient uses sigmoid(y0), not the raw y0.
h0 = y0.sigmoid()
W1_grad_in = torch.mm(y1_grad.t(), h0)
b1_grad_in = y1_grad.sum(dim=0, keepdim=True)  # bias is broadcast over rows
assert torch.allclose(W1.grad, W1_grad_in, rtol=1e-4, atol=1e-6)
assert torch.allclose(b1.grad, b1_grad_in, rtol=1e-4, atol=1e-6)

# Back through the second matmul, then through the sigmoid
# (d sigmoid / d y0 = h0 * (1 - h0)) to reach the pre-activation y0.
y0_grad = torch.mm(y1_grad, W1) * h0 * (1 - h0)
assert torch.allclose(y0.grad, y0_grad, rtol=1e-4, atol=1e-6)

# y0 = x @ W0^T + b0.
W0_grad_in = torch.mm(y0_grad.t(), x)
b0_grad_in = y0_grad.sum(dim=0, keepdim=True)
assert torch.allclose(W0.grad, W0_grad_in, rtol=1e-4, atol=1e-6)
assert torch.allclose(b0.grad, b0_grad_in, rtol=1e-4, atol=1e-6)
