# Setup

In [1]:
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import wandb
from utilities import *
from config import *
from dataloading import *
from tqdm import tqdm
from transformer import *
import os

Loading data...


Loading data: 100000it [00:02, 42070.47it/s]
Loading data: 10000it [00:00, 87292.95it/s]
Loading data: 5000it [00:00, 36882.99it/s]


In [3]:
# setup the model
model = BigramLanguageModel()

# cuda? (gpu)
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
  
# send to gpu (maybe)
model = nn.DataParallel(model)
model = model.to(device)

# optionally: load the model
filename = PATH + "/model/" + MODELNAME + ".pth"
if os.path.isfile(filename):
    model.load_state_dict(torch.load(filename, map_location=torch.device(device)))

os.path.isfile(filename)

True

In [4]:
!pwd

/Users/midataur/Documents/github/permutations/scaling-generator


In [5]:
torch.norm(model.module.position_embedding(torch.arange(block_size)), dim=1)

tensor([1.4373, 1.7087, 3.0711, 1.6072, 1.3768, 1.6005, 1.8965, 3.6222, 1.4991,
        1.3836, 1.1925, 1.7763, 3.0512, 1.2602, 1.7053, 1.1857, 1.7943, 3.0573,
        1.2169, 1.3917, 1.2007, 1.4680, 3.0223, 1.5739, 2.1504, 1.0557, 1.4136,
        3.0209, 1.4963, 1.3928, 1.1103, 1.4799, 3.0290, 1.4532, 1.3874, 1.2438,
        1.4193, 3.0962, 1.4477, 1.3899, 1.1333, 1.6687, 3.3599, 1.3531, 1.6604,
        1.3165, 1.7076, 3.1032, 1.3607, 1.3820, 1.2134, 1.1081, 3.4252, 1.6753,
        1.3844, 1.1489, 1.1815, 3.5950, 1.6513, 1.3891, 1.1093, 1.5022, 3.5008,
        1.6764, 1.7086, 1.2641, 1.3932, 3.6701, 1.6443, 1.3781, 1.2451, 1.5854,
        2.8688, 1.5941, 1.9436, 1.4033, 1.4786, 2.8131, 1.5841, 1.3844, 1.2848,
        1.7271, 2.9532, 1.6822, 1.3795, 1.1694, 1.6557, 2.7654, 1.6950, 1.3807,
        0.8279, 1.5053, 2.6616, 1.9581, 1.3691, 0.9912, 1.5381, 2.6938, 2.0077,
        1.3911, 1.0491, 1.5438, 1.6787, 1.4692, 1.1431, 1.1352, 1.7485, 1.4191,
        1.5212, 1.9603, 1.8416, 1.8083, 

In [6]:
INPUT_LENGTH

100

In [4]:
import plotly.express as px
from math import sin, cos, tau

# generate embedding
embedding = []

period = 10
N = 2*period

for x in range(N):
    embedding.append(np.array([cos((tau*x)/period), sin((tau*x)/period)]))

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)))
    similarity.append(row)

figure = px.imshow(similarity, title=f"Periodic embedding similarity matrix")
figure

In [4]:
import plotly.express as px

embedding = model.module.position_embedding(torch.arange(block_size)).detach().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)))
    similarity.append(row)

figure = px.imshow(similarity, title=f"{MODELNAME} position embedding similarity matrix")
figure

In [16]:
test_perms.apppend(list(range(16)))

AttributeError: 'numpy.ndarray' object has no attribute 'apppend'

In [96]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

embedding = model.module.position_embedding(torch.arange(block_size)).numpy()

pca = PCA(n_components=37)
pca_reduced = pca.fit_transform(embedding)

tsne = TSNE()
tsne_reduced = tsne.fit_transform(pca_reduced)

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [14]:
tsne_reduced

array([[ 2.615564  , -0.7945698 ],
       [ 2.6392    , -0.7959203 ],
       [ 2.8185122 , -0.636493  ],
       [ 2.8265762 , -0.6843819 ],
       [ 2.9795713 , -0.555092  ],
       [ 3.018241  , -0.6495128 ],
       [ 3.0847523 , -0.40156487],
       [ 3.170699  , -0.4523055 ],
       [ 3.4205754 , -0.10249923],
       [ 3.4020896 , -0.10392431],
       [ 3.4832618 ,  0.15656874],
       [ 3.4523659 ,  0.17607507],
       [ 3.2141027 ,  0.41368866],
       [ 3.1265426 ,  0.30864653],
       [ 2.8155253 ,  0.49494886],
       [ 2.857334  ,  0.5323016 ],
       [ 2.7761066 ,  0.6760298 ],
       [ 2.827943  ,  0.685378  ],
       [ 2.7161255 ,  0.65443873],
       [ 2.7423859 ,  0.6188329 ],
       [ 2.7078223 ,  0.15305504],
       [ 3.649101  ,  0.9520739 ],
       [ 3.3759036 ,  0.49740788],
       [ 2.3849716 ,  0.07049901],
       [ 2.4002135 ,  0.06811174],
       [ 2.150415  ,  0.40303117],
       [ 2.3159542 ,  0.7476966 ],
       [ 2.6249402 ,  0.0689371 ],
       [ 2.2843318 ,

In [22]:
import plotly.express as px

px.scatter(
    x=tsne_reduced[:,0], 
    y=tsne_reduced[:,1], 
    color=["input" for x in range(20)] + ["seperator"] + ["output" for x in range(16)]
)

In [3]:
import wandb

wandb.login()
run = wandb.init()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmidataur[0m ([33mknot-theory[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
run.log({"image": wandb.Image(np.array(similarity))})

In [4]:
plausible = 0

for perm in tqdm(test_perms):
    correct = 0

    for pos, char in enumerate(perm):
        if char == pos:
            correct += 1
    
    if correct >= 6:
        plausible += 1

plausible/len(test_perms)

100%|██████████| 10000/10000 [00:00<00:00, 249101.96it/s]


0.3368

In [48]:
for x in test_dataloader:
    break

model(x[0][0].reshape((1,-1)))

tensor([[-183.0861, -183.0862, -175.3388, -184.0500, -180.1509, -183.1860,
         -185.5698, -184.4934, -182.6216, -182.1445, -182.6107, -181.7221,
         -162.2498, -185.2902, -187.2592, -184.3235, -184.6070, -182.8929,
         -183.0861, -183.0862, -180.7818]], grad_fn=<SliceBackward0>)

In [52]:
model(x[0][0].reshape((1,-1))).argsort(dim=1, descending=True)[0]

tensor([12,  2,  4, 20, 11,  9, 10,  8, 17, 18,  0,  1, 19,  5,  3, 15,  7, 16,
        13,  6, 14])

In [6]:
test_perms[0]

array([ 8, 14,  2, 11,  4,  5, 15,  7,  3,  6, 10, 13,  1, 12,  9,  0])

In [25]:
a.argmax()

tensor(2)

In [3]:
15+num_trans

17

In [8]:
best = float("inf")

for perm in test_perms:
    fixed = 0

    for pos, char in enumerate(perm):
        fixed += pos == char
    
    best = min(fixed, best)

best

np.int64(0)

In [5]:
import plotly.express as px

embedding = model.module.token_embedding_table(torch.arange(vocab_size)).detach().cpu().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)))
    similarity.append(row)

similarity = np.array(similarity)

px.imshow(similarity.round(3), title=f"{MODELNAME} token embedding similarity matrix")

In [49]:
with open(f"./embedding_pictures/position/{MODELNAME}.npy", "wb") as file:
    np.save(file, similarity)

In [94]:
with open(f"./embedding_pictures/position/{MODELNAME}.npy", "rb") as file:
    b = np.load(file)

b.shape

(2, 27, 27)

This next plot is found by taking the embeddings of $s_{i,j}$ at different positions and dotting them together (normed)

In [5]:
def binarise(pos1, pos2, d=4):
    return torch.tensor([int(x) for x in format(2**(d)*pos1+pos2, f'#0{2*d+2}b')[2:]])

def process_tensor(tensor):
    return tensor.detach().numpy().reshape(-1)

similarity = []

digits = 5

for swap1 in range(2**(digits-1)):
    swap1_slice = []

    for swap2 in range(2**(digits-1)):
        swap2_slice = []

        for x in range(10):
            row = []

            for y in range(10):
                token_embedding = process_tensor(
                    model.module.token_embedding_table(binarise(swap1, swap2, digits))
                )

                vectors = []

                for j in (x, y):
                    position_embedding = process_tensor(
                        model.module.position_embedding(torch.arange(j*digits, (j+2)*digits))
                    )

                    swap_embedding = token_embedding + position_embedding
                    vectors.append(swap_embedding / np.linalg.norm(swap_embedding))
                
                row.append(np.dot(vectors[0], vectors[1]))
            swap2_slice.append(row)
        swap1_slice.append(swap2_slice)
    similarity.append(swap1_slice)

similarity = np.array(similarity)

IndexError: index out of range in self

In [12]:
import plotly.graph_objects as go

# the following code is chatgpt generated (mostly)

def frame_name(i, j):
    return f"swap_{i}_{j}"

# Create initial heatmap (first slice)
initial_data = similarity[0, 0]
fig = go.Figure(data=go.Heatmap(z=initial_data))

zmin = np.min(similarity)
zmax = np.max(similarity)

# Define frames
frames = [
    go.Frame(
        data=go.Heatmap(z=similarity[i, j], zmin=zmin, zmax=zmax), 
        name=frame_name(i, j)
    )
    for i in range(similarity.shape[0])
    for j in range(similarity.shape[1])
]

# Add frames to the figure
fig.frames = frames

# Create slider steps for both sliders
slider1_steps = [
    {
        "args": [
            [frame_name(i, 0)],
            {
                "frame": {"duration": 300, "redraw": True},
                "mode": "immediate",
                "transition": {"duration": 300},
            },
        ],
        "label": f"Index {i}",
        "method": "animate",
    }
    for i in range(similarity.shape[0])
]

slider2_steps = [
    {
        "args": [
            [frame_name(0, j)],
            {
                "frame": {"duration": 300, "redraw": True},
                "mode": "immediate",
                "transition": {"duration": 300},
            },
        ],
        "label": f"Index {j}",
        "method": "animate",
    }
    for j in range(similarity.shape[1])
]

# Update layout with sliders
fig.update_layout(
    sliders=[
        {
            "currentvalue": {
                "font": {"size": 10},
                "prefix": "Tranposition 1: ",
                "visible": True,
                "xanchor": "center",
            },
            "steps": slider1_steps,
            "yanchor": "top",
            "xanchor": "left",
            "y": -0.1
        },
        {
            "currentvalue": {
                "font": {"size": 10},
                "prefix": "Transposition 2: ",
                "visible": True,
                "xanchor": "center",
            },
            "steps": slider2_steps,
            "yanchor": "top",
            "xanchor": "left",
            "y": -0.2
        }
    ],
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
        constrain="domain"
    ),
    yaxis=dict(
        scaleanchor="x",
        scaleratio=1,
        constrain="domain"
    )
)

# Show the figure
fig.show()


In [27]:
similarity = []

digits = 5

for swap in range(2**digits):
    swap_slice = []

    for kindex in range(MAX_TRANS_NUMBER):

        token_embedding = model.module.token_embedding_table(binarise(swap, swap+1, digits)).detach().numpy()

        matrices = []

        for j in (1, kindex):
            position_embedding = model.module.position_embedding(torch.arange(j*digits, (j+2)*digits)).detach().numpy()

            swap_embedding = token_embedding + position_embedding
            matrices.append(swap_embedding)

        swap_slice.append(np.dot(matrices[0], np.transpose(matrices[1])))
    
    similarity.append(swap_slice)

similarity = np.array(similarity)

ValueError: operands could not be broadcast together with shapes (11,102) (10,102) 

In [36]:
swap+1

32

In [16]:
from plotly import optional_imports
nbformat = optional_imports.get_module("nbformat")
print(nbformat)

None


In [8]:
import pandas as pd

results = pd.read_csv("./results/window-7.0.csv")

In [9]:
num_results = results["results"].to_numpy().astype(int)

successes = []
failures = []

for perm, result in zip(test_perms, num_results):
    if result:
        successes.append(perm)
    else:
        failures.append(perm)

In [10]:
results.head(10)

Unnamed: 0,results
0,False
1,True
2,True
3,True
4,False
5,True
6,False
7,True
8,True
9,True


In [44]:
val_perms[0]

array([ 0,  1,  2,  3,  4,  5,  9,  6, 12, 11, 14, 13, 15,  8, 10,  7])

np.int64(37)

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)

fit = pca.fit_transform(model.module.position_embedding(torch.arange(block_size)).detach().numpy())
fit

array([[ 4.56129104e-01, -1.72061384e+00, -6.31371379e-01],
       [ 4.73234206e-01, -1.69271410e+00, -7.25310147e-01],
       [ 1.28970981e+00,  2.65495945e-02, -1.35467708e+00],
       [ 1.36588347e+00,  6.16256818e-02, -1.39985502e+00],
       [ 1.65531397e+00,  5.08190691e-01,  3.24675351e-01],
       [ 1.73166049e+00,  5.15716493e-01,  3.68282616e-01],
       [ 9.22570646e-01, -5.27477920e-01,  1.52274334e+00],
       [ 8.78905594e-01, -5.92262626e-01,  1.61128533e+00],
       [-2.61793211e-02, -2.96814919e-01,  6.76155746e-01],
       [ 1.14902435e-02, -2.49039501e-01,  6.12203181e-01],
       [-6.21682480e-02,  1.32397127e+00, -3.98209631e-01],
       [ 3.48626426e-03,  1.33885586e+00, -4.09683257e-01],
       [-2.07545653e-01,  1.78181314e+00,  2.89803267e-01],
       [-1.89323261e-01,  1.56268394e+00,  3.49922001e-01],
       [-8.93277884e-01,  3.32210213e-01,  1.07849467e+00],
       [-9.98443604e-01,  1.78505585e-01,  1.11210680e+00],
       [-1.64116502e+00, -6.82362378e-01

In [7]:
pca.explained_variance_ratio_

array([0.1193774 , 0.10322382, 0.09974902], dtype=float32)

In [45]:
val_seqs[0]

array([ 7, 15, 12, 11,  8, 10, 13, 10,  6, 12,  8, 11, 11, 14,  7, 12, 10,
       11,  9,  6])

## Custom ouput tests

In [26]:
def get_perm_hybrid(seq, size):
    og = list(range(size))

    swaps = []

    for pos, x in enumerate(seq):
        if pos % 2 == 0:
            swaps.append([x])
        else:
            swaps[-1].append(x)
    
    for x, y in swaps:
        og[x], og[y] = og[y], og[x]

    return np.array(og)

def convert_to_hybrid(seq, size):
    new = []
    
    for char in seq:
        new.append(int(char//size))
        new.append(int(char%size))
    
    return new

def convert_to_general(seq, size):
    new = []

    for x, y in zip(seq[::2], seq[1::2]):
        new.append(x*size + y)

    return new

custom = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

true = get_perm_hybrid(
    custom, 
    16
)

genned = model.module.generate(
    convert_to_general(custom, 16), 
    force_valid=False, 
    debug=False
)

print("True: ", true)
print("Model:", genned)
print("Same:", (true == genned).all())
print("Sortd:", np.sort(genned))

IndexError: index 27 is out of bounds for dimension 0 with size 27

In [29]:
a = range(20)

for x, y in zip(a[::2], a[1::2]):
    print(x, y)

0 1
2 3
4 5
6 7
8 9
10 11
12 13
14 15
16 17
18 19


In [25]:
convert_to_general(custom, 16)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [57]:
[0 for x in range(20)]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [65]:
# wrong
b = torch.tensor([-265.7672, -265.7672, -265.7672, -265.7672, -265.7672, -265.7672,
         -265.7672, -265.7672, -265.7672, -265.7672, -265.7672, -265.7672,
         -265.7672, -265.7672, -265.7672, -265.7672, -240.0325, -247.6589,
         -246.5491, -257.4102, -257.1504, -257.8494, -255.2135, -254.7709,
         -253.9813, -256.4985, -256.2157, -261.1152, -266.6082, -258.3374,
         -273.0349, -261.6855, -265.7672, -265.7672])

import plotly.express as px
px.bar(b[16:32]-b[16:32].max()-1)

In [59]:
train_perms[0]

array([ 0,  1,  2,  3,  4,  5, 10, 11, 13,  6,  9, 12, 15, 14,  8,  7])

In [34]:
success_fixed_count = []

for perm in successes:
    fixed_count = 0

    for pos, char in enumerate(perm):
        if pos == char:
            fixed_count += 1
    
    success_fixed_count.append(fixed_count)

sum(success_fixed_count)/len(success_fixed_count)

5.069490635392963

In [48]:
failure_fixed_count = []

for perm in failures:
    fixed_count = 0

    for pos, char in enumerate(perm):
        if pos == char:
            fixed_count += 1
    
    failure_fixed_count.append(fixed_count)

sum(failure_fixed_count)/len(failure_fixed_count)

4.768602752507581

In [49]:
import csv

with open('win-7-succs.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["successes"])

    for x in success_fixed_count:
        writer.writerow([x])


In [50]:
import csv

with open('win-7-fails.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["failures"])

    for x in failure_fixed_count:
        writer.writerow([x])

In [1]:
from importlib import import_module

In [2]:
import_module("nbformat")

ImportError: dlopen(/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so, 0x0002): tried: '/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (no such file), '/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))

In [1]:
import nbformat

In [20]:
!pip uninstall rpds-py

Found existing installation: rpds-py 0.18.1
Uninstalling rpds-py-0.18.1:
  Would remove:
    /Users/midataur/Documents/github/permutations/binary-classifier/venv/lib/python3.11/site-packages/rpds/*
    /Users/midataur/Documents/github/permutations/binary-classifier/venv/lib/python3.11/site-packages/rpds_py-0.18.1.dist-info/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [10]:
!pip show nbformat

Name: nbformat
Version: 5.10.4
Summary: The Jupyter Notebook format
Home-page: 
Author: 
Author-email: Jupyter Development Team <jupyter@googlegroups.com>
License: BSD 3-Clause License

- Copyright (c) 2001-2015, IPython Development Team
- Copyright (c) 2015-, Jupyter Development Team

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS

In [8]:
!pip install --upgrade notebook jupyter jupyterlab

Collecting notebook
  Obtaining dependency information for notebook from https://files.pythonhosted.org/packages/32/b4/b0cdaf52c35a3a40633136bee5152d6670acb555c698d23a3458dca65781/notebook-7.2.1-py3-none-any.whl.metadata
  Downloading notebook-7.2.1-py3-none-any.whl.metadata (10 kB)
Collecting jupyter
  Obtaining dependency information for jupyter from https://files.pythonhosted.org/packages/83/df/0f5dd132200728a86190397e1ea87cd76244e42d39ec5e88efd25b2abd7e/jupyter-1.0.0-py2.py3-none-any.whl.metadata
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting jupyterlab
  Obtaining dependency information for jupyterlab from https://files.pythonhosted.org/packages/2d/d1/69edb0a70473089057afc1e10936a743d985f5f80774da982f47b84c4d55/jupyterlab-4.2.3-py3-none-any.whl.metadata
  Downloading jupyterlab-4.2.3-py3-none-any.whl.metadata (16 kB)
Collecting jupyter-server<3,>=2.4.0 (from notebook)
  Obtaining dependency information for jupyter-server<3,>=2.4.0 from https://fil

In [9]:
!pip install --upgrade plotly

Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/0b/f8/b65cdd2be32e442c4efe7b672f73c90b05eab5a7f3f4115efe181d432c60/plotly-5.22.0-py3-none-any.whl.metadata
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Obtaining dependency information for tenacity>=6.2.0 from https://files.pythonhosted.org/packages/e3/ee/b179c3ab5cb842d75c65339c4b86b572eaf8f43407890bd1d2c7b72eb829/tenacity-8.4.2-py3-none-any.whl.metadata
  Downloading tenacity-8.4.2-py3-none-any.whl.metadata (1.2 kB)
Downloading plotly-5.22.0-py3-none-any.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tenacity-8.4.2-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.22.0 tenacity-8.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new re

In [None]:
00110100

In [20]:
convert_tokens_to_perm([12,13,14,15,16,17])

[10, 11, 12, 13, 14, 15]

In [4]:
k = 0
seq = val_seqs[k]
perm = val_perms[k]
perm

array([ 0,  1, 11,  2,  5,  8,  7,  3,  6,  9, 10,  4, 12, 13, 14, 15])

In [6]:
a = model.module.generate(seq)

In [19]:
# Calculate and print accuracy after each epoch
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode

    # calculate validation stats
    total_accuracy = 0.0
    total_loss = 0.0

    num_batches = 0

    print("Evaluating...")
    for inputs, targets in tqdm(val_dataloader):
        outputs = model(inputs)

        0/0

        # calculate the val accuracy
        accuracy = calculate_accuracy(outputs, targets)
        total_accuracy += accuracy

        # Calculate the val loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        num_batches += 1

    average_accuracy = total_accuracy / num_batches
    val_loss = total_loss / num_batches

    metrics = {
        "validation_accuracy": average_accuracy,
        "loss": val_loss,
        "training_accuracy": average_train_accuracy,
        "training_loss": train_loss,
    }

Evaluating...


  0%|          | 0/5313 [00:00<?, ?it/s]


ZeroDivisionError: division by zero

In [62]:
inputs[2]

tensor([ 1,  1,  1,  0,  0,  1,  0,  0,  0,  0,  1,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,
         1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,
         0,  1,  0,  0,  0,  1,  1,  1,  0,  0, 19,  5,  3, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18])

In [63]:
outputs[2]

tensor([-6464.0293, -6464.0288, -6445.1235, -6450.2651, -6455.7666, -6448.2124,
        -6398.2354, -6451.0259, -6451.6304, -6452.4790, -6443.1245, -6442.4336,
        -6453.8066, -6463.5933, -6480.9731, -6465.3164, -6468.9014, -6464.4336,
        -6464.0283, -6464.0308, -6466.9629])

In [64]:
torch.argmax(outputs[2])

tensor(6)

In [57]:
model.module.softmax(outputs)[1]

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [59]:
targets[1]

tensor(3)

In [61]:
val_perms[0]

array([ 3,  1,  4,  9,  2,  5,  6,  8,  0,  7, 10, 11, 12, 13, 14, 15])

In [None]:
import pyperclip

def np_to_mathematica(array, copy=True):
    formatted = str(array.tolist()).replace("[", "{").replace("]", "}")
    return formatted

In [None]:
pyperclip.copy(np_to_mathematica(embedding_pca))
print("Copied!")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

embedding = np.array(model.module.token_embedding_table.weight.cpu().detach().numpy())
pos_embedding = np.array(model.module.position_embedding.weight.cpu().detach().numpy())

pca = PCA(n_components=3)
pca.fit(embedding)

embedding_pca = pca.transform(embedding)

pca = PCA(n_components=2)
pca.fit(pos_embedding)

pos_embedding_pca = pca.transform(pos_embedding)

In [None]:
import plotly.express as px

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y))
    similarity.append(row)

px.imshow(similarity)

In [None]:
np.dot(pos_embedding[MAX_LENGTH], embedding[START_PREDICTION_TOKEN])

In [None]:
convert_to_transposition(13)

In [None]:
threshold = 30

for pos1, x in enumerate(embedding):
    for pos2, y in enumerate(embedding):
        if np.dot(x, y) > threshold and pos1 != pos2:
            print("x", pos1, "y", pos2, "dot", np.dot(x, y))

In [None]:
px.imshow(model.module.token_embedding_table.weight.detach())

In [None]:
torch.no_grad()
model.eval()

In [None]:
targets

In [None]:
# calculate validation stats
criterion = nn.CrossEntropyLoss()

total_accuracy = 0.0
total_loss = 0.0

num_batches = 0

print("Evaluating...")
for inputs, targets in tqdm(val_dataloader):
    outputs = model(inputs)

    # calculate the val accuracy
    accuracy = calculate_accuracy(outputs, targets)
    total_accuracy += accuracy

    # Calculate the val loss
    loss = criterion(outputs, targets)
    total_loss += loss.item()
    num_batches += 1

average_accuracy = total_accuracy / num_batches
val_loss = total_loss / num_batches

In [None]:
val_loss

In [None]:
vocab_size

In [None]:
output = model(data)

In [None]:
train

In [None]:
train.shape

In [None]:
output.shape

In [None]:
output[1]

In [None]:
model.get_device()

In [None]:
dev

In [None]:
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

In [None]:
conver

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [None]:
model.eval()

# use gpu for processing
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

# create an initial input
input_tensor = torch.ones(block_size, dtype=int).to(dev)
input_tensor *= TO_PREDICT_TOKEN
input_tensor[:len(seq)] = torch.tensor(seq, dtype=int).to(dev)
input_tensor[len(seq)] = START_PREDICTION_TOKEN

In [None]:
input_tensor.unsqueeze(0)

In [None]:
model(input_tensor.unsqueeze(0))

In [None]:
torch.argmax(model(input_tensor.unsqueeze(0)), dim=1)

In [None]:
prediction_tensor = torch.zeros(block_size, dtype=int).to(dev)

In [None]:
sequence = [1,2,3,1,2,3]

In [None]:
prediction_tensor[:len(sequence)] = torch.tensor(sequence, dtype=int).to(dev)

In [None]:
prediction_tensor

In [None]:
input_tensor = torch.ones(block_size, dtype=int).to(dev)

In [None]:
input_tensor *= TO_PREDICT_TOKEN

In [None]:
input_tensor