# Setup

In [1]:
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import wandb
from utilities import *
from config import *
from dataloading import *
from tqdm import tqdm
from transformer import *
import os

Loading data...


Loading data: 200000it [00:23, 8455.76it/s] 
Loading data: 10000it [00:00, 35793.41it/s]
Loading data: 10000it [00:00, 36107.06it/s]


In [2]:
# setup the model
model = BigramLanguageModel()

# cuda? (gpu)
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"
  
# send to gpu (maybe)
model = nn.DataParallel(model)
model = model.to(device)

# optionally: load the model
filename = PATH + "/model/" + MODELNAME + ".pth"
if os.path.isfile(filename):
    model.load_state_dict(torch.load(filename, map_location=torch.device(device)))

os.path.isfile(filename)

True

In [4]:
!pwd

/Users/midataur/Documents/github/permutations/scaling-generator


In [5]:
torch.norm(model.module.position_embedding(torch.arange(block_size)), dim=1)

tensor([1.4373, 1.7087, 3.0711, 1.6072, 1.3768, 1.6005, 1.8965, 3.6222, 1.4991,
        1.3836, 1.1925, 1.7763, 3.0512, 1.2602, 1.7053, 1.1857, 1.7943, 3.0573,
        1.2169, 1.3917, 1.2007, 1.4680, 3.0223, 1.5739, 2.1504, 1.0557, 1.4136,
        3.0209, 1.4963, 1.3928, 1.1103, 1.4799, 3.0290, 1.4532, 1.3874, 1.2438,
        1.4193, 3.0962, 1.4477, 1.3899, 1.1333, 1.6687, 3.3599, 1.3531, 1.6604,
        1.3165, 1.7076, 3.1032, 1.3607, 1.3820, 1.2134, 1.1081, 3.4252, 1.6753,
        1.3844, 1.1489, 1.1815, 3.5950, 1.6513, 1.3891, 1.1093, 1.5022, 3.5008,
        1.6764, 1.7086, 1.2641, 1.3932, 3.6701, 1.6443, 1.3781, 1.2451, 1.5854,
        2.8688, 1.5941, 1.9436, 1.4033, 1.4786, 2.8131, 1.5841, 1.3844, 1.2848,
        1.7271, 2.9532, 1.6822, 1.3795, 1.1694, 1.6557, 2.7654, 1.6950, 1.3807,
        0.8279, 1.5053, 2.6616, 1.9581, 1.3691, 0.9912, 1.5381, 2.6938, 2.0077,
        1.3911, 1.0491, 1.5438, 1.6787, 1.4692, 1.1431, 1.1352, 1.7485, 1.4191,
        1.5212, 1.9603, 1.8416, 1.8083, 

In [6]:
INPUT_LENGTH

100

In [3]:
import plotly.express as px

embedding = model.module.position_embedding(torch.arange(block_size)).detach().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)))
    similarity.append(row)

px.imshow(similarity, title=f"{MODELNAME} position embedding similarity matrix")

In [16]:
test_perms.apppend(list(range(16)))

AttributeError: 'numpy.ndarray' object has no attribute 'apppend'

In [4]:
plausible = 0

for perm in tqdm(test_perms):
    correct = 0

    for pos, char in enumerate(perm):
        if char == pos:
            correct += 1
    
    if correct >= 6:
        plausible += 1

plausible/len(test_perms)

100%|██████████| 10000/10000 [00:00<00:00, 249101.96it/s]


0.3368

In [5]:
import plotly.express as px

embedding = model.module.token_embedding_table(torch.arange(vocab_size)).detach().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y)))
    similarity.append(row)

px.imshow(similarity, title=f"{MODELNAME} token embedding similarity matrix")

This next plot is found by taking the embeddings of $s_{i,j}$ at different positions and dotting them together (normed)

In [10]:
def binarise(pos1, pos2, d=4):
    return torch.tensor([int(x) for x in format(2**(d)*pos1+pos2, f'#0{2*d+2}b')[2:]])

def process_tensor(tensor):
    return tensor.detach().numpy().reshape(-1)

similarity = []

digits = 5

for swap1 in range(2**(digits-1)):
    swap1_slice = []

    for swap2 in range(2**(digits-1)):
        swap2_slice = []

        for x in range(10):
            row = []

            for y in range(10):
                token_embedding = process_tensor(
                    model.module.token_embedding_table(binarise(swap1, swap2, digits))
                )

                vectors = []

                for j in (x, y):
                    position_embedding = process_tensor(
                        model.module.position_embedding(torch.arange(j*digits, (j+2)*digits))
                    )

                    swap_embedding = token_embedding + position_embedding
                    vectors.append(swap_embedding / np.linalg.norm(swap_embedding))
                
                row.append(np.dot(vectors[0], vectors[1]))
            swap2_slice.append(row)
        swap1_slice.append(swap2_slice)
    similarity.append(swap1_slice)

similarity = np.array(similarity)

In [12]:
import plotly.graph_objects as go

# the following code is chatgpt generated (mostly)

def frame_name(i, j):
    return f"swap_{i}_{j}"

# Create initial heatmap (first slice)
initial_data = similarity[0, 0]
fig = go.Figure(data=go.Heatmap(z=initial_data))

zmin = np.min(similarity)
zmax = np.max(similarity)

# Define frames
frames = [
    go.Frame(
        data=go.Heatmap(z=similarity[i, j], zmin=zmin, zmax=zmax), 
        name=frame_name(i, j)
    )
    for i in range(similarity.shape[0])
    for j in range(similarity.shape[1])
]

# Add frames to the figure
fig.frames = frames

# Create slider steps for both sliders
slider1_steps = [
    {
        "args": [
            [frame_name(i, 0)],
            {
                "frame": {"duration": 300, "redraw": True},
                "mode": "immediate",
                "transition": {"duration": 300},
            },
        ],
        "label": f"Index {i}",
        "method": "animate",
    }
    for i in range(similarity.shape[0])
]

slider2_steps = [
    {
        "args": [
            [frame_name(0, j)],
            {
                "frame": {"duration": 300, "redraw": True},
                "mode": "immediate",
                "transition": {"duration": 300},
            },
        ],
        "label": f"Index {j}",
        "method": "animate",
    }
    for j in range(similarity.shape[1])
]

# Update layout with sliders
fig.update_layout(
    sliders=[
        {
            "currentvalue": {
                "font": {"size": 10},
                "prefix": "Tranposition 1: ",
                "visible": True,
                "xanchor": "center",
            },
            "steps": slider1_steps,
            "yanchor": "top",
            "xanchor": "left",
            "y": -0.1
        },
        {
            "currentvalue": {
                "font": {"size": 10},
                "prefix": "Transposition 2: ",
                "visible": True,
                "xanchor": "center",
            },
            "steps": slider2_steps,
            "yanchor": "top",
            "xanchor": "left",
            "y": -0.2
        }
    ],
    xaxis=dict(
        scaleanchor="y",
        scaleratio=1,
        constrain="domain"
    ),
    yaxis=dict(
        scaleanchor="x",
        scaleratio=1,
        constrain="domain"
    )
)

# Show the figure
fig.show()


In [27]:
similarity = []

digits = 5

for swap in range(2**digits):
    swap_slice = []

    for kindex in range(MAX_TRANS_NUMBER):

        token_embedding = model.module.token_embedding_table(binarise(swap, swap+1, digits)).detach().numpy()

        matrices = []

        for j in (1, kindex):
            position_embedding = model.module.position_embedding(torch.arange(j*digits, (j+2)*digits)).detach().numpy()

            swap_embedding = token_embedding + position_embedding
            matrices.append(swap_embedding)

        swap_slice.append(np.dot(matrices[0], np.transpose(matrices[1])))
    
    similarity.append(swap_slice)

similarity = np.array(similarity)

ValueError: operands could not be broadcast together with shapes (11,102) (10,102) 

In [36]:
swap+1

32

In [16]:
from plotly import optional_imports
nbformat = optional_imports.get_module("nbformat")
print(nbformat)

None


In [12]:
import pandas as pd

results = pd.read_csv("./results/window-7.0.csv")

In [25]:
num_results = results["results"].to_numpy().astype(int)

successes = []
failures = []

for perm, result in zip(test_perms, num_results):
    if result:
        successes.append(perm)
    else:
        failures.append(perm)

In [60]:
results

Unnamed: 0,results
0,False
1,True
2,True
3,True
4,False
...,...
9995,False
9996,True
9997,False
9998,False


In [31]:
test_perms[4]

array([13,  1, 11, 14,  0, 10,  6,  7,  8,  9,  4,  5, 15,  2,  3, 12])

In [30]:
model.module.generate(test_seqs[4])

array([13,  1, 11, 14,  2, 10,  6,  7,  8,  9,  4,  5, 15, 13,  3, 12])

In [57]:
successes.sort(key=lambda x: str(x))
successes

[array([ 0,  1,  2,  3,  4,  5,  6, 10,  8, 13, 11,  7, 12,  9, 14, 15]),
 array([ 0,  1,  2,  3,  4,  5,  6, 11, 15,  9, 14, 13,  7,  8, 10, 12]),
 array([ 0,  1,  2,  3,  4,  5,  7,  6, 12, 11,  8, 10, 14, 13,  9, 15]),
 array([ 0,  1,  2,  3,  4,  5,  7, 12,  8, 14, 10,  9, 11, 13,  6, 15]),
 array([ 0,  1,  2,  3,  4,  5, 12, 11, 15,  9,  6,  7, 10, 13,  8, 14]),
 array([ 0,  1,  2,  3,  4,  5, 13,  7,  8,  9, 10,  6, 11, 15, 14, 12]),
 array([ 0,  1,  2,  3,  4,  5, 15,  6, 11, 13, 12,  8, 14, 10,  9,  7]),
 array([ 0,  1,  2,  3,  4,  6, 15, 11,  5,  9, 10,  7, 12, 13,  8, 14]),
 array([ 0,  1,  2,  3,  4,  8,  6,  7, 15, 14,  9, 11, 13,  5, 10, 12]),
 array([ 0,  1,  2,  3,  4,  8, 10, 13,  6,  7,  5,  9, 12, 11, 14, 15]),
 array([ 0,  1,  2,  3,  4,  9, 11,  7, 14, 10, 15,  6, 13, 12,  8,  5]),
 array([ 0,  1,  2,  3,  4,  9, 13, 14,  8, 10,  7, 11, 12, 15,  5,  6]),
 array([ 0,  1,  2,  3,  4, 10,  6, 14,  8,  9, 13, 15, 12,  7,  5, 11]),
 array([ 0,  1,  2,  3,  4, 11,  8, 12

In [59]:
train_perms[0]

array([ 0,  1,  2,  3,  4,  5, 10, 11, 13,  6,  9, 12, 15, 14,  8,  7])

In [56]:
failures.sort(key=lambda x: str(x))
failures

[array([ 0,  1,  2,  3,  4,  7,  6,  9,  8, 11,  5, 13, 10, 15, 12, 14]),
 array([ 0,  1,  2,  3, 10,  5, 11,  7,  9, 14,  4,  6, 12, 13,  8, 15]),
 array([ 0,  1,  2,  3, 12,  5, 10,  7,  8, 15,  6,  4, 13, 11, 14,  9]),
 array([ 0,  1,  2,  3, 13, 12,  8,  7,  6,  9, 14, 15,  4, 11,  5, 10]),
 array([ 0,  1,  2,  4, 13, 10, 12,  7,  8, 15,  5, 11,  6,  3, 14,  9]),
 array([ 0,  1,  2,  4, 14, 11, 13,  5,  8,  6, 10,  7, 12,  9, 15,  3]),
 array([ 0,  1,  2,  5,  8, 15,  6, 11,  4,  7,  9, 12, 14, 13, 10,  3]),
 array([ 0,  1,  2,  5, 14,  8,  4, 10, 15,  9,  7, 11,  3,  6, 13, 12]),
 array([ 0,  1,  2,  6,  4,  5, 13,  8, 12,  7, 11, 14, 15,  9, 10,  3]),
 array([ 0,  1,  2,  6,  4, 13,  3,  7, 11, 15, 10,  8, 12, 14,  5,  9]),
 array([ 0,  1,  2,  6, 11, 14, 10,  7, 12, 15,  9,  4,  8, 13,  5,  3]),
 array([ 0,  1,  2,  7,  4,  5, 15, 11,  6, 12, 10,  9,  3, 13, 14,  8]),
 array([ 0,  1,  2,  8, 12,  6, 10, 15, 11, 14,  4,  5,  3,  7,  9, 13]),
 array([ 0,  1,  2,  8, 13,  5,  6,  7

In [30]:
failures

[array([10,  4,  2, 15, 14,  9,  1,  6,  8,  7,  0, 11, 12,  5, 13,  3]),
 array([13,  1, 11, 14,  0, 10,  6,  7,  8,  9,  4,  5, 15,  2,  3, 12]),
 array([15,  1, 11,  7,  4,  9, 13,  3,  8, 14, 10,  5,  6, 12,  2,  0]),
 array([ 2,  9,  0,  3, 13, 10,  5,  8,  4,  7, 15, 11, 12,  6, 14,  1]),
 array([ 0,  1,  8,  3, 13,  5,  6,  7, 12,  2, 14, 10,  9,  4, 15, 11]),
 array([ 9, 11,  2, 10,  4, 14,  6,  7, 15,  0,  3, 13, 12,  8,  5,  1]),
 array([ 7,  1,  2, 13,  5,  4,  8, 11,  6, 14, 10,  9, 12,  3, 15,  0]),
 array([ 8,  1,  2,  3,  0, 10, 12,  5,  9, 11,  7,  4, 13,  6, 14, 15]),
 array([ 0, 12,  2,  3, 14,  5, 13,  9,  7,  4,  8, 11, 15,  6, 10,  1]),
 array([ 0,  9,  1,  3,  4,  8,  6, 10,  5, 11, 15,  7, 12, 13, 14,  2]),
 array([11, 15,  2,  6,  9, 10,  3,  7,  4,  8,  0,  5, 12, 14, 13,  1]),
 array([ 0,  4,  8,  5, 11, 14,  2,  7,  6,  9, 12, 13, 10,  3,  1, 15]),
 array([ 3,  1,  7,  9,  8,  4, 12, 15,  6, 10,  5, 11,  0, 13, 14,  2]),
 array([ 0,  3,  4,  7,  5, 14,  9, 15

In [34]:
success_fixed_count = []

for perm in successes:
    fixed_count = 0

    for pos, char in enumerate(perm):
        if pos == char:
            fixed_count += 1
    
    success_fixed_count.append(fixed_count)

sum(success_fixed_count)/len(success_fixed_count)

5.069490635392963

In [48]:
failure_fixed_count = []

for perm in failures:
    fixed_count = 0

    for pos, char in enumerate(perm):
        if pos == char:
            fixed_count += 1
    
    failure_fixed_count.append(fixed_count)

sum(failure_fixed_count)/len(failure_fixed_count)

4.768602752507581

In [49]:
import csv

with open('win-7-succs.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["successes"])

    for x in success_fixed_count:
        writer.writerow([x])


In [50]:
import csv

with open('win-7-fails.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["failures"])

    for x in failure_fixed_count:
        writer.writerow([x])

In [1]:
from importlib import import_module

In [2]:
import_module("nbformat")

ImportError: dlopen(/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so, 0x0002): tried: '/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64')), '/System/Volumes/Preboot/Cryptexes/OS/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (no such file), '/Users/midataur/Library/Python/3.11/lib/python/site-packages/rpds/rpds.cpython-311-darwin.so' (mach-o file, but is an incompatible architecture (have 'x86_64', need 'arm64'))

In [1]:
import nbformat

In [20]:
!pip uninstall rpds-py

Found existing installation: rpds-py 0.18.1
Uninstalling rpds-py-0.18.1:
  Would remove:
    /Users/midataur/Documents/github/permutations/binary-classifier/venv/lib/python3.11/site-packages/rpds/*
    /Users/midataur/Documents/github/permutations/binary-classifier/venv/lib/python3.11/site-packages/rpds_py-0.18.1.dist-info/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [10]:
!pip show nbformat

Name: nbformat
Version: 5.10.4
Summary: The Jupyter Notebook format
Home-page: 
Author: 
Author-email: Jupyter Development Team <jupyter@googlegroups.com>
License: BSD 3-Clause License

- Copyright (c) 2001-2015, IPython Development Team
- Copyright (c) 2015-, Jupyter Development Team

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.

THIS

In [8]:
!pip install --upgrade notebook jupyter jupyterlab

Collecting notebook
  Obtaining dependency information for notebook from https://files.pythonhosted.org/packages/32/b4/b0cdaf52c35a3a40633136bee5152d6670acb555c698d23a3458dca65781/notebook-7.2.1-py3-none-any.whl.metadata
  Downloading notebook-7.2.1-py3-none-any.whl.metadata (10 kB)
Collecting jupyter
  Obtaining dependency information for jupyter from https://files.pythonhosted.org/packages/83/df/0f5dd132200728a86190397e1ea87cd76244e42d39ec5e88efd25b2abd7e/jupyter-1.0.0-py2.py3-none-any.whl.metadata
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting jupyterlab
  Obtaining dependency information for jupyterlab from https://files.pythonhosted.org/packages/2d/d1/69edb0a70473089057afc1e10936a743d985f5f80774da982f47b84c4d55/jupyterlab-4.2.3-py3-none-any.whl.metadata
  Downloading jupyterlab-4.2.3-py3-none-any.whl.metadata (16 kB)
Collecting jupyter-server<3,>=2.4.0 (from notebook)
  Obtaining dependency information for jupyter-server<3,>=2.4.0 from https://fil

In [9]:
!pip install --upgrade plotly

Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/0b/f8/b65cdd2be32e442c4efe7b672f73c90b05eab5a7f3f4115efe181d432c60/plotly-5.22.0-py3-none-any.whl.metadata
  Downloading plotly-5.22.0-py3-none-any.whl.metadata (7.1 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Obtaining dependency information for tenacity>=6.2.0 from https://files.pythonhosted.org/packages/e3/ee/b179c3ab5cb842d75c65339c4b86b572eaf8f43407890bd1d2c7b72eb829/tenacity-8.4.2-py3-none-any.whl.metadata
  Downloading tenacity-8.4.2-py3-none-any.whl.metadata (1.2 kB)
Downloading plotly-5.22.0-py3-none-any.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading tenacity-8.4.2-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.22.0 tenacity-8.4.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new re

In [None]:
00110100

In [20]:
convert_tokens_to_perm([12,13,14,15,16,17])

[10, 11, 12, 13, 14, 15]

In [4]:
k = 0
seq = val_seqs[k]
perm = val_perms[k]
perm

array([ 3,  1,  4,  9,  2,  5,  6,  8,  0,  7, 10, 11, 12, 13, 14, 15])

In [5]:
model.module.generate(seq)

[3, 1, 4, 9, 2, 5, 6, 8, 0, 7, 10, 11, 12, 13, 14, 15]

In [19]:
# Calculate and print accuracy after each epoch
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode

    # calculate validation stats
    total_accuracy = 0.0
    total_loss = 0.0

    num_batches = 0

    print("Evaluating...")
    for inputs, targets in tqdm(val_dataloader):
        outputs = model(inputs)

        0/0

        # calculate the val accuracy
        accuracy = calculate_accuracy(outputs, targets)
        total_accuracy += accuracy

        # Calculate the val loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        num_batches += 1

    average_accuracy = total_accuracy / num_batches
    val_loss = total_loss / num_batches

    metrics = {
        "validation_accuracy": average_accuracy,
        "loss": val_loss,
        "training_accuracy": average_train_accuracy,
        "training_loss": train_loss,
    }

Evaluating...


  0%|          | 0/5313 [00:00<?, ?it/s]


ZeroDivisionError: division by zero

In [62]:
inputs[2]

tensor([ 1,  1,  1,  0,  0,  1,  0,  0,  0,  0,  1,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,
         1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,
         0,  1,  0,  0,  0,  1,  1,  1,  0,  0, 19,  5,  3, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18])

In [63]:
outputs[2]

tensor([-6464.0293, -6464.0288, -6445.1235, -6450.2651, -6455.7666, -6448.2124,
        -6398.2354, -6451.0259, -6451.6304, -6452.4790, -6443.1245, -6442.4336,
        -6453.8066, -6463.5933, -6480.9731, -6465.3164, -6468.9014, -6464.4336,
        -6464.0283, -6464.0308, -6466.9629])

In [64]:
torch.argmax(outputs[2])

tensor(6)

In [57]:
model.module.softmax(outputs)[1]

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [59]:
targets[1]

tensor(3)

In [61]:
val_perms[0]

array([ 3,  1,  4,  9,  2,  5,  6,  8,  0,  7, 10, 11, 12, 13, 14, 15])

In [None]:
import pyperclip

def np_to_mathematica(array, copy=True):
    formatted = str(array.tolist()).replace("[", "{").replace("]", "}")
    return formatted

In [None]:
pyperclip.copy(np_to_mathematica(embedding_pca))
print("Copied!")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

embedding = np.array(model.module.token_embedding_table.weight.cpu().detach().numpy())
pos_embedding = np.array(model.module.position_embedding.weight.cpu().detach().numpy())

pca = PCA(n_components=3)
pca.fit(embedding)

embedding_pca = pca.transform(embedding)

pca = PCA(n_components=2)
pca.fit(pos_embedding)

pos_embedding_pca = pca.transform(pos_embedding)

In [None]:
import plotly.express as px

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y))
    similarity.append(row)

px.imshow(similarity)

In [None]:
np.dot(pos_embedding[MAX_LENGTH], embedding[START_PREDICTION_TOKEN])

In [None]:
convert_to_transposition(13)

In [None]:
threshold = 30

for pos1, x in enumerate(embedding):
    for pos2, y in enumerate(embedding):
        if np.dot(x, y) > threshold and pos1 != pos2:
            print("x", pos1, "y", pos2, "dot", np.dot(x, y))

In [None]:
px.imshow(model.module.token_embedding_table.weight.detach())

In [None]:
torch.no_grad()
model.eval()

In [None]:
targets

In [None]:
# calculate validation stats
criterion = nn.CrossEntropyLoss()

total_accuracy = 0.0
total_loss = 0.0

num_batches = 0

print("Evaluating...")
for inputs, targets in tqdm(val_dataloader):
    outputs = model(inputs)

    # calculate the val accuracy
    accuracy = calculate_accuracy(outputs, targets)
    total_accuracy += accuracy

    # Calculate the val loss
    loss = criterion(outputs, targets)
    total_loss += loss.item()
    num_batches += 1

average_accuracy = total_accuracy / num_batches
val_loss = total_loss / num_batches

In [None]:
val_loss

In [None]:
vocab_size

In [None]:
output = model(data)

In [None]:
train

In [None]:
train.shape

In [None]:
output.shape

In [None]:
output[1]

In [None]:
model.get_device()

In [None]:
dev

In [None]:
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

In [None]:
conver

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [None]:
model.eval()

# use gpu for processing
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

# create an initial input
input_tensor = torch.ones(block_size, dtype=int).to(dev)
input_tensor *= TO_PREDICT_TOKEN
input_tensor[:len(seq)] = torch.tensor(seq, dtype=int).to(dev)
input_tensor[len(seq)] = START_PREDICTION_TOKEN

In [None]:
input_tensor.unsqueeze(0)

In [None]:
model(input_tensor.unsqueeze(0))

In [None]:
torch.argmax(model(input_tensor.unsqueeze(0)), dim=1)

In [None]:
prediction_tensor = torch.zeros(block_size, dtype=int).to(dev)

In [None]:
sequence = [1,2,3,1,2,3]

In [None]:
prediction_tensor[:len(sequence)] = torch.tensor(sequence, dtype=int).to(dev)

In [None]:
prediction_tensor

In [None]:
input_tensor = torch.ones(block_size, dtype=int).to(dev)

In [None]:
input_tensor *= TO_PREDICT_TOKEN

In [None]:
input_tensor