# Setup

In [6]:
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import wandb
from utilities import *
from config import *
from dataloading import *
from tqdm import tqdm
from transformer import *
import os

In [7]:
# setup the model
model = BigramLanguageModel()

# cuda? (gpu)
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

# send to gpu (maybe)
model = nn.DataParallel(model)
model = model.to(device)

# optionally: load the model
filename = PATH + "/model/" + MODELNAME + ".pth"
if os.path.isfile(filename):
    model.load_state_dict(torch.load(filename, map_location=torch.device(device)))

os.path.isfile(filename)

True

In [14]:
torch.norm(model.module.position_embedding(torch.arange(block_size)), dim=1)

tensor([1.4373, 1.7087, 3.0711, 1.6072, 1.3768, 1.6005, 1.8965, 3.6222, 1.4991,
        1.3836, 1.1925, 1.7763, 3.0512, 1.2602, 1.7053, 1.1857, 1.7943, 3.0573,
        1.2169, 1.3917, 1.2007, 1.4680, 3.0223, 1.5739, 2.1504, 1.0557, 1.4136,
        3.0209, 1.4963, 1.3928, 1.1103, 1.4799, 3.0290, 1.4532, 1.3874, 1.2438,
        1.4193, 3.0962, 1.4477, 1.3899, 1.1333, 1.6687, 3.3599, 1.3531, 1.6604,
        1.3165, 1.7076, 3.1032, 1.3607, 1.3820, 1.2134, 1.1081, 3.4252, 1.6753,
        1.3844, 1.1489, 1.1815, 3.5950, 1.6513, 1.3891, 1.1093, 1.5022, 3.5008,
        1.6764, 1.7086, 1.2641, 1.3932, 3.6701, 1.6443, 1.3781, 1.2451, 1.5854,
        2.8688, 1.5941, 1.9436, 1.4033, 1.4786, 2.8131, 1.5841, 1.3844, 1.2848,
        1.7271, 2.9532, 1.6822, 1.3795, 1.1694, 1.6557, 2.7654, 1.6950, 1.3807,
        0.8279, 1.5053, 2.6616, 1.9581, 1.3691, 0.9912, 1.5381, 2.6938, 2.0077,
        1.3911, 1.0491, 1.5438, 1.6787, 1.4692, 1.1431, 1.1352, 1.7485, 1.4191,
        1.5212, 1.9603, 1.8416, 1.8083, 

In [18]:
import plotly.express as px

embedding = model.module.position_embedding(torch.arange(block_size)).detach().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y))
    similarity.append(row)

px.imshow(similarity)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
import plotly.express as px

embedding = model.module.position_embedding(torch.arange(block_size)).detach().numpy()

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y))
    similarity.append(row)

px.imshow(similarity)

In [4]:
k = 0
seq = val_seqs[k]
perm = val_perms[k]
perm

array([ 3,  1,  4,  9,  2,  5,  6,  8,  0,  7, 10, 11, 12, 13, 14, 15])

In [5]:
model.module.generate(seq)

[3, 1, 4, 9, 2, 5, 6, 8, 0, 7, 10, 11, 12, 13, 14, 15]

In [19]:
# Calculate and print accuracy after each epoch
with torch.no_grad():
    model.eval()  # Set the model to evaluation mode

    # calculate validation stats
    total_accuracy = 0.0
    total_loss = 0.0

    num_batches = 0

    print("Evaluating...")
    for inputs, targets in tqdm(val_dataloader):
        outputs = model(inputs)

        0/0

        # calculate the val accuracy
        accuracy = calculate_accuracy(outputs, targets)
        total_accuracy += accuracy

        # Calculate the val loss
        loss = criterion(outputs, targets)
        total_loss += loss.item()
        num_batches += 1

    average_accuracy = total_accuracy / num_batches
    val_loss = total_loss / num_batches

    metrics = {
        "validation_accuracy": average_accuracy,
        "loss": val_loss,
        "training_accuracy": average_train_accuracy,
        "training_loss": train_loss,
    }

Evaluating...


  0%|          | 0/5313 [00:00<?, ?it/s]


ZeroDivisionError: division by zero

In [62]:
inputs[2]

tensor([ 1,  1,  1,  0,  0,  1,  0,  0,  0,  0,  1,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  1,  1,  0,  0,  0,  1,
         1,  0,  0,  0,  1,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  1,
         0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0,
         0,  1,  0,  0,  0,  1,  1,  1,  0,  0, 19,  5,  3, 18, 18, 18, 18, 18,
        18, 18, 18, 18, 18, 18, 18, 18, 18, 18])

In [63]:
outputs[2]

tensor([-6464.0293, -6464.0288, -6445.1235, -6450.2651, -6455.7666, -6448.2124,
        -6398.2354, -6451.0259, -6451.6304, -6452.4790, -6443.1245, -6442.4336,
        -6453.8066, -6463.5933, -6480.9731, -6465.3164, -6468.9014, -6464.4336,
        -6464.0283, -6464.0308, -6466.9629])

In [64]:
torch.argmax(outputs[2])

tensor(6)

In [57]:
model.module.softmax(outputs)[1]

tensor([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.])

In [59]:
targets[1]

tensor(3)

In [61]:
val_perms[0]

array([ 3,  1,  4,  9,  2,  5,  6,  8,  0,  7, 10, 11, 12, 13, 14, 15])

In [None]:
import pyperclip

def np_to_mathematica(array, copy=True):
    formatted = str(array.tolist()).replace("[", "{").replace("]", "}")
    return formatted

In [None]:
pyperclip.copy(np_to_mathematica(embedding_pca))
print("Copied!")

In [None]:
from sklearn.decomposition import PCA
import numpy as np

embedding = np.array(model.module.token_embedding_table.weight.cpu().detach().numpy())
pos_embedding = np.array(model.module.position_embedding.weight.cpu().detach().numpy())

pca = PCA(n_components=3)
pca.fit(embedding)

embedding_pca = pca.transform(embedding)

pca = PCA(n_components=2)
pca.fit(pos_embedding)

pos_embedding_pca = pca.transform(pos_embedding)

In [None]:
import plotly.express as px

similarity = []

for x in embedding:
    row = []
    for y in embedding:
        row.append(np.dot(x, y))
    similarity.append(row)

px.imshow(similarity)

In [None]:
np.dot(pos_embedding[MAX_LENGTH], embedding[START_PREDICTION_TOKEN])

In [None]:
convert_to_transposition(13)

In [None]:
threshold = 30

for pos1, x in enumerate(embedding):
    for pos2, y in enumerate(embedding):
        if np.dot(x, y) > threshold and pos1 != pos2:
            print("x", pos1, "y", pos2, "dot", np.dot(x, y))

In [None]:
px.imshow(model.module.token_embedding_table.weight.detach())

In [None]:
torch.no_grad()
model.eval()

In [None]:
targets

In [None]:
# calculate validation stats
criterion = nn.CrossEntropyLoss()

total_accuracy = 0.0
total_loss = 0.0

num_batches = 0

print("Evaluating...")
for inputs, targets in tqdm(val_dataloader):
    outputs = model(inputs)

    # calculate the val accuracy
    accuracy = calculate_accuracy(outputs, targets)
    total_accuracy += accuracy

    # Calculate the val loss
    loss = criterion(outputs, targets)
    total_loss += loss.item()
    num_batches += 1

average_accuracy = total_accuracy / num_batches
val_loss = total_loss / num_batches

In [None]:
val_loss

In [None]:
vocab_size

In [None]:
output = model(data)

In [None]:
train

In [None]:
train.shape

In [None]:
output.shape

In [None]:
output[1]

In [None]:
model.get_device()

In [None]:
dev

In [None]:
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

In [None]:
conver

array([1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [None]:
model.eval()

# use gpu for processing
if cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"

# create an initial input
input_tensor = torch.ones(block_size, dtype=int).to(dev)
input_tensor *= TO_PREDICT_TOKEN
input_tensor[:len(seq)] = torch.tensor(seq, dtype=int).to(dev)
input_tensor[len(seq)] = START_PREDICTION_TOKEN

In [None]:
input_tensor.unsqueeze(0)

In [None]:
model(input_tensor.unsqueeze(0))

In [None]:
torch.argmax(model(input_tensor.unsqueeze(0)), dim=1)

In [None]:
prediction_tensor = torch.zeros(block_size, dtype=int).to(dev)

In [None]:
sequence = [1,2,3,1,2,3]

In [None]:
prediction_tensor[:len(sequence)] = torch.tensor(sequence, dtype=int).to(dev)

In [None]:
prediction_tensor

In [None]:
input_tensor = torch.ones(block_size, dtype=int).to(dev)

In [None]:
input_tensor *= TO_PREDICT_TOKEN

In [None]:
input_tensor