## LLM

In [26]:
import torch as torch
import numpy as np
import pickle as pkl
from tqdm.notebook import tqdm
from transformer_kristianwold.transformer import Transformer, Inference
from transformer_kristianwold.optimization import train_step, forward_and_loss, group_decay_parameters, save_checkpoint, load_checkpoint
from transformer_kristianwold.utils import saver, loader
from transformer_kristianwold.analysis import word_sim_to_vocab, cosine_similarity
from torch.utils.data import TensorDataset, DataLoader
from IPython.display import clear_output
import matplotlib.pyplot as plt
from transformer_kristianwold.analysis import EmbeddingClustering

# Print the library / toolkit versions of the current runtime, one per line.
environment_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA toolkit version PyTorch was built with:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
    ("cuda available:", torch.cuda.is_available()),
)
for label, value in environment_report:
    print(label, value)

# Set the float32 matmul precision mode to 'high'.
torch.set_float32_matmul_precision('high')
# The CPU device is selected unconditionally, whatever the CUDA check above reports.
device = torch.device("cpu")

PyTorch version: 2.7.1+cu128
CUDA toolkit version PyTorch was built with: 12.8
cuDNN version: 90701
cuda available: True


## Load

In [106]:
# Tokenizer object read from disk with the project's `loader` helper. load_model()
# reads `tokenizer.vocab_size` and `tokenizer.token_to_idx["<s>"]` from it.
# NOTE(review): the .pkl extension suggests a pickle file — only load trusted files; confirm in `loader`.
tokenizer = loader("../../tokenizers/cnn_tokenizer3.pkl")
# Module-level `model` starts out as None; a later cell assigns the loaded model to it.
model = None
def load_model(filename=None):
    """Build the notebook's Transformer and optionally restore it from a checkpoint.

    The architecture is fixed here: 18 blocks, 18 heads, embedding width
    64*18, feed-forward width 4x the embedding width, max_seq_len 1024,
    dropout 0.1 and weight tying enabled. Vocabulary size and the "<s>"
    start-token id come from the module-level ``tokenizer``; the model is
    moved to the module-level ``device``.

    An AdamW optimizer and a warm-up ``LambdaLR`` scheduler are constructed
    only because ``load_checkpoint`` takes them as arguments; they are not
    returned. No loss function or gradient scaler is created, since nothing
    in this function trains the model.

    Args:
        filename: Checkpoint path handed to ``load_checkpoint``. When ``None``
            (the default) no checkpoint is read and the freshly constructed
            model is returned.

    Returns:
        Tuple ``(model, loss_train_list, loss_test_list)``. Without a
        checkpoint both lists are empty; with one, they are whatever
        ``load_checkpoint`` returns in those positions.
    """
    heads = 18
    embed_dim = 64*heads
    ff_dim = 4*embed_dim
    tf_blocks = 18

    model = Transformer(
        embed_dim=embed_dim,
        ff_dim=ff_dim,
        heads=heads,
        tf_blocks=tf_blocks,
        vocab_size=tokenizer.vocab_size,
        max_seq_len=1024,
        dropout=0.1,
        start_token_id=tokenizer.token_to_idx["<s>"],
        use_weight_tying=True
    ).to(device)
    # NOTE(review): the model's train/eval mode is not changed here. Call
    # model.eval() before running inference so that dropout is disabled.

    optimizer_grouped_parameters = group_decay_parameters(
        model,
        weight_decay=0.1,
        no_decay=["bias", "LayerNorm.weight"],
        )

    # Optimizer and scheduler exist solely to satisfy load_checkpoint's signature.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5)
    warmup_steps = 1000

    def lr_lambda(step):
        # Linear ramp from 0 to 1 over the first `warmup_steps` steps, then constant.
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return 1.0

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    loss_train_list = []
    loss_test_list = []
    if filename is not None:
        # The second and third returned items are discarded
        # (presumably the optimizer and scheduler — confirm against load_checkpoint).
        [model,
        _,
        _,
        loss_train_list,
        loss_test_list] = load_checkpoint(filename,
                                        model,
                                        optimizer,
                                        scheduler,
                                        loss_train_list,
                                        loss_test_list)

    return model, loss_train_list, loss_test_list

## Load Model

In [105]:
# Restore the 5-epoch checkpoint; the loss histories returned with it are kept too.
model, loss_train_list, loss_test_list = load_model(
    "../../models/checkpoint_transformer_5epoch.pth"
)
# Word-embedding matrix as a NumPy array. `.detach()` replaces the legacy `.data`
# accessor: it yields the same values without autograd tracking.
word_embed = model.word_embed.weight.detach().cpu().numpy()

## London-Luton Relation

In [109]:
# Rank vocabulary entries by similarity to " luton" in the word-embedding matrix.
text = " luton"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=300)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  luton
Rank: 0, Token: 18773, Word:  luton
Rank: 1, Token: 21513, Word:  bedfordshire
Rank: 2, Token: 18485, Word:  gatwick
Rank: 3, Token: 20192, Word:  southend
Rank: 4, Token: 21182, Word:  swindon
Rank: 5, Token: 20278, Word:  peterborough
Rank: 6, Token: 17491, Word:  ipswich
Rank: 7, Token: 15925, Word:  watford
Rank: 8, Token: 22027, Word:  warrington
Rank: 9, Token: 20397, Word:  colchester
Rank: 10, Token: 14353, Word:  northampton
Rank: 11, Token: 22126, Word:  stockport
Rank: 12, Token: 13564, Word:  hertfordshire
Rank: 13, Token: 23910, Word:  hartlepool
Rank: 14, Token: 12643, Word:  coventry
Rank: 15, Token: 9696, Word:  heathrow
Rank: 16, Token: 17046, Word:  croydon
Rank: 17, Token: 13885, Word:  exeter
Rank: 18, Token: 12144, Word:  bournemouth
Rank: 19, Token: 12401, Word:  portsmouth
Rank: 20, Token: 18583, Word:  huddersfield
Rank: 21, Token: 10890, Word:  norwich
Rank: 22, Token: 14959, Word:  oldham
Rank: 23, Token: 5020, Word:  birmingham
Rank: 24, Token: 1

In [114]:
# Rank vocabulary entries by similarity to " london" in the word-embedding matrix.
text = " london"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=300)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  london
Rank: 0, Token: 966, Word:  london
Rank: 1, Token: 7654, Word:  london's
Rank: 2, Token: 14546, Word:  londons
Rank: 3, Token: 7048, Word:  glasgow
Rank: 4, Token: 3421, Word:  paris
Rank: 5, Token: 3703, Word:  sydney
Rank: 6, Token: 1193, Word:  british
Rank: 7, Token: 1710, Word:  manchester
Rank: 8, Token: 21091, Word:  londoners
Rank: 9, Token: 1597, Word:  britain
Rank: 10, Token: 992, Word:  uk
Rank: 11, Token: 7261, Word:  westminster
Rank: 12, Token: 5187, Word:  yorkshire
Rank: 13, Token: 1182, Word:  england
Rank: 14, Token: 13411, Word:  dublin
Rank: 15, Token: 16513, Word:  mayfair
Rank: 16, Token: 13953, Word:  belfast
Rank: 17, Token: 2153, Word:  liverpool
Rank: 18, Token: 5020, Word:  birmingham
Rank: 19, Token: 8038, Word:  surrey
Rank: 20, Token: 7551, Word:  edinburgh
Rank: 21, Token: 17046, Word:  croydon
Rank: 22, Token: 2995, Word:  scotland
Rank: 23, Token: 5902, Word:  melbourne
Rank: 24, Token: 8049, Word:  berlin
Rank: 25, Token: 6020, Word:  ox

## Sierra Leone-Liberia Relation

In [None]:
# Rank vocabulary entries by similarity to " liberia" in the word-embedding matrix.
text = " liberia"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=20)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  liberia
Rank: 0, Token: 9977, Word:  liberia
Rank: 1, Token: 7349, Word:  nigeria
Rank: 2, Token: 15883, Word:  uganda
Rank: 3, Token: 17937, Word:  rwanda
Rank: 4, Token: 17461, Word:  ethiopia
Rank: 5, Token: 13441, Word:  mali
Rank: 6, Token: 16926, Word:  guatemala
Rank: 7, Token: 16341, Word:  senegal
Rank: 8, Token: 10011, Word:  haiti
Rank: 9, Token: 17272, Word:  tanzania
Rank: 10, Token: 8010, Word:  sudan
Rank: 11, Token: 15845, Word:  honduras
Rank: 12, Token: 11167, Word:  ghana
Rank: 13, Token: 22227, Word:  lagos
Rank: 14, Token: 8313, Word:  sierra
Rank: 15, Token: 4014, Word:  ebola
Rank: 16, Token: 10476, Word:  guinea
Rank: 17, Token: 14197, Word:  tunisia
Rank: 18, Token: 4725, Word:  niger
Rank: 19, Token: 10763, Word:  leone
Rank: 20, Token: 6227, Word:  libya
Rank: 21, Token: 20295, Word:  darfur
Rank: 22, Token: 8869, Word:  somalia
Rank: 23, Token: 14851, Word:  algeria
Rank: 24, Token: 8709, Word:  colombia
Rank: 25, Token: 8852, Word:  kenya
Rank: 26, T

In [121]:
# Rank vocabulary entries by similarity to " sierra" in the word-embedding matrix.
text = " sierra"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=10)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  sierra
Rank: 0, Token: 8313, Word:  sierra
Rank: 1, Token: 9977, Word:  liberia
Rank: 2, Token: 8148, Word:  sier
Rank: 3, Token: 10763, Word:  leone
Rank: 4, Token: 21344, Word:  namib
Rank: 5, Token: 13441, Word:  mali
Rank: 6, Token: 7349, Word:  nigeria
Rank: 7, Token: 10476, Word:  guinea
Rank: 8, Token: 17461, Word:  ethiopia
Rank: 9, Token: 4725, Word:  niger


## Terror-Syria Relation 

In [125]:
# Rank vocabulary entries by similarity to " terror" in the word-embedding matrix.
text = " terror"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=300)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  terror
Rank: 0, Token: 1829, Word:  terror
Rank: 1, Token: 4350, Word:  terrorist
Rank: 2, Token: 5642, Word:  terrorism
Rank: 3, Token: 6115, Word:  terrorists
Rank: 4, Token: 8610, Word:  militant
Rank: 5, Token: 12913, Word:  jihadist
Rank: 6, Token: 10190, Word:  extremist
Rank: 7, Token: 8174, Word: terror
Rank: 8, Token: 5757, Word:  jihad
Rank: 9, Token: 4773, Word:  militants
Rank: 10, Token: 9394, Word:  extremists
Rank: 11, Token: 8002, Word:  islamist
Rank: 12, Token: 16725, Word:  jihadi
Rank: 13, Token: 6501, Word:  bombing
Rank: 14, Token: 4041, Word:  violent
Rank: 15, Token: 2423, Word:  attacks
Rank: 16, Token: 7906, Word:  torture
Rank: 17, Token: 6642, Word:  radical
Rank: 18, Token: 5717, Word:  hate
Rank: 19, Token: 6344, Word:  brutal
Rank: 20, Token: 15315, Word:  extremism
Rank: 21, Token: 3276, Word:  isis
Rank: 22, Token: 9609, Word:  bombings
Rank: 23, Token: 14392, Word:  atroc
Rank: 24, Token: 13425, Word:  jihadists
Rank: 25, Token: 7336, Word:  hor

In [126]:
# Rank vocabulary entries by similarity to " syria" in the word-embedding matrix.
text = " syria"  # the leading space is part of the query string
tokens = tokenizer.encode(text)
embed = word_embed[tokens]  # embedding rows of the query tokens (not used further in this cell)

print("Word:", text)

# Only the first entry of the result is kept
# (presumably the ranking for the first query token — confirm in word_sim_to_vocab).
similarity_ranking = word_sim_to_vocab(tokens, word_embed, top_k=300)
words_sim = similarity_ranking[0]

# One line per neighbour: rank, token id, decoded text.
for i, word in enumerate(words_sim):
    print(f"Rank: {i}, Token: {word}, Word: {tokenizer.decode([word])}")

Word:  syria
Rank: 0, Token: 2629, Word:  syria
Rank: 1, Token: 10717, Word:  syria's
Rank: 2, Token: 3756, Word:  syrian
Rank: 3, Token: 6227, Word:  libya
Rank: 4, Token: 10544, Word:  damascus
Rank: 5, Token: 12346, Word:  aleppo
Rank: 6, Token: 10227, Word:  lebanon
Rank: 7, Token: 2370, Word:  iran
Rank: 8, Token: 3042, Word:  afghanistan
Rank: 9, Token: 1962, Word:  iraq
Rank: 10, Token: 17748, Word:  homs
Rank: 11, Token: 15139, Word:  syrians
Rank: 12, Token: 3893, Word:  ukraine
Rank: 13, Token: 3339, Word:  egypt
Rank: 14, Token: 4413, Word:  turkey
Rank: 15, Token: 8869, Word:  somalia
Rank: 16, Token: 8010, Word:  sudan
Rank: 17, Token: 6457, Word:  gaza
Rank: 18, Token: 6194, Word:  yemen
Rank: 19, Token: 3276, Word:  isis
Rank: 20, Token: 19583, Word:  raqqa
Rank: 21, Token: 8694, Word:  assad
Rank: 22, Token: 14197, Word:  tunisia
Rank: 23, Token: 2543, Word:  russia
Rank: 24, Token: 19192, Word:  kobane
Rank: 25, Token: 19832, Word:  beirut
Rank: 26, Token: 19837, Word: