In [63]:
import sys
from importlib import reload

import polars as pl
import torch
import torch.nn as nn
from sklearn import metrics
from torch.utils.data import DataLoader, Dataset

from datatools import plotting as dtplot
from datatools import tabular as dttab

sys.path.append("..")

import plotting
import util
from src import text_process
from src import models_torch

reload(util)
reload(plotting)
reload(models_torch)
dtplot.set_plotly_template()

## load data


In [64]:
examples = util.load_examples(
    # util.MAP_TAGS,
    # ["python", "pseudo", "rust"],
).sort("length")

examples = examples.with_columns(
    tags_det=pl.col("tokens").map_elements(
        lambda tks: text_process.process("".join(tks))[1], pl.List(pl.String)
    )
)
# display(examples)
lang_counts = dttab.value_counts(examples["lang"], verbose=True, as_dict=True)


20 unique (lang):  'python', 'matlab', 'pseudo', 'php', 'rust' ,...


In [65]:
# train-val split
train_df, val_df = util.data_split(examples, 0.3)
print(f"split: {len(train_df)} training, {len(val_df)} val")


splitted 84 & 37 (shuffled)
split: 84 training, 37 val


## most common tokens


In [66]:
token_counts = dttab.value_counts(
    examples["tokens"].explode(), verbose=True, as_dict=True
)
tag_counts = dttab.value_counts(examples["tags"].explode(), verbose=True, as_dict=True)


657 unique (tokens):  ' ', '\n', ',', ')', '(' ,...
33 unique (tags):  'ws', 'va', 'pu', 'nl', 'brop' ,...


## make a vocab!

- add padding to both tokens and tags
- also, convert tokens and tags to integers


In [67]:
# vocab for tokens
vocab = ["<pad>", "<unk>"] + list(token_counts.keys())[:50]
token2idx = {t: i for i, t in enumerate(vocab)}

# tags
tag_vocab = ["<pad>", "uk"] + list(tag_counts.keys())
tag2idx = {t: i for i, t in enumerate(tag_vocab)}

print("vocab (tokens):", vocab)
print("vocab (tags)  :", tag_vocab)

# Convert tokens and labels to indices
# these are lists of lists!
train_token_idx = [[token2idx.get(t, 1) for t in seq] for seq in train_df["tokens"]]
train_tag_true_idx = [[tag2idx[t] for t in seq] for seq in train_df["tags"]]
train_tag_det_idx = [[tag2idx.get(t, 1) for t in seq] for seq in train_df["tags_det"]]

# print("\nlists of lists:")
# print(train_token_idx)
# print(train_tag_idx)
print(f"\ntraining examples of length: {[len(e) for e in train_token_idx]}")

# validation data
val_token_idx = [[token2idx.get(t, 1) for t in seq] for seq in val_df["tokens"]]
val_tag_true_idx = [[tag2idx[t] for t in seq] for seq in val_df["tags"]]
val_tag_det_idx = [[tag2idx.get(t, 1) for t in seq] for seq in val_df["tags_det"]]
print(f"validation examples of length: {[len(e) for e in val_token_idx]}")


vocab (tokens): ['<pad>', '<unk>', ' ', '\n', ',', ')', '(', '=', '.', ';', '    ', '0', ']', '[', '1', ':', '\n\n', '  ', 'x', '}', '{', '*', 'import', 'i', 'y', '+', '2', 'for', 'np', 'n', '==', 'if', 'return', 'in', 'data', 'torch', '-', '::', 'int', 'as', 'plt', 'from', '3', 'echo', '<', 'z', 't', 'print', '_player', 'util', 'usize', 'k']
vocab (tags)  : ['<pad>', 'uk', 'ws', 'va', 'pu', 'nl', 'brop', 'brcl', 'nu', 'sy', 'opas', 'id', 'mo', 'fnfr', 'st', 'opbi', 'kwfl', 'fnas', 'kwim', 'pa', 'cl', 'kwty', 'fnme', 'at', 'kwop', 'opcm', 'opun', 'bo', 'kwva', 'kwio', 'kwmo', 'kwfn', 'kwde', 'cofl', 'li']

training examples of length: [59, 73, 8, 20, 9, 29, 3, 26, 8, 92, 30, 17, 39, 36, 85, 98, 24, 44, 38, 20, 14, 26, 85, 4, 5, 9, 15, 21, 33, 40, 11, 29, 155, 18, 42, 8, 33, 25, 22, 25, 111, 32, 11, 20, 17, 13, 60, 22, 14, 91, 47, 44, 95, 14, 10, 28, 9, 63, 15, 53, 29, 35, 3, 314, 12, 38, 25, 12, 76, 26, 30, 21, 48, 43, 17, 25, 28, 13, 17, 39, 15, 32, 51, 8]
validation examples of lengt

### class weights?


In [68]:
reload(dttab)


all_tags = train_df["tags"].explode()

tag_counts = dttab.value_counts(all_tags, sort_by="value", as_dict=True)


def class_weights(tag_counts: dict, tag_vocab: list[str], smoothing=0.1):
    tag_weights = [1 / tag_counts.get(k, torch.inf) + smoothing for k in tag_vocab]
    tag_weights = torch.tensor(tag_weights)
    tag_weights /= sum(tag_weights)
    return tag_weights


tag_weights = class_weights(tag_counts, tag_vocab, 10)

# for k in tag_vocab:
#     print(f"weight({k}) = ", tag_weights[tag2idx[k]])

### Prepare data for model


In [69]:
print("train:")
train_token_tensors = util.seqs2padded_tensor(train_token_idx)
train_tag_true_tensors = util.seqs2padded_tensor(train_tag_true_idx)
train_tag_det_tensors = util.seqs2padded_tensor(train_tag_det_idx)
print("val:")
val_token_tensors = util.seqs2padded_tensor(val_token_idx)
val_tag_true_tensors = util.seqs2padded_tensor(val_tag_true_idx)
val_tag_det_tensors = util.seqs2padded_tensor(val_tag_det_idx)

assert train_token_tensors.shape == train_tag_true_tensors.shape
assert train_tag_det_tensors.shape == train_tag_true_tensors.shape

assert val_token_tensors.shape == val_tag_true_tensors.shape
assert val_tag_det_tensors.shape == val_tag_true_tensors.shape


train:
padded tensor: (84, 314)
padded tensor: (84, 314)
padded tensor: (84, 318)
val:
padded tensor: (37, 71)
padded tensor: (37, 71)
padded tensor: (37, 71)


AssertionError: 

In [47]:
class SequenceDataset(Dataset):
    def __init__(self, tokens, labels_det, labels_true):
        print(len(tokens), len(labels_det), len(labels_true))
        self.tokens = tokens
        self.labels_det = labels_det
        self.labels_true = labels_true

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.tokens[idx], self.labels_det[idx], self.labels_true[idx]


# Create dataset and dataloader

train_loader = DataLoader(
    SequenceDataset(train_token_tensors, train_tag_det_tensors, train_tag_true_tensors),
    batch_size=8,
    shuffle=True,
)
val_loader = DataLoader(
    SequenceDataset(val_token_tensors, val_tag_det_tensors, val_tag_true_tensors),
    batch_size=16,
    shuffle=False,
)


84 84 84
37 37 37


## model


In [None]:
# Parameters
embedding_dim = 6
hidden_dim = 64
n_lstm_layers = 2
dropout_lstm = 0.5
bidi = True
epochs = 30

print("vocab lengths", len(vocab), len(tag_vocab))
reload(models_torch)

model = models_torch.LSTMTagger(
    len(vocab),
    len(tag_vocab),
    embedding_dim,
    hidden_dim,
    n_lstm_layers,
    dropout_lstm,
    bidi,
)
loss_function = nn.CrossEntropyLoss()

loss_function = nn.CrossEntropyLoss(weight=tag_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Training loop

losses_train = []
losses_val = []

for epoch in range(epochs):
    # TRAINING
    train_loss = models_torch.run_epoch(model, train_loader, loss_function, optimizer)
    losses_train.append(train_loss)

    # VALIDATION
    with torch.no_grad():
        val_loss = models_torch.run_epoch(model, val_loader, loss_function)
        losses_val.append(val_loss)

    # print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
plotting.scatter(y=[losses_train, losses_val]).show()

print(
    "final loss:\n", f"  train: {losses_train[-1]:.4f}", f"  val : {losses_val[-1]:.4f}"
)

vocab lengths 52 35


RuntimeError: Sizes of tensors must match except in dimension 2. Expected size 314 but got size 318 for tensor number 1 in the list.

## evaluate


In [None]:
model.eval()
with torch.no_grad():
    tag_scores = model(val_token_tensors, val_tag_det_tensors)
    predictions = torch.argmax(tag_scores, dim=-1)  # Shape: (batch_size, seq_len)

pred_tags = []
true_tags = []

for pred, true_t in zip(predictions, val_tag_true_idx):
    true_tags.extend([tag_vocab[t] for t in true_t])
    pred_tags.extend([tag_vocab[t] for t in pred[: len(true_t)]])

print(len(true_tags), len(pred_tags))

acc = metrics.accuracy_score(true_tags, pred_tags)
print("accuracy", acc)

confmat = metrics.confusion_matrix(true_tags, pred_tags, labels=tag_vocab)

dtplot.heatmap(
    confmat,
    tag_vocab,
    log_scale=False,
    pseudo_count=10,
    size=400,
).show()

1080 1080
accuracy 0.924074074074074


### eval only on non-det


In [None]:
pred_tags = []
true_tags = []

for pred, true_t, det in zip(
    predictions, val_tag_true_idx, val_tag_det_idx, strict=True
):
    for p, t, d in zip(pred, true_t, det):
        if tag_vocab[d] == "uk":
            tp = tag_vocab[p.item()]
            tt = tag_vocab[t]
            # if tp != tt:
            #     print(f"{repr(tt)} -> {repr(tp)}")
            true_tags.append(tt)
            pred_tags.append(tp)
print(len(pred_tags), len(true_tags))

labels_left = sorted(set(pred_tags + true_tags))

acc = metrics.accuracy_score(true_tags, pred_tags)
print("accuracy", acc)

confmat = metrics.confusion_matrix(true_tags, pred_tags, labels=labels_left)

dtplot.heatmap(
    confmat,
    labels_left,
    log_scale=False,
    pseudo_count=10,
    size=400,
).show()

356 356
accuracy 0.7949438202247191


## save output


In [None]:
import json

# print(os.listdir())

model.eval()
with torch.no_grad():
    tag_scores = model(val_token_tensors, val_tag_det_tensors)
    predictions = torch.argmax(tag_scores, dim=-1)  # Shape: (batch_size, seq_len)

print("predictions", predictions.size())

outputs = {}
for ex, pred in zip(val_df.iter_rows(named=True), predictions, strict=True):
    pred_tags = []
    for p in pred:
        if p == 0:
            break
        pred_tags.append(tag_vocab[p])
    assert len(ex["tokens"]) == len(pred_tags), "wrong length"
    pred_tags
    print(ex["tags"])
    print(pred_tags)
    print()
    outputs[ex["name"]] = {"tokens": ex["tokens"], "tags": pred_tags}
with open("../output/pred_output.json", "w", encoding="utf-8") as f:
    json.dump(outputs, f)

predictions torch.Size([35, 111])
['cofl', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'opbi', 'ws', 'nu', 'ws', 'opbi', 'ws', 'va', 'pu', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'opbi', 'ws', 'nu', 'ws', 'opbi', 'ws', 'va', 'pu']
['cofl', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'opbi', 'ws', 'nu', 'ws', 'opbi', 'ws', 'va', 'pu', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'opbi', 'ws', 'nu', 'ws', 'opbi', 'ws', 'va', 'pu']



AssertionError: wrong length

# parameter search


In [13]:
from coolsearch import search

reload(search)


def objective(
    embedding_dim,
    hidden_dim,
    n_lstm_layers,
    dropout_lstm,
    epochs,
    class_weight_smoothing,
    bidi,
):
    model = models_torch.LSTMTagger(
        len(vocab),
        len(tag_vocab),
        embedding_dim,
        hidden_dim,
        n_lstm_layers,
        dropout_lstm,
        bidi,
    )
    tag_weights = class_weights(tag_counts, tag_vocab, class_weight_smoothing)

    loss_function = nn.CrossEntropyLoss(weight=tag_weights)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    # Training loop

    for _ in range(epochs):
        # TRAINING
        models_torch.run_epoch(model, train_loader, loss_function, optimizer)

    model.eval()
    with torch.no_grad():
        val_loss = models_torch.run_epoch(model, val_loader, loss_function)
        tag_scores = model(val_token_tensors, val_tag_det_tensors)
        predictions = torch.argmax(tag_scores, dim=-1)

    pred_tags = []
    true_tags = []

    for pred, true_t in zip(predictions, val_tag_true_idx):
        true_tags.extend([tag_vocab[t] for t in true_t])
        pred_tags.extend([tag_vocab[t] for t in pred[: len(true_t)]])

    acc = metrics.accuracy_score(true_tags, pred_tags)

    return {"val_acc": acc, "val_loss": val_loss}


params = {
    "embedding_dim": [4, 6],
    "hidden_dim": [16, 32],
    "n_lstm_layers": [2],
    "dropout_lstm": [0.3],
    "epochs": 5,
    "class_weight_smoothing": [100.0],
    "bidi": True,
}
cs = search.CoolSearch(
    objective, params, n_jobs=1, samples_file="../search/lstm_03.csv"
)


In [14]:
cs.grid_search(3)
display(cs.samples.sort("val_loss"))

no new points!


bidi,class_weight_smoothing,dropout_lstm,embedding_dim,epochs,hidden_dim,n_lstm_layers,runtime,val_acc,val_loss
bool,f64,f64,i32,i32,i32,i32,f64,f64,f64
True,100.0,0.3,6,10,32,2,0.781659,0.662237,0.431066
True,100.0,0.3,4,10,32,2,0.468184,0.638981,0.450502
True,100.0,0.3,6,10,16,2,0.556546,0.488372,0.632376
True,100.0,0.3,4,10,16,2,0.356832,0.428571,0.700303
True,100.0,0.3,6,5,32,2,0.237014,0.354374,0.789934
True,100.0,0.3,4,5,32,2,0.231413,0.353267,0.809635
True,100.0,0.3,4,5,16,2,0.196189,0.264673,0.856089
True,100.0,0.3,6,5,16,2,0.174387,0.256921,0.89928
True,100.0,0.3,6,3,32,2,0.189315,0.259136,0.921637
True,100.0,0.3,4,3,32,2,0.207486,0.253599,0.927424


In [15]:
plotting.scatter(cs.samples["val_loss"], [cs.samples["val_acc"]]).update_traces(
    mode="markers"
).update_layout(width=200, height=100)

In [16]:
marg = cs.marginals("val_acc")

pars = list(marg.keys())

print(pars)
k = pars[1]
print(k)
plotting.scatter(x=marg[k][k], y=[marg[k]["max"], marg[k]["mean"]])


['embedding_dim', 'hidden_dim', 'n_lstm_layers', 'dropout_lstm', 'epochs', 'class_weight_smoothing', 'bidi']
hidden_dim
