In [6]:
import sys
from importlib import reload

import numpy as np
import polars as pl
import torch
import torch.nn as nn
from sklearn import metrics
from torch.utils.data import DataLoader, Dataset

sys.path.append("..")
from datatools import tabular as dttab, plotting as dtplot


import plotting
import util

reload(util)
reload(plotting)

dtplot.set_plotly_template()

## load data


In [7]:
examples = util.load_examples().sort("length")

lang_counts = dttab.value_counts(examples["lang"], verbose=True, as_dict=True)
print(lang_counts)
# display(examples)

16 unique (lang):  'python', 'matlab', 'pseudo', 'rust', 'lua' ,...
{'python': 26, 'matlab': 14, 'pseudo': 13, 'rust': 10, 'lua': 3, 'csharp': 3, 'kotlin': 2, 'dart': 2, 'cpp': 2, 'c': 2, 'ruby': 1, 'php': 1, 'natural': 1, 'json': 1, 'go': 1, 'bash': 1}


In [8]:
# train-val split
train_df, val_df = util.data_split(examples, 0.3)
print(f"split: {len(train_df)} training, {len(val_df)} val")


splitted 58 & 25 (shuffled)
split: 58 training, 25 val


## most common tokens


In [9]:
token_counts = dttab.value_counts(
    examples["tokens"].explode(), verbose=True, as_dict=True
)
tag_counts = dttab.value_counts(examples["tags"].explode(), verbose=True, as_dict=True)


440 unique (tokens):  ' ', '\n', ')', '(', '=' ,...
32 unique (tags):  'ws', 'br', 'va', 'nl', 'pu' ,...


## make a vocab!

- add padding to both tokens and tags
- also, convert tokens and tags to integers


In [10]:
# vocab for tokens
vocab = ["<pad>", "<unk>"] + list(token_counts.keys())[:10]
token2idx = {t: i for i, t in enumerate(vocab)}

# tags
tag_vocab = ["<pad>"] + list(tag_counts.keys())
tag2idx = {t: i for i, t in enumerate(tag_vocab)}

print("vocab (tokens):", vocab)
print("vocab (tags)  :", tag_vocab)

# Convert tokens and labels to indices
# these are lists of lists!
train_token_idx = [[token2idx.get(t, 1) for t in seq] for seq in train_df["tokens"]]
train_tag_idx = [[tag2idx[t] for t in seq] for seq in train_df["tags"]]
# print("\nlists of lists:")
# print(train_token_idx)
# print(train_tag_idx)
print(f"\ntraining examples of length: {[len(e) for e in train_token_idx]}")

# validation data
val_token_idx = [[token2idx.get(t, 1) for t in seq] for seq in val_df["tokens"]]
val_tag_idx = [[tag2idx[t] for t in seq] for seq in val_df["tags"]]
print(f"validation examples of length: {[len(e) for e in val_token_idx]}")


vocab (tokens): ['<pad>', '<unk>', ' ', '\n', ')', '(', '=', ',', '.', ';', '    ', 'x']
vocab (tags)  : ['<pad>', 'ws', 'br', 'va', 'nl', 'pu', 'opas', 'sy', 'nu', 'mo', 'id', 'opbi', 'fnfr', 'kwfl', 'kwim', 'pa', 'st', 'cl', 'fnas', 'kwty', 'fnme', 'at', 'opcm', 'opun', 'kwop', 'kwva', 'bo', 'kwfn', 'kwmo', 'kwio', 'cofl', 'kwde', 'li']

training examples of length: [11, 31, 15, 25, 26, 62, 3, 17, 17, 14, 20, 20, 38, 47, 3, 25, 28, 35, 21, 26, 30, 3, 15, 15, 63, 29, 44, 9, 22, 43, 9, 9, 47, 73, 25, 63, 32, 13, 28, 38, 10, 25, 17, 8, 31, 26, 38, 13, 71, 3, 29, 32, 9, 32, 29, 22, 33, 29]
validation examples of length: [21, 25, 76, 8, 44, 65, 31, 30, 20, 51, 39, 39, 22, 36, 37, 32, 85, 42, 44, 11, 21, 19, 48, 8, 26]


### Prepare data for model


In [11]:
train_token_tensors = util.seqs2padded_tensor(train_token_idx)
train_tag_tensors = util.seqs2padded_tensor(train_tag_idx)
val_token_tensors = util.seqs2padded_tensor(val_token_idx)
val_tag_tensors = util.seqs2padded_tensor(val_tag_idx)

print(f"token tensor (train): {train_token_tensors.shape}")
print(f"tag tensor   (train): {train_tag_tensors.shape}")
print(f"token tensor (val): {val_token_tensors.shape}")
print(f"tag tensor   (val): {val_tag_tensors.shape}")

padded tensor: (58, 73)
padded tensor: (58, 73)
padded tensor: (25, 85)
padded tensor: (25, 85)
token tensor (train): torch.Size([58, 73])
tag tensor   (train): torch.Size([58, 73])
token tensor (val): torch.Size([25, 85])
tag tensor   (val): torch.Size([25, 85])


In [12]:
class SequenceDataset(Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self, idx):
        return self.tokens[idx], self.labels[idx]


# Create dataset and dataloader
train_dataset = SequenceDataset(train_token_tensors, train_tag_tensors)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)


## model


In [13]:
class LSTMTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim):
        super(LSTMTagger, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentences):
        embeds = self.embedding(sentences)
        # Shape: (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embeds)
        # Shape: (batch_size, seq_len, hidden_dim)
        tag_scores = self.hidden2tag(lstm_out)
        # Shape: (batch_size, seq_len, tagset_size)
        return tag_scores


In [15]:
# Parameters
embedding_dim = 32
hidden_dim = 32
vocab_size = len(vocab)
tagset_size = len(tag_vocab)

model = LSTMTagger(vocab_size, tagset_size, embedding_dim, hidden_dim)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Training loop
epochs = 40
losses = []
for epoch in range(epochs):
    for examples, labels in train_loader:
        model.zero_grad()

        # Forward pass
        tag_scores = model(examples)

        # Reshape for loss calculation
        loss = loss_function(tag_scores.view(-1, tagset_size), labels.view(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

    # print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
plotting.scatter(y=[losses])

## evaluate


In [None]:
model.eval()
with torch.no_grad():
    tag_scores = model(val_token_tensors)
    predictions = torch.argmax(tag_scores, dim=-1)  # Shape: (batch_size, seq_len)

pred_tags = []
true_tags = []

for pred, true_t in zip(predictions, val_tag_idx):
    # print(pred.shape)
    # print((len(true_tags)))
    true_tags.extend([tag_vocab[t] for t in true_t])
    pred_tags.extend([tag_vocab[t] for t in pred[: len(true_t)]])

print(len(true_tags), len(pred_tags))


def eval(true_tags, pred_tags):
    acc = metrics.accuracy_score(true_tags, pred_tags)
    print("accuracy", acc)

    confmat = metrics.confusion_matrix(true_tags, pred_tags, labels=tag_vocab)

    dtplot.heatmap(
        confmat,
        tag_vocab,
        log_scale=True,
        pseudo_count=10,
        size=500,
    ).show()


eval(true_tags, pred_tags)

880 880
accuracy 0.6863636363636364


In [17]:
from src import text_process

pred_tags_det = (
    val_df["tokens"]
    .map_elements(lambda tks: text_process.process("".join(tks))[1], pl.List(pl.String))
    .explode()
    .to_list()
)

# put all unknown into some class
# pred_tags_det = [t if t != "uk" else "va" for t in pred_tags_det]

print(len(true_tags), len(pred_tags_det))
print(true_tags)
print(pred_tags_det)


eval(true_tags, pred_tags_det)


880 880
['va', 'ws', 'opas', 'ws', 'bo', 'nl', 'va', 'ws', 'opas', 'ws', 'bo', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'kwop', 'ws', 'va', 'kwva', 'ws', 'va', 'ws', 'opas', 'ws', 'kwfl', 'ws', 'br', 'va', 'ws', 'opbi', 'ws', 'nu', 'ws', 'opcm', 'ws', 'nu', 'br', 'ws', 'st', 'ws', 'kwfl', 'ws', 'st', 'kwva', 'ws', 'kwmo', 'ws', 'va', 'ws', 'opas', 'ws', 'cl', 'sy', 'fnas', 'br', 'br', 'pu', 'nl', 'kwfl', 'ws', 'va', 'ws', 'kwop', 'ws', 'nu', 'sy', 'va', 'ws', 'br', 'nl', 'id', 'kwfl', 'ws', 'va', 'br', 'va', 'br', 'sy', 'fnme', 'br', 'br', 'sy', 'fnme', 'br', 'sy', 'opun', 'pa', 'sy', 'ws', 'pa', 'ws', 'opcm', 'ws', 'nu', 'br', 'ws', 'br', 'nl', 'id', 'id', 'va', 'sy', 'fnme', 'br', 'va', 'br', 'va', 'br', 'sy', 'fnme', 'br', 'br', 'br', 'pu', 'nl', 'id', 'br', 'nl', 'br', 'kwfl', 'ws', 'bo', 'nl', 'id', 'fnfr', 'br', 'br', 'va', 'ws', 'opas', 'ws', 'mo', 'sy', 'fnas', 'br', 'st', 'br', 'nl', 'va', 'ws', 'opas', 'ws', 'mo', 'sy', 'fnas', 'br', 'va', 'opbi', 'nu', 'opbi', 'va', 'opbi

## simple fill


In [18]:
pred_filled = []
for p, pd in zip(pred_tags, pred_tags_det, strict=True):
    pred_filled.append(pd if pd != "uk" else p)

print(pred_filled)
eval(true_tags, pred_filled)

['va', 'ws', 'opas', 'ws', 'va', 'nl', 'va', 'ws', 'opas', 'ws', 'nu', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'kwim', 'ws', 'va', 'va', 'ws', 'va', 'ws', 'opas', 'ws', 'va', 'ws', 'br', 'pa', 'ws', 'va', 'ws', 'nu', 'ws', 'opcm', 'ws', 'nu', 'br', 'ws', 'st', 'ws', 'va', 'ws', 'st', 'va', 'ws', 'va', 'ws', 'kwim', 'ws', 'opas', 'ws', 'nu', 'sy', 'va', 'br', 'br', 'pu', 'nl', 'va', 'ws', 'va', 'ws', 'kwim', 'ws', 'nu', 'sy', 'va', 'ws', 'br', 'nl', 'id', 'kwfl', 'ws', 'va', 'br', 'br', 'br', 'sy', 'fnme', 'br', 'br', 'sy', 'fnme', 'br', 'va', 'nu', 'va', 'sy', 'ws', 'pa', 'ws', 'opcm', 'ws', 'nu', 'br', 'ws', 'br', 'nl', 'id', 'id', 'kwfl', 'sy', 'fnme', 'br', 'va', 'br', 'br', 'br', 'sy', 'fnme', 'br', 'br', 'br', 'pu', 'nl', 'id', 'br', 'nl', 'br', 'va', 'ws', 'va', 'nl', 'id', 'kwfl', 'br', 'br', 'va', 'ws', 'opas', 'ws', 'va', 'sy', 'fnas', 'br', 'st', 'br', 'nl', 'va', 'ws', 'opas', 'ws', 'va', 'sy', 'at', 'br', 'va', 'opbi', 'nu', 'br', 'br', 'opbi', 'nu', 'br', 'nu', 'br', '