## Extra Features

LSTM tagger that takes extra per-token features generated from the input text.


In [2]:
import sys
import zlib

import torch

sys.path.append("..")
from src import torch_util, util

# Load the pre-split examples (split index "0301"), preview the training
# split, and derive the token / tag vocabularies from that split only.
data = util.load_examples_json(split_idx_id="0301")
train_split = data["train"]
display(train_split.head(5))

vocab, token2idx, tag_vocab, tag2idx = util.make_vocab(train_split)
print(f"vocab: {len(vocab)} tokens | tag_vocab: {len(tag_vocab)} tags")

Loaded 257 examples
    test: 34
    train: 169
    val: 54


difficulty,tokens,tags,name,lang,id,split,length
str,list[str],list[str],str,str,str,str,u32
"""easy""","[""["", ""2"", … ""]""]","[""brop"", ""nu"", … ""brcl""]","""smplrr""","""json""","""smplrr_json""","""train""",9
"""normal""","[""say"", "" "", """"Hello world""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""natural""","""hllwrld_natural""","""train""",3
"""normal""","[""puts"", "" "", """"Hello World""""]","[""kwio"", ""ws"", ""st""]","""hllwrld""","""ruby""","""hllwrld_ruby""","""train""",3
"""normal""","[""if"", "" "", … "")""]","[""kwfl"", ""ws"", … ""brcl""]","""smplndnttb""","""pseudo""","""smplndnttb_pseudo""","""train""",8
"""normal""","[""if"", "" "", … "")""]","[""kwfl"", ""ws"", … ""brcl""]","""smplndntsp""","""pseudo""","""smplndntsp_pseudo""","""train""",8


vocab: 137 tokens | tag_vocab: 38 tags


In [3]:
# Constructor arguments for the tagger; the two vocabulary sizes come from
# the vocabularies built from the training split.
constr_params = dict(
    embedding_dim=16,
    hidden_dim=64,
    n_lstm_layers=2,
    dropout_lstm=0.3,
    bidi=True,
    token_vocab_size=len(vocab),
    label_vocab_size=len(tag_vocab),
)

model = torch_util.LSTMTagger(**constr_params)
print(model)

LSTMTagger(
  (embedding_tokens): Embedding(137, 16, padding_idx=0)
  (embedding_labels): Embedding(38, 16, padding_idx=0)
  (lstm): LSTM(32, 64, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (hidden2tag): Linear(in_features=128, out_features=38, bias=True)
)


## Make features


In [4]:
def make_extra_feats(tokens: list[str]) -> torch.Tensor:
    """Build the hand-crafted per-token feature matrix for the tagger.

    Every token contributes one row with three columns, in this order:

    0. ``is_capitalized``: 1.0 if the token's first character is uppercase,
       else 0.0 (an empty token counts as not capitalized).
    1. ``word_length``: token length capped at 10, then divided by 16.
    2. ``line_starts_with``: bucket id in ``[0, 10)`` computed from the first
       token of the sequence; the same value is repeated on every row.

    The bucket is computed with ``zlib.crc32`` instead of the built-in
    ``hash``: ``str`` hashes are salted per interpreter process, so a
    ``hash``-based bucket would change after every kernel restart and the
    feature would not be reproducible between runs.

    Args:
        tokens: The tokens of one input sequence.

    Returns:
        A ``float32`` tensor of shape ``(len(tokens), 3)``. An empty ``tokens``
        list yields an empty tensor of shape ``(0, 3)``.
    """
    n_features = 3
    if not tokens:
        # torch.tensor([]) would be 1-D; keep the feature axis for callers.
        return torch.zeros((0, n_features), dtype=torch.float32)

    # Loop-invariant: depends only on the first token of the sequence.
    first_token_bytes = tokens[0].encode("utf-8", "surrogatepass")
    line_starts_with = float(zlib.crc32(first_token_bytes) % 10)  # bucket encoding

    features = [
        [
            # Slicing (not indexing) so an empty token does not raise IndexError.
            1.0 if token[:1].isupper() else 0.0,
            min(len(token), 10) / 16,  # normalized token length
            line_starts_with,
        ]
        for token in tokens
    ]
    return torch.tensor(features, dtype=torch.float32)


# Sanity check: one row per token, three feature columns.
demo_shape = make_extra_feats(["Sys", "print"]).shape
print(demo_shape)

# Inspect the actual feature values for a short token sequence.
demo_feats = make_extra_feats(["print", "(", ")"])
print(demo_feats)

torch.Size([2, 3])
tensor([[0.0000, 0.3125, 3.0000],
        [0.0000, 0.0625, 3.0000],
        [0.0000, 0.0625, 3.0000]])
