### Data prep

In [19]:
import numpy as np
import torch

np.random.seed(3)

In [2]:
UD_ENGLISH_TRAIN = 'en_partut-ud-train.conllu'
UD_ENGLISH_DEV = 'en_partut-ud-dev.conllu'
UD_ENGLISH_TEST = 'en_partut-ud-test.conllu'
embs_path = 'wiki-news-300d-1M.vec'

In [3]:
import pyconll

def read_conllu(path):
    data = pyconll.load_from_file(path)
    tagged_sentences=[]
    for sentence in data:
        tagged_sentence=[]
        for token in sentence:
            if token.upos:
                tagged_sentence.append((token.form.lower(), token.upos))
        tagged_sentences.append(tagged_sentence)
    return tagged_sentences

In [4]:
from gensim.models.keyedvectors import KeyedVectors

embeddings = KeyedVectors.load_word2vec_format('datasets/' + embs_path, binary=False)

train_sentences = read_conllu('datasets/' + UD_ENGLISH_TRAIN)
val_sentences = read_conllu('datasets/' + UD_ENGLISH_DEV)
test_sentences = read_conllu('datasets/' + UD_ENGLISH_TEST)

In [8]:
dim = embeddings.vectors.shape[1]
pad = np.zeros(dim)
oov =  np.random.uniform(-0.25, 0.25, dim) # out of vocabulary vector

def features_embs(sentence, index, window):
    vec = np.array([])
    for i in range(index-window,index+window+1):
        if i < 0:
            vec = np.append(vec, pad)
            continue
        if i > len(sentence) - 1:
            vec = np.append(vec, pad)
            continue
        try:
            vec = np.append(vec, embeddings[sentence[i]])
        except:
            vec = np.append(vec, oov)
    return vec

In [9]:
from utils import untag

def transform_to_dataset(tagged_sentences, window):
    X, y = [], []
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features_embs(untag(tagged), index, window))
            y.append(tagged[index][1])
    return X, y

def vectorize(train, val, test, window):
    X_train, y_train = transform_to_dataset(train, window)
    X_val, y_val = transform_to_dataset(val, window)   
    X_test, y_test = transform_to_dataset(test, window)
    
    return (
        np.asarray(X_train), 
        np.asarray(y_train), 
        np.asarray(X_val), 
        np.asarray(y_val), 
        np.asarray(X_test), 
        np.asarray(y_test)
    )

In [10]:
X_train, y_train, X_val, y_val, X_test, y_test = vectorize(train_sentences, val_sentences, test_sentences, 1)

X_train.shape

(43503, 900)

### Baseline tagger

In [11]:
import nltk
from sklearn.metrics import classification_report
from utils import tag_list, apply_tagger, tag_sequence

default_tagger = nltk.DefaultTagger('NN')
unigram_tagger = nltk.UnigramTagger(train_sentences+val_sentences, backoff=default_tagger)

y_test = [item for sublist in tag_sequence(test_sentences) for item in sublist]
y_pred = tag_list(apply_tagger(unigram_tagger, test_sentences))

print(classification_report(y_test, y_pred, digits=4, zero_division=np.nan))


              precision    recall  f1-score   support

         ADJ     0.8832    0.7803    0.8286       223
         ADP     0.9452    0.8832    0.9131       488
         ADV     0.9057    0.7500    0.8205       128
         AUX     0.9286    1.0000    0.9630       234
       CCONJ     1.0000    0.9896    0.9948        96
         DET     0.9661    0.9749    0.9705       439
        INTJ     1.0000    1.0000    1.0000         2
          NN     0.0000       nan    0.0000         0
        NOUN     0.9549    0.8143    0.8790       754
         NUM     0.9649    0.9016    0.9322        61
        PART     0.5323    1.0000    0.6947        66
        PRON     0.9333    0.7706    0.8442       109
       PROPN     0.8676    0.6556    0.7468        90
       PUNCT     1.0000    1.0000    1.0000       339
       SCONJ     0.6863    0.6863    0.6863        51
        VERB     0.9316    0.6687    0.7786       326
           X     1.0000    1.0000    1.0000         2

    accuracy              

### Our tagger

In [12]:
# Preprocessing
from sklearn import preprocessing
#from torch.nn.functional import one_hot

X_train, y_train, X_val, y_val, X_test, y_test = vectorize(train_sentences, val_sentences, test_sentences, 1)

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
y_val = le.transform(y_val)
#y_train = one_hot(torch.tensor(y_train).long())
#y_test = one_hot(torch.tensor(y_test).long())
#y_val = one_hot(torch.tensor(y_val).long())

In [17]:
import kan
from importlib import reload
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn as nn

reload(kan)

model = kan.KAN([X_train.shape[1], 64, y_train.max()+1])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)
criterion = nn.CrossEntropyLoss()

train_loader = DataLoader([(torch.from_numpy(x).to(torch.float), y) for x, y in zip(X_train, y_train)], batch_size=128, shuffle=True)
val_loader = DataLoader([(torch.from_numpy(x).to(torch.float), y) for x, y in zip(X_val, y_val)], batch_size=128, shuffle=True)
test_loader = DataLoader([(torch.from_numpy(x).to(torch.float), y) for x, y in zip(X_test, y_test)], batch_size=128, shuffle=True)

for epoch in range(10):
    model.train()
    with tqdm(train_loader) as pbar:
        for i, (x, y) in enumerate(pbar):
            x = x.view(-1, X_train.shape[1]).to(device)
            y = y.type(torch.LongTensor)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y.to(device))
            loss.backward()
            optimizer.step()
            accuracy = (output.argmax(dim=1) == y.to(device)).float().mean()
            pbar.set_postfix(loss=loss.item(), accuracy=accuracy.item(), lr=optimizer.param_groups[0]['lr'])

    model.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for x, y in val_loader:
            x = x.view(-1, X_train.shape[1]).to(device)
            y = y.type(torch.LongTensor)
            output = model(x)
            val_loss += criterion(output, y.to(device)).item()
            val_accuracy += ((output.argmax(dim=1) == y.to(device)).float().mean().item())
    val_loss /= len(val_loader)
    val_accuracy /= len(val_loader)

    scheduler.step()

    print(f"Epoch {epoch + 1}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}")

100%|██████████| 340/340 [00:02<00:00, 144.77it/s, accuracy=0.937, loss=0.309, lr=0.001]


Epoch 1, Val Loss: 0.377565783533183, Val Accuracy: 0.8844836218790575


100%|██████████| 340/340 [00:02<00:00, 133.35it/s, accuracy=0.91, loss=0.319, lr=0.0008]  


Epoch 2, Val Loss: 0.3028968261046843, Val Accuracy: 0.9104069173336029


100%|██████████| 340/340 [00:02<00:00, 168.57it/s, accuracy=0.91, loss=0.29, lr=0.00064]   


Epoch 3, Val Loss: 0.2383726266297427, Val Accuracy: 0.9262199212204326


100%|██████████| 340/340 [00:02<00:00, 160.13it/s, accuracy=0.973, loss=0.12, lr=0.000512]  


Epoch 4, Val Loss: 0.2345898984508081, Val Accuracy: 0.9236296794631265


100%|██████████| 340/340 [00:02<00:00, 113.90it/s, accuracy=0.955, loss=0.101, lr=0.00041] 


Epoch 5, Val Loss: 0.2193894230506637, Val Accuracy: 0.9281625991517847


100%|██████████| 340/340 [00:02<00:00, 140.53it/s, accuracy=0.973, loss=0.0916, lr=0.000328]


Epoch 6, Val Loss: 0.20534061843698675, Val Accuracy: 0.936685326424512


100%|██████████| 340/340 [00:02<00:00, 138.31it/s, accuracy=0.964, loss=0.141, lr=0.000262] 


Epoch 7, Val Loss: 0.192309402606704, Val Accuracy: 0.9435369318181818


100%|██████████| 340/340 [00:02<00:00, 144.85it/s, accuracy=0.964, loss=0.105, lr=0.00021] 


Epoch 8, Val Loss: 0.1989999105307189, Val Accuracy: 0.9413018036972393


100%|██████████| 340/340 [00:02<00:00, 137.61it/s, accuracy=0.991, loss=0.0454, lr=0.000168]


Epoch 9, Val Loss: 0.18485583348030393, Val Accuracy: 0.9424715909090909


100%|██████████| 340/340 [00:02<00:00, 133.22it/s, accuracy=0.964, loss=0.0997, lr=0.000134]


Epoch 10, Val Loss: 0.194587189365517, Val Accuracy: 0.9408631351861086


In [18]:
test_loader = DataLoader([(torch.from_numpy(x).to(torch.float), y) for x, y in zip(X_test, y_test)], batch_size=128, shuffle=True)

model.eval()
test_loss = 0
test_acc = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(-1, X_train.shape[1]).to(device)
        labels = labels.type(torch.LongTensor)
        output = model(images)
        test_loss += criterion(output, labels.to(device)).item()
        test_acc += (
            (output.argmax(dim=1) == labels.to(device)).float().mean().item()
        )
test_loss /= len(test_loader)
test_acc /= len(test_loader)

print(
    f"Test Loss: {val_loss}, Test Accuracy: {test_acc}"
)

Test Loss: 0.194587189365517, Test Accuracy: 0.9562500008830318
