# Training and Evaluating a POS Tagger

**Goal**
- assign POS tag to each word
    - x = first column of conll, y = second column of conll

**Plan**
- preprocess data
    - numerical classes bsed on unique POS tags
    - encoded strings (look at ways to do that)
- decide on model (have to be able to explain it!)
- train and evaluation loop
- pick appropriate metric (F1! precision? recall?)
- optional: create some nice plots, e.g.: confusion matrix, learning curve, precision-recall curve
- optional: analyse dataset (distribution of POS tages, most common words per POS tag, etc.)

**Model**
- decision tree: create features for each word (https://nlpforhackers.io/training-pos-tagger/)
- LSTM/RNN with word ids based on unique words --> study how LSTM/RNN network works!

if tokenization: use spcy en_core_web_sm

In [410]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [518]:
import pandas as pd
import numpy as np

import spacy
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#from gensim.models.keyedvectors import load_word2vec_format
#from gensim.models import FastText
from gensim.models import Word2Vec, KeyedVectors

import torch
from torch import nn
from sklearn.metrics import f1_score

from utils import POSDataset, Model, train_eval

In [None]:
!python -m spacy download en_core_web_sm

## Preprocessing

### Create dataset

In [413]:
with open('train.txt') as f:
    train_data = f.readlines()

In [414]:
# split dataset into sentences

lines = list()
lines.append(list()) 
current_idx = 0

for string in train_data:
    if string == "\n":
        lines.append(list())
        current_idx += 1
    else:
        lines[current_idx].append(string)

In [415]:
words_per_line = []
for line in lines:
    words_per_line.append(len(line))
    
print(f"Avg. number of words per sentence: {np.mean(words_per_line):.1f}")
print(f"Total number of sentences: {len(lines)}")

Avg. number of words per sentence: 23.7
Total number of sentences: 8937


In [416]:
print(lines[1])

['Chancellor NNP O\n', 'of IN B-PP\n', 'the DT B-NP\n', 'Exchequer NNP I-NP\n', 'Nigel NNP B-NP\n', 'Lawson NNP I-NP\n', "'s POS B-NP\n", 'restated VBN I-NP\n', 'commitment NN I-NP\n', 'to TO B-PP\n', 'a DT B-NP\n', 'firm NN I-NP\n', 'monetary JJ I-NP\n', 'policy NN I-NP\n', 'has VBZ B-VP\n', 'helped VBN I-VP\n', 'to TO I-VP\n', 'prevent VB I-VP\n', 'a DT B-NP\n', 'freefall NN I-NP\n', 'in IN B-PP\n', 'sterling NN B-NP\n', 'over IN B-PP\n', 'the DT B-NP\n', 'past JJ I-NP\n', 'week NN I-NP\n', '. . O\n']


In [417]:
# for each sentence, extract each word and corresponding POS tag

text = list()
target = list()

for line in lines:
    words = list()
    pos_tags = list()
    for string in line:
        word, pos, _ = string.split()
        words.append(word)
        pos_tags.append(pos)
    text.append(words)
    target.append(pos_tags)

In [418]:
df = pd.DataFrame(data={"text": text, "target": target})

In [420]:
df.head()

Unnamed: 0,text,target
0,"[Confidence, in, the, pound, is, widely, expec...","[NN, IN, DT, NN, VBZ, RB, VBN, TO, VB, DT, JJ,..."
1,"[Chancellor, of, the, Exchequer, Nigel, Lawson...","[NNP, IN, DT, NNP, NNP, NNP, POS, VBN, NN, TO,..."
2,"[But, analysts, reckon, underlying, support, f...","[CC, NNS, VBP, VBG, NN, IN, NN, VBZ, VBN, VBN,..."
3,"[This, has, increased, the, risk, of, the, gov...","[DT, VBZ, VBN, DT, NN, IN, DT, NN, VBG, VBN, T..."
4,"[``, The, risks, for, sterling, of, a, bad, tr...","[``, DT, NNS, IN, NN, IN, DT, JJ, NN, NN, VBP,..."


In [421]:
# check if length of X and Y are the same for each sample

for idx in range(len(df)):
    if len(df["text"].iloc[idx]) != len(df["target"].iloc[idx]):
        print(idx)

### Targets

In [422]:
# create POS encodings
unique_pos_tags = set()
for idx in range(len(df)):
    for tag in df["target"].iloc[idx]:
        if tag not in unique_pos_tags:
            unique_pos_tags.add(tag)

print(unique_pos_tags)

{'DT', 'RBS', 'TO', 'VBG', 'MD', 'POS', ',', 'PRP$', '#', 'SYM', 'PRP', 'UH', 'VBP', 'EX', '(', 'VBN', 'NNP', 'CC', 'WP', 'NNS', 'JJR', 'WRB', '.', 'RP', 'FW', 'WP$', 'CD', 'IN', 'NN', 'JJS', ':', '$', 'RBR', "''", 'RB', 'NNPS', 'WDT', ')', '``', 'VBD', 'VB', 'PDT', 'JJ', 'VBZ'}


In [425]:
pos2value = dict()
pos2value[0] = 0
for idx, tag in enumerate(unique_pos_tags):
    pos2value[tag] = idx + 1  # because 0 will be padding token in both text and target

value2pos = {v: k for k, v in pos2value.items()}

print(pos2value)

{0: 0, 'DT': 1, 'RBS': 2, 'TO': 3, 'VBG': 4, 'MD': 5, 'POS': 6, ',': 7, 'PRP$': 8, '#': 9, 'SYM': 10, 'PRP': 11, 'UH': 12, 'VBP': 13, 'EX': 14, '(': 15, 'VBN': 16, 'NNP': 17, 'CC': 18, 'WP': 19, 'NNS': 20, 'JJR': 21, 'WRB': 22, '.': 23, 'RP': 24, 'FW': 25, 'WP$': 26, 'CD': 27, 'IN': 28, 'NN': 29, 'JJS': 30, ':': 31, '$': 32, 'RBR': 33, "''": 34, 'RB': 35, 'NNPS': 36, 'WDT': 37, ')': 38, '``': 39, 'VBD': 40, 'VB': 41, 'PDT': 42, 'JJ': 43, 'VBZ': 44}


In [426]:
targets_encoded = list()
for idx in range(len(df)):
    target_encoded = list()
    for tag in df["target"].iloc[idx]:
        target_encoded.append(pos2value[tag])
    targets_encoded.append(target_encoded)

In [428]:
targets_padded = pad_sequences(targets_encoded, maxlen=max_seq_length, padding='post')
targets_padded = [target.tolist() for target in targets_padded]

In [432]:
print(len(targets_encoded[0]))
print(len(targets_padded[0]))

37
78


In [433]:
df["target_encoded"] = targets_encoded
df["target_padded"] = targets_padded

### Inputs

#### 1 - Tokenizing + Vocabulary Keras

In [435]:
tokenizer = Tokenizer(filters="")
tokenizer.fit_on_texts(text_combined)
text_encoded = tokenizer.texts_to_sequences(text)
word2value = tokenizer.word_index
value2word = {v: k for k, v in word2value.items()}

In [303]:
# tokenizer.word_counts

OrderedDict([('confidence', 15),
             ('in', 3807),
             ('the', 10714),
             ('pound', 14),
             ('is', 1422),
             ('widely', 22),
             ('expected', 114),
             ('to', 5080),
             ('take', 105),
             ('another', 89),
             ('sharp', 21),
             ('dive', 3),
             ('if', 275),
             ('trade', 51),
             ('figures', 44),
             ('for', 1981),
             ('september', 53),
             (',', 10770),
             ('due', 88),
             ('release', 13),
             ('tomorrow', 8),
             ('fail', 4),
             ('show', 42),
             ('a', 4487),
             ('substantial', 18),
             ('improvement', 15),
             ('from', 1058),
             ('july', 34),
             ('and', 3702),
             ('august', 45),
             ("'s", 1918),
             ('near-record', 2),
             ('deficits', 1),
             ('.', 8725),
             ('chancell

In [437]:
sentence_lengths = [len(df["text"].iloc[idx]) for idx in range(len(df))]
max_seq_length = max(sentence_lengths)
print("Median sentence length", np.median(sentence_lengths))
print("Max sentence length", max_seq_length)

Median sentence length 23.0
Max sentence length 78


In [438]:
texts_padded = pad_sequences(text_encoded, maxlen=max_seq_length, padding='post')
texts_padded = [text.tolist() for text in texts_padded]

In [439]:
df["text_encoded"] = text_encoded
df["text_padded"] = texts_padded

In [440]:
print(len(text_encoded[0]))
print(len(texts_padded[0]))

37
78


In [441]:
for idx in range(len(df)):
    if len(df["text_padded"].iloc[idx]) != len(df["target_padded"].iloc[idx]):
        print(idx)

In [442]:
df.head()

Unnamed: 0,text,target,target_encoded,target_padded,text_encoded,text_padded
0,"[Confidence, in, the, pound, is, widely, expec...","[NN, IN, DT, NN, VBZ, RB, VBN, TO, VB, DT, JJ,...","[29, 28, 1, 29, 44, 35, 16, 3, 41, 1, 43, 29, ...","[29, 28, 1, 29, 44, 35, 16, 3, 41, 1, 43, 29, ...","[1536, 7, 2, 1637, 15, 1106, 184, 5, 207, 246,...","[1536, 7, 2, 1637, 15, 1106, 184, 5, 207, 246,..."
1,"[Chancellor, of, the, Exchequer, Nigel, Lawson...","[NNP, IN, DT, NNP, NNP, NNP, POS, VBN, NN, TO,...","[17, 28, 1, 17, 17, 17, 6, 16, 29, 3, 1, 29, 4...","[17, 28, 1, 17, 17, 17, 6, 16, 29, 3, 1, 29, 4...","[1751, 4, 2, 4172, 4173, 1638, 10, 3568, 2350,...","[1751, 4, 2, 4172, 4173, 1638, 10, 3568, 2350,..."
2,"[But, analysts, reckon, underlying, support, f...","[CC, NNS, VBP, VBG, NN, IN, NN, VBZ, VBN, VBN,...","[18, 20, 13, 4, 29, 28, 29, 44, 16, 16, 28, 1,...","[18, 20, 13, 4, 29, 28, 29, 44, 16, 16, 28, 1,...","[31, 195, 8937, 1891, 474, 9, 1456, 36, 64, 49...","[31, 195, 8937, 1891, 474, 9, 1456, 36, 64, 49..."
3,"[This, has, increased, the, risk, of, the, gov...","[DT, VBZ, VBN, DT, NN, IN, DT, NN, VBG, VBN, T...","[1, 44, 16, 1, 29, 28, 1, 29, 4, 16, 3, 41, 29...","[1, 44, 16, 1, 29, 28, 1, 29, 4, 16, 3, 41, 29...","[42, 36, 234, 2, 462, 4, 2, 122, 216, 1147, 5,...","[42, 36, 234, 2, 462, 4, 2, 122, 216, 1147, 5,..."
4,"[``, The, risks, for, sterling, of, a, bad, tr...","[``, DT, NNS, IN, NN, IN, DT, JJ, NN, NN, VBP,...","[39, 1, 20, 28, 29, 28, 1, 43, 29, 29, 13, 35,...","[39, 1, 20, 28, 29, 28, 1, 43, 29, 29, 13, 35,...","[13, 2, 1538, 9, 1456, 4, 6, 948, 481, 912, 28...","[13, 2, 1538, 9, 1456, 4, 6, 948, 481, 912, 28..."


#### 2 - Tokenizing + Vocabulary Custom

In [212]:
text = list()
for idx in range(len(df)):
    text.append([word.lower() for word in df["text"].iloc[idx]])
text_combined = [word for sentence in text for word in sentence]

In [214]:
df["text_lower"] = text

In [195]:
# create vocabulary based on unique words and IDs

unique_words = set()
for idx in range(len(df)):
    for word in df["text"].iloc[idx]:
        if word.lower() not in unique_words:
            unique_words.add(word.lower())

print(len(unique_tokens))

word2value = dict()
for idx, word in enumerate(unique_words):
    word2value[word] = idx

17258


In [164]:
texts_encoded = list()
for idx in range(len(df)):
    text_encoded = list()
    for word in df["text"].iloc[idx]:
        text_encoded.append(word2value[word.lower()])
    texts_encoded.append(text_encoded)

In [165]:
df["text_encoded"] = texts_encoded

#### Word Embeddings

##### 1 - downloaded word2vec

In [296]:
word2vec = load_word2vec_format("/home/hkortschak/Repositories/commonlit_kaggle/xund/GoogleNews-vectors-negative300.bin", binary=True)

In [297]:
embedding_size = 300
vocabulary_size = len(word2value) + 1

In [308]:
embedding_weights = np.zeros((vocabulary_size, embedding_size))
count = 0
for word, idx in word2value.items():
    try: 
        embedding_weights[idx] = word2vec[word]
    except KeyError:
        count += 1
        # print(word, tokenizer.word_counts[word])
print(count)

4808


In [309]:
embedding_weights = torch.tensor(embedding_weights)

##### 2 - gensim word2vec

In [220]:
word2vec_model = Word2Vec(sentences=df["text_lower"], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv
# word_vectors.save("word2vec.wordvectors")

In [233]:
vector = word2vec_model.wv['man']  # get numpy vector of a word
sims = word2vec_model.wv.most_similar('man', topn=10)  # get other similar words
print(sims)

[('peters', 0.9969028830528259), ('she', 0.9968954920768738), ('suit', 0.9968575835227966), ('himself', 0.9968065023422241), ('bush', 0.9967989921569824), ('decision', 0.9967637658119202), ('saw', 0.9967111349105835), ('great', 0.9965247511863708), ('deloitte', 0.9962803721427917), ('whole', 0.9959662556648254)]


In [236]:
word_vectors = KeyedVectors.load("word2vec.wordvectors", mmap='r')

### Split dataset

In [443]:
len(df)

8937

In [445]:
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
num_train_samples = int(len(df) * 0.9) + 1
num_valid_samples = int(len(df) * 0.1)
print(num_train_samples, num_valid_samples)

8044 893


In [446]:
train_df = df[:num_train_samples]
valid_df = df[num_train_samples:]

In [516]:
train_X = train_df["text_padded"]
train_Y = train_df["target_padded"]
train_Y_unpadded = train_df["target_encoded"]

valid_X = valid_df["text_padded"]
valid_Y = valid_df["target_padded"]
valid_Y_unpadded = valid_df["target_encoded"]

### Data Loader

In [517]:
train_ds = POSDataset(train_X, train_Y, train_Y_unpadded)
valid_ds = POSDataset(valid_X, valid_Y, valid_Y_unpadded)

In [498]:
train_dl = DataLoader(train_ds, batch_size=12)
valid_dl = DataLoader(valid_ds, batch_size=12)

### Model

#### Try out model

In [481]:
model = Model(
    input_size=max_seq_length, 
    output_size=len(pos2value), 
    hidden_dim=64, 
    n_layers=1, 
    embedding_weights=embedding_weights, 
    embedding_size=300
).double()

In [482]:
print(train_df["text"].iloc[1])
input_ = train_df["text_padded"].iloc[1] # (max_seq_length)
input_ = torch.tensor(input_, dtype=torch.long).unsqueeze(0)
print(input_.shape)
print(input_.dtype)

['Although', 'air-traffic', 'delays', 'in', 'San', 'Francisco', 'were', 'significant', 'yesterday', ',', 'they', 'did', "n't", 'appear', 'to', 'spread', 'to', 'other', 'airports', '.']
torch.Size([1, 78])
torch.int64


In [483]:
output = model(input_)

Input torch.Size([1, 78])
Embedding torch.Size([1, 78, 300])
RNN Output torch.Size([1, 78, 64])
TDD Input torch.Size([1, 1, 78, 64])
TDD Output torch.Size([1, 45, 78, 1])
Return torch.Size([1, 45, 78])


In [484]:
cls_output = torch.argmax(output, dim=1).squeeze(0)
print(cls_output.shape)

torch.Size([78])


In [489]:
print(train_df["target_padded"].iloc[1])
print(cls_output)

[28, 29, 20, 28, 17, 17, 40, 43, 29, 7, 11, 40, 35, 41, 3, 41, 3, 43, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
tensor([42, 42, 37, 15, 15,  7, 15, 29, 37, 40, 42, 15, 17, 15, 40, 38, 40, 37,
        15, 15, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37,
        37, 37, 37, 37, 37, 37])


In [515]:
classes = list(value2pos.keys())

target = np.array(train_df["target_encoded"].iloc[1])
pred = np.array(cls_output[:len(target)])

print(target)
print(pred)

f1 = f1_score(target, pred, labels=classes, average="micro")
print(f1)
#for cls in classes[1:]: # don't take adding class into account
#    target_cls = np.where(target == cls, target, 0)
#    pred_cls = np.where(pred == cls, pred, 0)
#    #print(target_cls)
#    #print(pred_cls)
#    f1 = f1_score(target_cls, pred_cls)
#    print(f1)
##f1_score(y_true, y_pred, *, labels=None, pos_label=1, average='binary', 

[28 29 20 28 17 17 40 43 29  7 11 40 35 41  3 41  3 43 20 23]
[42 42 37 15 15  7 15 29 37 40 42 15 17 15 40 38 40 37 15 15]
0.0


### Training Loop

In [526]:
torch.cuda.is_available()
device = torch.device("cuda")
torch.tensor([1, 2, 3]).to(device)

RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [519]:
train_eval(
    train_dataloader=train_dl, 
    valid_dataloader=valid_dl, 
    classes=classes,
    model=model,
    num_epochs=5,
    learning_rate=0.001,
    weight_decay=1e-8,
    device=torch.device("cuda")
)

RuntimeError: CUDA error: all CUDA-capable devices are busy or unavailable
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.