# Training and Evaluating a POS Tagger

**Goal**
- assign POS tag to each word
    - x = first column of conll, y = second column of conll

**Plan**
- preprocess data
    - numerical classes bsed on unique POS tags
    - encoded strings (look at ways to do that)
- decide on model (have to be able to explain it!)
- train and evaluation loop
- pick appropriate metric (F1! precision? recall?)
- optional: create some nice plots, e.g.: confusion matrix, learning curve, precision-recall curve
- optional: analyse dataset (distribution of POS tages, most common words per POS tag, etc.)

**Model**
- decision tree: create features for each word (https://nlpforhackers.io/training-pos-tagger/)
- LSTM/RNN with word ids based on unique words --> study how LSTM/RNN network works!

if tokenization: use spcy en_core_web_sm

In [184]:
%load_ext autoreload
%autoreload 2

In [249]:
import pandas as pd
import numpy as np

import spacy
#from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#from gensim.models.keyedvectors import load_word2vec_format
#from gensim.models import FastText
from gensim.models import Word2Vec, KeyedVectors

from torch import nn

from utils import get_embedding

In [None]:
!python -m spacy download en_core_web_sm

## Preprocessing

### Create dataset

In [48]:
with open('train.txt') as f:
    train_data = f.readlines()

In [49]:
# split dataset into sentences

lines = list()
lines.append(list()) 
current_idx = 0

for string in train_data:
    if string == "\n":
        lines.append(list())
        current_idx += 1
    else:
        lines[current_idx].append(string)

In [100]:
words_per_line = []
for line in lines:
    words_per_line.append(len(line))
    
print(f"Avg. number of words per sentence: {np.mean(words_per_line):.1f}")
print(f"Total number of sentences: {len(lines)}")

Avg. number of words per sentence: 23.7
Total number of sentences: 8937


In [101]:
print(lines[1])

['Chancellor NNP O\n', 'of IN B-PP\n', 'the DT B-NP\n', 'Exchequer NNP I-NP\n', 'Nigel NNP B-NP\n', 'Lawson NNP I-NP\n', "'s POS B-NP\n", 'restated VBN I-NP\n', 'commitment NN I-NP\n', 'to TO B-PP\n', 'a DT B-NP\n', 'firm NN I-NP\n', 'monetary JJ I-NP\n', 'policy NN I-NP\n', 'has VBZ B-VP\n', 'helped VBN I-VP\n', 'to TO I-VP\n', 'prevent VB I-VP\n', 'a DT B-NP\n', 'freefall NN I-NP\n', 'in IN B-PP\n', 'sterling NN B-NP\n', 'over IN B-PP\n', 'the DT B-NP\n', 'past JJ I-NP\n', 'week NN I-NP\n', '. . O\n']


In [26]:
# for each sentence, extract each word and corresponding POS tag

text = list()
target = list()

for line in lines:
    words = list()
    pos_tags = list()
    for string in line:
        word, pos, _ = string.split()
        words.append(word)
        pos_tags.append(pos)
    text.append(words)
    target.append(pos_tags)

In [34]:
df = pd.DataFrame(data={"text": text, "target": target})

In [36]:
df.head(10)

Unnamed: 0,text,target
0,"[Confidence, in, the, pound, is, widely, expec...","[NN, IN, DT, NN, VBZ, RB, VBN, TO, VB, DT, JJ,..."
1,"[Chancellor, of, the, Exchequer, Nigel, Lawson...","[NNP, IN, DT, NNP, NNP, NNP, POS, VBN, NN, TO,..."
2,"[But, analysts, reckon, underlying, support, f...","[CC, NNS, VBP, VBG, NN, IN, NN, VBZ, VBN, VBN,..."
3,"[This, has, increased, the, risk, of, the, gov...","[DT, VBZ, VBN, DT, NN, IN, DT, NN, VBG, VBN, T..."
4,"[``, The, risks, for, sterling, of, a, bad, tr...","[``, DT, NNS, IN, NN, IN, DT, JJ, NN, NN, VBP,..."
5,"[``, If, there, is, another, bad, trade, numbe...","[``, IN, EX, VBZ, DT, JJ, NN, NN, ,, EX, MD, V..."
6,"[Forecasts, for, the, trade, figures, range, w...","[NNS, IN, DT, NN, NNS, VBP, RB, ,, CC, JJ, NNS..."
7,"[The, August, deficit, and, the, #, 2.2, billi...","[DT, NNP, NN, CC, DT, #, CD, CD, NN, VBN, IN, ..."
8,"[Sanjay, Joshi, ,, European, economist, at, Ba...","[NNP, NNP, ,, JJ, NN, IN, NNP, NNPS, CC, NNP, ..."
9,"[At, the, same, time, ,, he, remains, fairly, ...","[IN, DT, JJ, NN, ,, PRP, VBZ, RB, JJ, IN, DT, ..."


In [57]:
# check if length of X and Y are the same for each sample

for idx in range(len(df)):
    if len(df["text"].iloc[idx]) != len(df["target"].iloc[idx]):
        print(idx)

In [102]:
#for idx in [13, 20]: 
#    text = df["text"].iloc[idx]
#    pos = df["target"].iloc[idx]
#    for a, b in zip(text, pos):
#        print(a, b)

### Targets

In [75]:
# create POS encodings
unique_pos_tags = set()
for idx in range(len(df)):
    for tag in df["target"].iloc[idx]:
        if tag not in unique_pos_tags:
            unique_pos_tags.add(tag)

print(unique_pos_tags)

{'DT', 'RBS', 'TO', 'VBG', 'MD', 'POS', ',', 'PRP$', '#', 'SYM', 'PRP', 'UH', 'VBP', 'EX', '(', 'VBN', 'NNP', 'CC', 'WP', 'NNS', 'JJR', 'WRB', '.', 'RP', 'FW', 'WP$', 'CD', 'IN', 'NN', 'JJS', ':', '$', 'RBR', "''", 'RB', 'NNPS', 'WDT', ')', '``', 'VBD', 'VB', 'PDT', 'JJ', 'VBZ'}


In [88]:
pos2value = dict()
for idx, tag in enumerate(unique_pos_tags):
    pos2value[tag] = idx

print(pos2value)

{'DT': 0, 'RBS': 1, 'TO': 2, 'VBG': 3, 'MD': 4, 'POS': 5, ',': 6, 'PRP$': 7, '#': 8, 'SYM': 9, 'PRP': 10, 'UH': 11, 'VBP': 12, 'EX': 13, '(': 14, 'VBN': 15, 'NNP': 16, 'CC': 17, 'WP': 18, 'NNS': 19, 'JJR': 20, 'WRB': 21, '.': 22, 'RP': 23, 'FW': 24, 'WP$': 25, 'CD': 26, 'IN': 27, 'NN': 28, 'JJS': 29, ':': 30, '$': 31, 'RBR': 32, "''": 33, 'RB': 34, 'NNPS': 35, 'WDT': 36, ')': 37, '``': 38, 'VBD': 39, 'VB': 40, 'PDT': 41, 'JJ': 42, 'VBZ': 43}


In [136]:
targets_encoded = list()
for idx in range(len(df)):
    target_encoded = list()
    for tag in df["target"].iloc[idx]:
        target_encoded.append(pos2value[tag])
    targets_encoded.append(target_encoded)

In [137]:
df["target_encoded"] = targets_encoded
df.sample(3)

Unnamed: 0,text,target,text_encoded,target_encoded
8680,"[The, company, closed, at, $, 12, a, share, ,,...","[DT, NN, VBD, IN, $, CD, DT, NN, ,, RB, CD, NN...","[1, 38, 300, 15, 228, 4, 47, 141, 105, 5, 427,...","[0, 28, 39, 27, 31, 26, 0, 28, 6, 34, 26, 19, ..."
3634,"[They, succeed, Robert, W., Kasten, and, John,...","[PRP, VBP, NNP, NNP, NNP, CC, NNP, NNP, NNP, ,...","[37, 2643, 839, 11080, 6, 527, 4721, 63, 900, ...","[10, 12, 16, 16, 16, 17, 16, 16, 16, 6, 18, 39..."
4341,"[``, All, these, interconnected, computers, ma...","[``, DT, DT, VBN, NNS, VBP, PRP, JJ, TO, VB, I...","[74, 180, 11681, 437, 162, 13, 884, 3, 1082, 6...","[38, 0, 0, 15, 19, 12, 10, 42, 2, 40, 27, 18, ..."


### Inputs

In [166]:
# keras

# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(text_combined)
# text_encoded = tokenizer.texts_to_sequences(text)
# df["text_encoded_keras"] = text_encoded

In [212]:
text = list()
for idx in range(len(df)):
    text.append([word.lower() for word in df["text"].iloc[idx]])
text_combined = [word for sentence in text for word in sentence]

In [214]:
df["text_lower"] = text

In [195]:
# create vocabulary based on unique words and IDs

unique_words = set()
for idx in range(len(df)):
    for word in df["text"].iloc[idx]:
        if word.lower() not in unique_words:
            unique_words.add(word.lower())

print(len(unique_tokens))

word2value = dict()
for idx, word in enumerate(unique_words):
    word2value[word] = idx

17258


In [164]:
texts_encoded = list()
for idx in range(len(df)):
    text_encoded = list()
    for word in df["text"].iloc[idx]:
        text_encoded.append(word2value[word.lower()])
    texts_encoded.append(text_encoded)

In [165]:
df["text_encoded"] = texts_encoded

In [174]:
sentence_lengths = [len(df["text"].iloc[idx]) for idx in range(len(df))]
max_seq_length = max(sentence_lengths)
print("Median sentence length", np.median(sentence_lengths))
print("Max sentence length", max_seq_length)

Median sentence length 23.0
Max sentence length 78


In [180]:
texts_padded = pad_sequences(df["text_encoded"], maxlen=max_seq_length, padding='post')
texts_padded = [text.tolist() for text in texts_padded]

78

In [181]:
df["text_padded"] = texts_padded

In [215]:
df.head(3)

Unnamed: 0,text,target,text_encoded,target_encoded,text_padded,text_lower
0,"[Confidence, in, the, pound, is, widely, expec...","[NN, IN, DT, NN, VBZ, RB, VBN, TO, VB, DT, JJ,...","[7194, 2731, 15113, 10265, 2980, 15351, 8589, ...","[28, 27, 0, 28, 43, 34, 15, 2, 40, 0, 42, 28, ...","[7194, 2731, 15113, 10265, 2980, 15351, 8589, ...","[confidence, in, the, pound, is, widely, expec..."
1,"[Chancellor, of, the, Exchequer, Nigel, Lawson...","[NNP, IN, DT, NNP, NNP, NNP, POS, VBN, NN, TO,...","[2105, 11904, 15113, 211, 9094, 12921, 12443, ...","[16, 27, 0, 16, 16, 16, 5, 15, 28, 2, 0, 28, 4...","[2105, 11904, 15113, 211, 9094, 12921, 12443, ...","[chancellor, of, the, exchequer, nigel, lawson..."
2,"[But, analysts, reckon, underlying, support, f...","[CC, NNS, VBP, VBG, NN, IN, NN, VBZ, VBN, VBN,...","[3473, 9855, 6566, 14313, 14610, 16233, 995, 1...","[17, 19, 12, 3, 28, 27, 28, 43, 15, 15, 27, 0,...","[3473, 9855, 6566, 14313, 14610, 16233, 995, 1...","[but, analysts, reckon, underlying, support, f..."


In [193]:
word2vec = load_word2vec_format("/home/hkortschak/Repositories/commonlit_kaggle/xund/GoogleNews-vectors-negative300.bin", binary=True)

In [194]:
embedding_size = 300
vocabulary_size = len(word2value) + 1

In [199]:
embedding_weights = np.zeros((vocabulary_size, embedding_size))
count = 0
for word, idx in word2value.items():
    try: 
        embedding_weights[idx] = word2vec[word]
    except KeyError:
        count += 1
        # print(word)
print(count)

4808


In [220]:
word2vec_model = Word2Vec(sentences=df["text_lower"], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv
# word_vectors.save("word2vec.wordvectors")

In [233]:
vector = word2vec_model.wv['man']  # get numpy vector of a word
sims = word2vec_model.wv.most_similar('man', topn=10)  # get other similar words
print(sims)

[('peters', 0.9969028830528259), ('she', 0.9968954920768738), ('suit', 0.9968575835227966), ('himself', 0.9968065023422241), ('bush', 0.9967989921569824), ('decision', 0.9967637658119202), ('saw', 0.9967111349105835), ('great', 0.9965247511863708), ('deloitte', 0.9962803721427917), ('whole', 0.9959662556648254)]


In [236]:
word_vectors = KeyedVectors.load("word2vec.wordvectors", mmap='r')

KeyError: "Key '17258' not present"

### Model

In [250]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, word_vectors):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(num_embeddings=len(word_vectors), embedding_dim=100, padding_idx=0)
        self.embedding = nn.Embedding.from_pretrained(word_vectors)
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers)   
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        
        output, hidden = self.rnn(x)
        # getting output from the hidden state
        output = output.view(-1, self.hidden_dim)
        output = self.fc(output)

        return output, hidden