# Get stuff from github

### Checkout repository

In [1]:
!if [ ! -d "SwissTagger" ] && [ "$(pwd)" != "/content/SwissTagger" ]; then git clone https://github.com/MethDamon/SwissTagger.git; fi
%cd SwissTagger

Cloning into 'SwissTagger'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (117/117), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 117 (delta 69), reused 87 (delta 41), pack-reused 0[K
Receiving objects: 100% (117/117), 1002.65 KiB | 9.37 MiB/s, done.
Resolving deltas: 100% (69/69), done.
/content/SwissTagger


### Switch to `dev` branch

In [2]:
!git checkout dev

Branch 'dev' set up to track remote branch 'dev' from 'origin'.
Switched to a new branch 'dev'


# $BiLSTM$

### `preprocessing.py`

In [0]:
import xml.etree.ElementTree as ET
import os


def select_files_in_folder(directory, ext):
    for file in os.listdir(directory):
        if file.endswith('.%s' % ext):
            yield os.path.join(directory, file)


def parse_xml_data():
    articles = []
    for file in select_files_in_folder('data', 'xml'):
        tree = ET.parse(file)
        root = tree.getroot()
        print('READING %s ...' % file)
        for article in root:
            sentences = {}
            for sentence in article:
                sentences[sentence.attrib['n']] = {'words': []}
                for tag in sentence:
                    datapoint = {'n': tag.attrib['n'], 'pos': tag.attrib['pos'],
                                 'verified': tag.attrib['verified'], 'word': tag.text}
                    sentences[sentence.attrib['n']]['words'].append(datapoint)
            articles.append(sentences)
    return articles


## BiLSTM with lowercasing

### data exploration

In [9]:
data = prepr.parse_xml_data()
n_of_articles = len(data)
print("number of articles:", n_of_articles)

n_of_sentences = 0
words = []
tags = []
for article in data:
  for sentence in article:
    n_of_sentences += 1
    for word in article[sentence]['words']:
      words.append(word['word'])
      tags.append(word['pos'])
      
print("number of sentences:", n_of_sentences)
n_of_words = len(words)
print("number of words:", n_of_words)

n_of_words_unique = len(set(words))
n_of_tags_unique = len(set(tags))
print("number of unique words:", n_of_words_unique)
print("number of unique tags:", n_of_tags_unique)

words_lower = [w.lower() for w in words]
n_of_words_unique_lower = len(set(words_lower))
print("number of unique words (lowercased):", n_of_words_unique_lower)

READING data/wiki.xml ...
READING data/blogs.xml ...
READING data/schobinger.xml ...
READING data/blick.xml ...
READING data/swatch.xml ...
number of articles: 55
number of sentences: 7327
number of words: 113565
number of unique words: 23806
number of unique tags: 86
number of unique words (lowercased): 22526


### `main.py`

In [0]:
# remove logs from previous run
!rm -rf log
!mkdir log

In [18]:
# get ngrok CV
! wget -O ngrok-stable-linux-amd64.zip https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip -o ngrok-stable-linux-amd64.zip

# setup ngrok
LOG_DIR = './log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

get_ipython().system_raw('./ngrok http 6006 &')

! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"



import keras.utils
import numpy as np
from keras import backend as K
from keras.layers import Dense
from keras.layers import LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, Dropout, Flatten
from keras.models import Sequential
from keras.optimizers import Adam, SGD
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard

import preprocessing as prepr

# check whether we're running on CPU/TPU or GPU
# on colab: change device in 'Runtime' → 'Change runtime type' → 'Hardware accelerator'
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print("Running on CPU or TPU.")
else:
    print("Running on GPU.")
print("---")


# keep results deterministic (results on colab may differ between runtime resets)
# https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


test_size = 0.1

data = prepr.parse_xml_data()

import random
#random.shuffle(data)


# Split
n_of_articles = len(data)
n_of_test_articles = int(n_of_articles * test_size)
test_articles = data[(n_of_articles - n_of_test_articles):]
train_articles = data[:(n_of_articles - n_of_test_articles)]


train_sentences = []
test_sentences = []
train_tags = []
test_tags = []


for article in train_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        train_sentences.append(np.array(words))
        train_tags.append(np.array(tags))
        
        
for article in test_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        test_sentences.append(np.array(words))
        test_tags.append(np.array(tags))

# print('Training sentences:')
# print(train_sentences[0])
print('Number of training sentences: %d' % len(train_sentences))
# print('Test sentences:')
# print(test_sentences[0])
print('Number of testing sentences: %d' % len(test_sentences))
# print('Training tags:')
# print(train_tags[0])
print('Number of training tags: %d' % len(train_tags))
# print('Testing tags')
# print(test_tags[0])
print('Number of testing tags: %d' % len(test_tags))

unique_words, unique_tags = set([]), set([])


for s in train_sentences:
    for w in s:
        unique_words.add(w.lower())

for ts in train_tags:
    for t in ts:
        unique_tags.add(t)

for s in test_sentences:
    for w in s:
        unique_words.add(w.lower())

for ts in test_tags:
    for t in ts:
        unique_tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(unique_words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(list(unique_tags))}
tag2index['-PAD-'] = 0  # The special value used to padding
print('tag2index:', tag2index)

index2tag = {v: k for k, v in tag2index.items()}


train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

# print(train_sentences_X[0])
# print(test_sentences_X[0])
# print(train_tags_y[0])
# print(test_tags_y[0])

MAX_LENGTH = len(max(train_sentences_X, key=len))
print("Max sequence length:", MAX_LENGTH)  # Should be 156

train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

# print(train_sentences_X[0])
# print(test_sentences_X[0])
# print(train_tags_y[0])
# print(test_tags_y[0])

num_words=len(word2index)
num_tags=len(tag2index)
print("number of words:", num_words)
print("number of tags:", num_tags)

################################################################################
# LSTM #########################################################################
################################################################################
#'''
model = Sequential()
#model.add(InputLayer(input_shape=(MAX_LENGTH,)))
model.add(Embedding(num_words,
                    512,
                    mask_zero=True,
                    #input_length=MAX_LENGTH
                   ))
model.add(Bidirectional(LSTM(512,
                             return_sequences=True,
                             recurrent_regularizer=keras.regularizers.l1_l2(0.2, 0.2),
                             unit_forget_bias=True,
                             dropout=0.5,
                             recurrent_dropout=0.5)))
model.add(TimeDistributed(Dense(num_tags)))
model.add(Activation('softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

model.summary()
#'''
################################################################################

################################################################################
# MLP ##########################################################################
################################################################################
'''
model = Sequential()
# Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape:
# here, the size of the longest sentence.
#model.add(InputLayer(input_shape=(MAX_LENGTH,)))
model.add(Embedding(num_words,                       # embedding layer
                    128,
                    mask_zero=True,
                    #input_length=MAX_LENGTH
                   ))
model.add(Dense(128, activation='relu', input_dim=MAX_LENGTH))             # input layer
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))             # hidden layer 1
model.add(Dropout(0.5))
#model.add(Flatten())
model.add(Dense(num_tags, activation='softmax'))     # output layer
#model.add(Activation('softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

model.summary()
'''
################################################################################


categorical_tags_y = keras.utils.to_categorical(train_tags_y, len(tag2index))
#categorical_words_X = keras.utils.to_categorical(train_sentences_X, len(word2index))


from datetime import datetime
from pytz import timezone
tensorboard = TensorBoard(log_dir="log/{}".format(datetime.now(timezone('Europe/Zurich')).strftime("%Y-%m-%d %H:%M:%S")), update_freq='batch')
# update_freq is either 'batch', 'epoch', or integer i to write after every i samples

history = model.fit(train_sentences_X,
                    categorical_tags_y,
                    batch_size=512,
                    epochs=30,
                    validation_split=0.1,
                    callbacks=[tensorboard])
scores = model.evaluate(test_sentences_X, keras.utils.to_categorical(test_tags_y, len(tag2index)))
for i, name in enumerate(model.metrics_names):
    print("%s: %s" % (name, 100 * scores[i]))

test_samples = [
    "Ich bin scho recht gspannt was passiert".split(),
    "Mer hend jetzt es Model trainiert und es isch ziemli guet worde.".split()
]

test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

  
def prediction_to_tag(tag_prediction_probabilities):
  index_of_max = tag_prediction_probabilities.argmax()
  return index2tag[index_of_max]
  
  
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

predictions = model.predict(test_samples_X)

for sen_idx, sen in enumerate(test_samples):
  for word_idx, tag_probabilities in enumerate(predictions[sen_idx]):
    word = sen[word_idx] if word_idx < len(sen) else '-PAD-'
    predicted_tag = prediction_to_tag(tag_probabilities)
    print((word, predicted_tag), end=' ')
  print()  # newline after each sentence
    

--2019-01-25 02:29:19--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 34.206.130.40, 35.173.6.94, 34.204.22.7, ...
Connecting to bin.equinox.io (bin.equinox.io)|34.206.130.40|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2019-01-25 02:29:19 (17.5 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
http://fa2ee2cc.ngrok.io
Running on GPU.
---
READING data/wiki.xml ...
READING data/blogs.xml ...
READING data/schobinger.xml ...
READING data/blick.xml ...
READING data/swatch.xml ...
Number of training sentences: 6815
Number of testing sentences: 512
Number of training tags: 6815
Number of testing tags: 512
tag2index: {'PIS': 1, 'ADJD+': 2, 'CARD+': 3, 'ITJ': 4, 'PDAT+': 5, 'VVPP': 6, 'KOUI+': 7, 'PDAT': 8, 'PAV+': 9, 'VAINF':

### inspect predictions

In [20]:
def get_prediction(input_sentences):
  test_samples_X = []
  for s in input_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
  
  test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
  predictions = model.predict(test_samples_X)
    
  predictions_human_readable = []
  for sentence in predictions:
    sentence_human_readable = []
    for tag_probabilities in sentence:
      predicted_tag = prediction_to_tag(tag_probabilities)
      sentence_human_readable.append(predicted_tag)
    predictions_human_readable.append(sentence_human_readable)
    
  return predictions_human_readable


test_predictions = get_prediction(test_sentences)

import pandas as pd
import itertools

df_index = []
for i in range(len(test_sentences)):
  df_index.append('input')
  df_index.append('truth')
  df_index.append('pred')
  df_index.append('comp')
  df_index.append('')


data = []
error_statistics = {}
error_examples = {}
for idx, s in enumerate(test_sentences):
  test_tags_s = list(test_tags[idx]) + ['-PAD-' for i in range(MAX_LENGTH-len(test_tags[idx]))]
  test_predictions_s = test_predictions[idx]
  comparison = ['✓' if y == p else '𐄂' for y, p in zip(test_tags_s, test_predictions_s)]
  errors = [((y, p), word) for y, p, word in list(itertools.zip_longest(test_tags_s, test_predictions_s, s, fillvalue='-PAD-')) if y != p]
  for e, word in errors:
    if e in error_statistics:
      error_statistics[e]['count'] += 1
      error_statistics[e]['examples'].append(word)
    else:
      error_statistics[e] = {'count': 1, 'examples': [word]}
#    if e in error_examples:
#      error_examples[e].append(word)
#    else:
#      error_examples[e] = []
  data.append(list(s) + ['-PAD-' for i in range(MAX_LENGTH-len(s))])
  data.append(test_tags_s)
  data.append(test_predictions_s)
  data.append(comparison)
  data.append([''] * MAX_LENGTH)

error_stats_sorted = sorted(error_statistics.items(), key=lambda kv: kv[1]['count'], reverse=True)
error_stats_sorted = [[y, p, dict['count'], dict['examples']] for ((y, p), dict) in error_stats_sorted]
#print('error_stats_sorted:', error_stats_sorted)

# calcualte accuracy manually
# (1) taking padding into account
total_errors_with_padding = 0
for y, p, count, examples in error_stats_sorted:
  total_errors_with_padding += count

total_tags_with_padding = len(test_tags)*MAX_LENGTH  # (number of test sentences) * (max length of each sentence)

correct_predictions_with_padding = total_tags_with_padding - total_errors_with_padding
accuracy_manual_with_padding = float(correct_predictions_with_padding) / total_tags_with_padding
print("Manually calculated accuracy (with padding) = ", accuracy_manual_with_padding)

# (2) without padding
total_errors = 0
for y, p, count, examples in error_stats_sorted:
  if y != '-PAD-':
    total_errors += count
total_tags = 0
for sen in test_tags:
  total_tags += len(sen)

correct_predictions = total_tags - total_errors
accuracy_manual = float(correct_predictions) / total_tags
print("Manually calculated accuracy (without padding) = ", accuracy_manual)


df = pd.DataFrame(data, index=df_index)

filename = 'test_predictions.pkl'
df.to_pickle(filename)
print("You may download '{}' to inspect the predictions".format(filename))

df.head(40)


Manually calculated accuracy (with padding) =  0.1074969951923077
Manually calculated accuracy (without padding) =  0.708942283874164
You may download 'test_predictions.pkl' to inspect the predictions


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
input,SWATCH,GROOVE,MOOVE,www.longines.com,Eleganz,",",Tradition,und,Högschtlaischtig,sitt,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
truth,NE,FM,FM,NE,NN,"$,",NN,KON,NN,APPR,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
pred,NE,NE,NN,NN,NN,"$,",NN,KON,KON,CARD,...,$.,$.,$.,$.,$.,$.,$.,$.,$.,$.
comp,✓,𐄂,𐄂,𐄂,✓,✓,✓,✓,𐄂,𐄂,...,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂
,,,,,,,,,,,...,,,,,,,,,,
input,Zum,Jubiläum,stellt,sich,d’,Uhräliniä,The,Longines,Saint-Imier,Collection,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
truth,APPRART,NN,VVFIN,PRF,ART,NN,FM,NE,NE,FM,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
pred,APPRART,NN,VVFIN,PRF,ART,FM,FM,NE,NE,FM,...,$.,$.,$.,$.,$.,$.,$.,$.,$.,$.
comp,✓,✓,✓,✓,✓,𐄂,✓,✓,✓,✓,...,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂
,,,,,,,,,,,...,,,,,,,,,,


### inspect common errors

In [32]:
filename = 'common_errors.pkl'
df.to_pickle(filename)
print("You may download '{}' to inspect the most common errors".format(filename))

print("total predictions:", total_tags)
print("total errors:", total_errors)
df = pd.DataFrame(error_stats_sorted, columns=['truth', 'pred', 'count', 'examples'])
print("Most common errors ('-PAD-' tag errors are ignored in evaluation accuracy):")
df

You may download 'common_errors.pkl' to inspect the most common errors
total predictions: 12111
total errors: 3525
Most common errors ('-PAD-' tag errors are ignored in evaluation accuracy):


Unnamed: 0,truth,pred,count,examples
0,-PAD-,$.,67006,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
1,-PAD-,VVINF,310,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
2,ADJA,NN,294,"[uhrmachorischi, rundä, extraflachä, uhrmachor..."
3,NE,NN,179,"[www.longines.com, Chile, Baker, H.H., Mohamme..."
4,-PAD-,ART,154,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
5,-PAD-,$(,154,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
6,NN,APPR,141,"[Neuiuuflage, Ahfang, Damämodäll, Edelstaahl, ..."
7,-PAD-,ADJA,137,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
8,NN,NE,130,"[Uhrä, Brichtsjohr, Damämodäll, Segment, Rennä..."
9,VVPP,NN,129,"[fortgsetzt, verankoret, iifüegt, inspiriert, ..."


In [31]:
! rm -rfv "log/2019-01-25 00:51:36"
! ls -hl log

removed 'log/2019-01-25 00:51:36/events.out.tfevents.1548373902.8934e8e9fe4a'
removed directory 'log/2019-01-25 00:51:36'
total 8.0K
drwxr-xr-x 2 root root 4.0K Jan 25 02:14 '2019-01-25 03:14:21'
drwxr-xr-x 2 root root 4.0K Jan 25 02:29 '2019-01-25 03:29:27'


### playground

In [0]:
a = [1, 2, 3, 4, 5]
b = [9, 2, 7, 6, 5]
print(['✓' if i == j else '𐄂' for i, j in zip(a, b)])
print([(i, j) for i, j in zip(a, b) if i != j])

['𐄂', '✓', '𐄂', '𐄂', '✓']
[(1, 9), (3, 7), (4, 6)]


In [0]:
import itertools

a = [1, 2, 3, 4, 5]
b = [9, 2, 7]
print(list(itertools.zip_longest(a, b)))

[(1, 9), (2, 2), (3, 7), (4, None), (5, None)]


## BiLSTM without lowercasing

### `main.py`

In [0]:
# remove logs from previous run
!rm -rf log
!mkdir log

In [0]:
# get ngrok
! wget -O ngrok-stable-linux-amd64.zip https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
! unzip -o ngrok-stable-linux-amd64.zip

# setup ngrok
LOG_DIR = './log'
get_ipython().system_raw(
    'tensorboard --logdir {} --host 0.0.0.0 --port 6006 &'
    .format(LOG_DIR)
)

get_ipython().system_raw('./ngrok http 6006 &')

! curl -s http://localhost:4040/api/tunnels | python3 -c \
    "import sys, json; print(json.load(sys.stdin)['tunnels'][0]['public_url'])"



import keras.utils
import numpy as np
from keras import backend as K
from keras.layers import Dense
from keras.layers import LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard

import preprocessing as prepr

# check whether we're running on CPU/TPU or GPU
# on colab: change device in 'Runtime' → 'Change runtime type' → 'Hardware accelerator'
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print("Running on CPU or TPU.")
else:
    print("Running on GPU.")
print("---")


# keep results deterministic (results on colab may differ between runtime resets)
# https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


test_size = 0.1

data = prepr.parse_xml_data()

# Split
n_of_articles = len(data)
n_of_test_articles = int(n_of_articles * test_size)
test_articles = data[(n_of_articles - n_of_test_articles):]
train_articles = data[:(n_of_articles - n_of_test_articles)]


train_sentences = []
test_sentences = []
train_tags = []
test_tags = []


for article in train_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        train_sentences.append(np.array(words))
        train_tags.append(np.array(tags))
        
        
for article in test_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        test_sentences.append(np.array(words))
        test_tags.append(np.array(tags))

# print('Training sentences:')
# print(train_sentences[0])
print('Number of training sentences: %d' % len(train_sentences))
# print('Test sentences:')
# print(test_sentences[0])
print('Number of testing sentences: %d' % len(test_sentences))
# print('Training tags:')
# print(train_tags[0])
print('Number of training tags: %d' % len(train_tags))
# print('Testing tags')
# print(test_tags[0])
print('Number of testing tags: %d' % len(test_tags))

unique_words, unique_tags = set([]), set([])


for s in train_sentences:
    for w in s:
        unique_words.add(w)

for ts in train_tags:
    for t in ts:
        unique_tags.add(t)

for s in test_sentences:
    for w in s:
        unique_words.add(w)

for ts in test_tags:
    for t in ts:
        unique_tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(unique_words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(list(unique_tags))}
tag2index['-PAD-'] = 0  # The special value used to padding
print('tag2index:', tag2index)

index2tag = {v: k for k, v in tag2index.items()}


train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w])
        except KeyError:
            s_int.append(word2index['-OOV-'])

    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

# print(train_sentences_X[0])
# print(test_sentences_X[0])
# print(train_tags_y[0])
# print(test_tags_y[0])

MAX_LENGTH = len(max(train_sentences_X, key=len))
print("Max sequence length:", MAX_LENGTH)  # Should be 156

train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

# print(train_sentences_X[0])
# print(test_sentences_X[0])
# print(train_tags_y[0])
# print(test_tags_y[0])

num_words=len(word2index)
num_tags=len(tag2index)
print("number of words:", num_words)
print("number of tags:", num_tags)

################################################################################
# LSTM #########################################################################
################################################################################
model = Sequential()
#model.add(InputLayer(input_shape=(MAX_LENGTH,)))
model.add(Embedding(num_words,
                    128,
                    mask_zero=True,
                    #input_length=MAX_LENGTH
                   ))
model.add(Bidirectional(LSTM(64,
                             return_sequences=True,
                             recurrent_regularizer=keras.regularizers.l1_l2(0.5, 0.5),
                             dropout=0.5,
                             recurrent_dropout=0.5)))
model.add(TimeDistributed(Dense(num_tags)))
model.add(Activation('softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

model.summary()
################################################################################

categorical_tags_y = keras.utils.to_categorical(train_tags_y, len(tag2index))


from datetime import datetime
from pytz import timezone
tensorboard = TensorBoard(log_dir="log/{}".format(datetime.now(timezone('Europe/Zurich')).strftime("%Y-%m-%d %H:%M:%S")), update_freq='batch')
# update_freq is either 'batch', 'epoch', or integer i to write after every i samples

history = model.fit(train_sentences_X,
                    keras.utils.to_categorical(train_tags_y, len(tag2index)),
                    batch_size=128,
                    epochs=25,
                    validation_split=0.2,
                    callbacks=[tensorboard])
scores = model.evaluate(test_sentences_X, keras.utils.to_categorical(test_tags_y, len(tag2index)))
for i, name in enumerate(model.metrics_names):
    print("%s: %s" % (name, 100 * scores[i]))

test_samples = [
    "Ich bin scho recht gspannt was passiert".split(),
    "Mer hend jetzt es Model trainiert und es isch ziemli guet worde.".split()
]

test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)

  
def prediction_to_tag(tag_prediction_probabilities):
  index_of_max = tag_prediction_probabilities.argmax()
  return index2tag[index_of_max]
  
  
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

predictions = model.predict(test_samples_X)

for sen_idx, sen in enumerate(test_samples):
  for word_idx, tag_probabilities in enumerate(predictions[sen_idx]):
    word = sen[word_idx] if word_idx < len(sen) else '-PAD-'
    predicted_tag = prediction_to_tag(tag_probabilities)
    print((word, predicted_tag), end=' ')
  print()  # newline after each sentence
    

--2019-01-23 14:26:25--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
Resolving bin.equinox.io (bin.equinox.io)... 52.55.191.55, 52.2.175.150, 52.201.75.180, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.55.191.55|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5363700 (5.1M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.zip’


2019-01-23 14:26:27 (3.41 MB/s) - ‘ngrok-stable-linux-amd64.zip’ saved [5363700/5363700]

Archive:  ngrok-stable-linux-amd64.zip
  inflating: ngrok                   
http://ccc44786.ngrok.io
Running on GPU.
---
READING data/blogs.xml ...
READING data/swatch.xml ...
READING data/blick.xml ...
READING data/wiki.xml ...
READING data/schobinger.xml ...
Number of training sentences: 7003
Number of testing sentences: 324
Number of training tags: 7003
Number of testing tags: 324
tag2index: {'KON+': 1, 'PAV+': 2, 'CARD+': 3, 'PWAV': 4, 'PIAT': 5, 'VVFIN+': 6, '$(': 7, 'ITJ': 8, 'PAV': 9, 'VVINF': 1

### inspect predictions

In [0]:
def get_prediction(input_sentences):
  test_samples_X = []
  for s in input_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
  
  test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
  predictions = model.predict(test_samples_X)
    
  predictions_human_readable = []
  for sentence in predictions:
    sentence_human_readable = []
    for tag_probabilities in sentence:
      predicted_tag = prediction_to_tag(tag_probabilities)
      sentence_human_readable.append(predicted_tag)
    predictions_human_readable.append(sentence_human_readable)
    
  return predictions_human_readable


test_predictions = get_prediction(test_sentences)

import pandas as pd
import itertools

df_index = []
for i in range(len(test_sentences)):
  df_index.append('input')
  df_index.append('truth')
  df_index.append('pred')
  df_index.append('comp')
  df_index.append('')


data = []
error_statistics = {}
error_examples = {}
for idx, s in enumerate(test_sentences):
  test_tags_s = list(test_tags[idx]) + ['-PAD-' for i in range(MAX_LENGTH-len(test_tags[idx]))]
  test_predictions_s = test_predictions[idx]
  comparison = ['✓' if y == p else '𐄂' for y, p in zip(test_tags_s, test_predictions_s)]
  errors = [((y, p), word) for y, p, word in list(itertools.zip_longest(test_tags_s, test_predictions_s, s, fillvalue='-PAD-')) if y != p]
  for e, word in errors:
    if e in error_statistics:
      error_statistics[e]['count'] += 1
      error_statistics[e]['examples'].append(word)
    else:
      error_statistics[e] = {'count': 1, 'examples': [word]}
#    if e in error_examples:
#      error_examples[e].append(word)
#    else:
#      error_examples[e] = []
  data.append(list(s) + ['-PAD-' for i in range(MAX_LENGTH-len(s))])
  data.append(test_tags_s)
  data.append(test_predictions_s)
  data.append(comparison)
  data.append([''] * MAX_LENGTH)

error_stats_sorted = sorted(error_statistics.items(), key=lambda kv: kv[1]['count'], reverse=True)
error_stats_sorted = [[y, p, dict['count'], dict['examples']] for ((y, p), dict) in error_stats_sorted]
#print('error_stats_sorted:', error_stats_sorted)

# calcualte accuracy manually
# (1) taking padding into account
total_errors_with_padding = 0
for y, p, count, examples in error_stats_sorted:
  total_errors_with_padding += count

total_tags_with_padding = len(test_tags)*MAX_LENGTH  # (number of test sentences) * (max length of each sentence)

correct_predictions_with_padding = total_tags_with_padding - total_errors_with_padding
accuracy_manual_with_padding = float(correct_predictions_with_padding) / total_tags_with_padding
print("Manually calculated accuracy (with padding) = ", accuracy_manual_with_padding)

# (2) without padding
total_errors = 0
for y, p, count, examples in error_stats_sorted:
  if y != '-PAD-':
    total_errors += count
total_tags = 0
for sen in test_tags:
  total_tags += len(sen)

correct_predictions = total_tags - total_errors
accuracy_manual = float(correct_predictions) / total_tags
print("Manually calculated accuracy (without padding) = ", accuracy_manual)


df = pd.DataFrame(data, index=df_index)

filename = 'test_predictions.pkl'
df.to_pickle(filename)
print("You may download '{}' to inspect the predictions".format(filename))

df.head(40)


Manually calculated accuracy (with padding) =  0.05161839189616967
Manually calculated accuracy (without padding) =  0.7068545109726362
You may download 'test_predictions.pkl' to inspect the predictions


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,146,147,148,149,150,151,152,153,154,155
input,Viktor,Schobinger,Em,Ääschme,sini,vier,Bäize,(,2005,),...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
truth,NE,NE,ART,NE,PPOSAT,CARD,NN,$(,CARD,$(,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
pred,XY,XY,ART,PPOSAT,PPOSAT,CARD,CARD,$(,CARD,$(,...,$.,$.,$.,$.,$.,$.,$.,$.,$.,$.
comp,𐄂,𐄂,✓,𐄂,✓,✓,𐄂,✓,✓,✓,...,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂
,,,,,,,,,,,...,,,,,,,,,,
input,Scho,wil,der,Isering,de,Polizeidiräkter,känt,.,-PAD-,-PAD-,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
truth,ADV,KOUS,ART,NE,ART,NN,VVFIN,$.,-PAD-,-PAD-,...,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-,-PAD-
pred,ADV,KOUS,ART,ADJA,ART,NN,VVINF,$.,$.,$.,...,$.,$.,$.,$.,$.,$.,$.,$.,$.,$.
comp,✓,✓,✓,𐄂,✓,✓,𐄂,✓,𐄂,𐄂,...,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂,𐄂
,,,,,,,,,,,...,,,,,,,,,,


### inspect common errors

In [0]:
filename = 'common_errors.pkl'
df.to_pickle(filename)
print("You may download '{}' to inspect the most common errors".format(filename))

df = pd.DataFrame(error_stats_sorted, columns=['truth', 'pred', 'count', 'examples'])
print("Most common errors ('-PAD-' tag errors are ignored in evaluation accuracy):")
df

You may download 'common_errors.pkl' to inspect the most common errors
Most common errors ('-PAD-' tag errors are ignored in evaluation accuracy):


Unnamed: 0,truth,pred,count,examples
0,-PAD-,$.,40988,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
1,-PAD-,$(,5087,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
2,-PAD-,VVINF,624,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
3,-PAD-,ADJA,154,"[-PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PAD-, -PA..."
4,NE,NN,83,"[Ääschme, Ääschme, Ääschme, Isegrind, Fischer,..."
5,PPER,ART,82,"[s, s, s, em, em, s, s, s, s, s, en, s, en, s,..."
6,VVFIN,NN,55,"[inepasst, chlopfet, redt, telifoniered, lange..."
7,NN,VVINF,37,"[Gämferkoleeg, Ggwicht, Chropf, Scheff, Schrii..."
8,ADJA,NN,32,"[unaaggnèème, äiges, ubekant, hèrte, vermisst,..."
9,ADV,NN,23,"[sèttig, näbetzue, übere, blos, linggs, übere,..."


# $MLP_M$: Multilayer perceptron with manual feature engineering

from https://becominghuman.ai/part-of-speech-tagging-tutorial-with-the-keras-deep-learning-library-d7f93fa05537

*This code crashes the runtime on colab, run it on your local machine*

In [0]:
from keras.optimizers import Adam

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import preprocessing as prepr

# check whether we're running on CPU/TPU or GPU
# on colab: change device in 'Runtime' → 'Change runtime type' → 'Hardware accelerator'
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print("Running on CPU or TPU.")
else:
    print("Running on GPU.")
print("---")

# keep results deterministic (results on colab may differ between runtime resets)
# https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
from numpy.random import seed

seed(1)
from tensorflow import set_random_seed

set_random_seed(2)

test_size = 0.1

data = prepr.parse_xml_data()

# Split
n_of_articles = len(data)
n_of_test_articles = int(n_of_articles * test_size)
test_articles = data[(n_of_articles - n_of_test_articles):]
train_articles = data[:(n_of_articles - n_of_test_articles)]

train_sentences = []
test_sentences = []

for article in train_articles:
    for sentence in article:
        word_class_pairs = []
        for word in article[sentence]['words']:
            word_class_pairs.append((word['word'], word['pos']))
        train_sentences.append(word_class_pairs)

for article in test_articles:
    for sentence in article:
        word_class_pairs = []
        for word in article[sentence]['words']:
            word_class_pairs.append((word['word'], word['pos']))
        test_sentences.append(word_class_pairs)


# print(train_sentences)
# print(test_sentences)


def add_basic_features(sentence_terms, index):
    """ Compute some very basic word features.
        :param sentence_terms: [w1, w2, ...]
        :type sentence_terms: list
        :param index: the index of the word
        :type index: int
        :return: dict containing features
        :rtype: dict
    """
    term = sentence_terms[index]
    return {
        'nb_terms': len(sentence_terms),
        'term': term,
        'is_first': index == 0,
        'is_last': index == len(sentence_terms) - 1,
        'is_capitalized': term[0].upper() == term[0],
        'is_all_caps': term.upper() == term,
        'is_all_lower': term.lower() == term,
        'prefix-1': term[0],
        'prefix-2': term[:2],
        'prefix-3': term[:3],
        'suffix-1': term[-1],
        'suffix-2': term[-2:],
        'suffix-3': term[-3:],
        'prev_word': '' if index == 0 else sentence_terms[index - 1],
        'next_word': '' if index == len(sentence_terms) - 1 else sentence_terms[index + 1]
    }


def untag(tagged_sentence):
    """
    Remove the tag for each tagged term.
    :param tagged_sentence: a POS tagged sentence
    :type tagged_sentence: list
    :return: a list of tags
    :rtype: list of strings
    """
    return [w for w, _ in tagged_sentence]


def transform_to_dataset(tagged_sentences):
    """
    Split tagged sentences to X and y datasets and append some basic features.
    :param tagged_sentences: a list of POS tagged sentences
    :param tagged_sentences: list of list of tuples (term_i, tag_i)
    :return:
    """
    X, y = [], []
    for pos_tags in tagged_sentences:
        for index, (term, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(untag(pos_tags), index))
            y.append(class_)
    return X, y


X_train, y_train = transform_to_dataset(train_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

print(X_train[:10])
print(y_train[:10])
print(X_test[:10])
print(y_test[:10])

from sklearn.feature_extraction import DictVectorizer

# Fit our DictVectorizer with our set of features
dict_vectorizer = DictVectorizer(sparse=False)
dict_vectorizer.fit(X_train + X_test)
# Convert dict features to vectors
X_train = dict_vectorizer.transform(X_train)
print("~~~ DEBUG ~~~")
X_test = dict_vectorizer.transform(X_test)

from sklearn.preprocessing import LabelEncoder

# Fit LabelEncoder with our list of classes
label_encoder = LabelEncoder()
label_encoder.fit(y_train + y_test)
# Encode class values as integers
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Convert integers to dummy variables (one hot encoded)
from keras.utils import np_utils

y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)

from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),  # input layer
    Dropout(0.5),
    Dense(128, activation='relu'),  # hidden layer 1
    Dropout(0.5),
    Dense(y_train.shape[1], activation='softmax')  # output layer
])
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

history = model.fit(X_train, y_train, batch_size=128,
                    epochs=5, validation_split=0.2)
scores = model.evaluate(X_test, y_test)
for i, name in enumerate(model.metrics_names):
    print("%s: %s" % (name, 100 * scores[i]))

Running on GPU.
---
READING data/wiki.xml ...
READING data/blogs.xml ...
READING data/schobinger.xml ...
READING data/blick.xml ...
READING data/swatch.xml ...
[{'nb_terms': 30, 'term': 'Mit', 'is_first': True, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'M', 'prefix-2': 'Mi', 'prefix-3': 'Mit', 'suffix-1': 't', 'suffix-2': 'it', 'suffix-3': 'Mit', 'prev_word': '', 'next_word': 'de'}, {'nb_terms': 30, 'term': 'de', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'd', 'prefix-2': 'de', 'prefix-3': 'de', 'suffix-1': 'e', 'suffix-2': 'de', 'suffix-3': 'de', 'prev_word': 'Mit', 'next_word': 'Eroberig'}, {'nb_terms': 30, 'term': 'Eroberig', 'is_first': False, 'is_last': False, 'is_capitalized': True, 'is_all_caps': False, 'is_all_lower': False, 'prefix-1': 'E', 'prefix-2': 'Er', 'prefix-3': 'Ero', 'suffix-1': 'g', 'suffix-2': 'ig', 'suffix-3': 'rig', 'prev_word': 'de'

# PyTorch implementation

from https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#example-an-lstm-for-part-of-speech-tagging

### Install PyTorch

In [3]:
# http://pytorch.org/
from os.path import exists
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

tcmalloc: large alloc 1073750016 bytes == 0x57bc6000 @  0x7fe4f7c1e2a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641


### read in data

In [4]:
import preprocessing as prepr
import numpy as np

# check whether we're running on CPU/TPU or GPU
# on colab: change device in 'Runtime' → 'Change runtime type' → 'Hardware accelerator'
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    print("Running on CPU or TPU.")
else:
    print("Running on GPU.")
print("---")


# keep results deterministic (results on colab may differ between runtime resets)
# https://machinelearningmastery.com/reproducible-results-neural-networks-keras/
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


test_size = 0.1

data = prepr.parse_xml_data()

# Split
n_of_articles = len(data)
n_of_test_articles = int(n_of_articles * test_size)
test_articles = data[(n_of_articles - n_of_test_articles):]
train_articles = data[:(n_of_articles - n_of_test_articles)]

training_data = []
test_data = []

for article in train_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        if len(words) > 0:
          training_data.append((words, tags))
        
        
for article in test_articles:
    for sentence in article:
        words = []
        tags = []
        for word in article[sentence]['words']:
            words.append(word['word'])
            tags.append(word['pos'])
        if len(words) > 0:
          test_data.append((words, tags))

Running on GPU.
---
READING data/wiki.xml ...
READING data/blogs.xml ...
READING data/schobinger.xml ...
READING data/blick.xml ...
READING data/swatch.xml ...


In [67]:
print(training_data)
print(len(training_data))

for i, sample in enumerate(training_data):
  sen, tags = sample
  if len(tags) < 1:
    print('tag zero', i, sen, tags)
  if len(sen) < 1:
    print('sen zero', i, sen, tags)

[(['Mit', 'de', 'Eroberig', 'vom', 'Aargau', 'durch', 'di', 'alti', 'Eidgnosseschaft', 'im', '1415i', 'isch', 'Bade', 'de', 'Sitz', 'vom', 'Landvogt', 'vo', 'de', 'Grafschaft', 'Bade', 'worde', 'und', 'au', 'vili', 'Tagsatzige', 'hei', 'hiir', 'schtattgfunde', '.'], ['APPR', 'ART', 'NN', 'APPRART', 'NE', 'APPR', 'ART', 'ADJA', 'NN', 'APPRART', 'CARD', 'VAFIN', 'NE', 'ART', 'NN', 'APPRART', 'NN', 'APPR', 'ART', 'NN', 'NE', 'VAPP', 'KON', 'ADV', 'PIAT', 'NN', 'VAFIN', 'ADV', 'VVPP', '$.']), (['d', 'Gmeindsversammlig', 'vo', 'Noiehof', 'het', ',', 'wi', 'au', 'de', 'Iiwohnerrot', 'vo', 'Bade', 'am', '30.', 'März', '2010', 'de', 'plante', 'Fusion', 'mit', 'de', 'Schtadt', 'Bade', 'zugschtimmt', ',', 'wo', 'uf', 'de', '1.', 'Jänner', '2012', 'het', 'soll', 'realisiirt', 'werde', '.'], ['ART', 'NN', 'APPR', 'NE', 'VAFIN', '$,', 'KOKOM', 'ADV', 'ART', 'NN', 'APPR', 'NE', 'APPRART', 'ADJA', 'NN', 'CARD', 'ART', 'ADJA', 'NN', 'APPR', 'ART', 'NN', 'NE', 'VVPP', '$,', 'PRELS', 'APPR', 'ART', 'ADJ

### Run model

In [49]:
# -*- coding: utf-8 -*-
r"""
Sequence Models and Long-Short Term Memory Networks
===================================================

At this point, we have seen various feed-forward networks. That is,
there is no state maintained by the network at all. This might not be
the behavior we want. Sequence models are central to NLP: they are
models where there is some sort of dependence through time between your
inputs. The classical example of a sequence model is the Hidden Markov
Model for part-of-speech tagging. Another example is the conditional
random field.

A recurrent neural network is a network that maintains some kind of
state. For example, its output could be used as part of the next input,
so that information can propogate along as the network passes over the
sequence. In the case of an LSTM, for each element in the sequence,
there is a corresponding *hidden state* :math:`h_t`, which in principle
can contain information from arbitrary points earlier in the sequence.
We can use the hidden state to predict words in a language model,
part-of-speech tags, and a myriad of other things.


LSTM's in Pytorch
~~~~~~~~~~~~~~~~~

Before getting to the example, note a few things. Pytorch's LSTM expects
all of its inputs to be 3D tensors. The semantics of the axes of these
tensors is important. The first axis is the sequence itself, the second
indexes instances in the mini-batch, and the third indexes elements of
the input. We haven't discussed mini-batching, so lets just ignore that
and assume we will always have just 1 dimension on the second axis. If
we want to run the sequence model over the sentence "The cow jumped",
our input should look like

.. math::


   \begin{bmatrix}
   \overbrace{q_\text{The}}^\text{row vector} \\
   q_\text{cow} \\
   q_\text{jumped}
   \end{bmatrix}

Except remember there is an additional 2nd dimension with size 1.

In addition, you could go through the sequence one at a time, in which
case the 1st axis will have size 1 also.

Let's see a quick example.
"""

# Author: Robert Guthrie

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

######################################################################

#lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
#inputs = [torch.randn(1, 3) for _ in range(5)]  # make a sequence of length 5

# initialize the hidden state.
#hidden = (torch.randn(1, 1, 3),
#          torch.randn(1, 1, 3))
#for i in inputs:
#    # Step through the sequence one element at a time.
#    # after each step, hidden contains the hidden state.
#    out, hidden = lstm(i.view(1, 1, -1), hidden)

# alternatively, we can do the entire sequence all at once.
# the first value returned by LSTM is all of the hidden states throughout
# the sequence. the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same)
# The reason for this is that:
# "out" will give you access to all hidden states in the sequence
# "hidden" will allow you to continue the sequence and backpropagate,
# by passing it as an argument  to the lstm at a later time
# Add the extra 2nd dimension

#inputs = torch.cat(inputs).view(len(inputs), 1, -1)
#hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
#out, hidden = lstm(inputs, hidden)
#print(out)
#print(hidden)


######################################################################
# Example: An LSTM for Part-of-Speech Tagging
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In this section, we will use an LSTM to get part of speech tags. We will
# not use Viterbi or Forward-Backward or anything like that, but as a
# (challenging) exercise to the reader, think about how Viterbi could be
# used after you have seen what is going on.
#
# The model is as follows: let our input sentence be
# :math:`w_1, \dots, w_M`, where :math:`w_i \in V`, our vocab. Also, let
# :math:`T` be our tag set, and :math:`y_i` the tag of word :math:`w_i`.
# Denote our prediction of the tag of word :math:`w_i` by
# :math:`\hat{y}_i`.
#
# This is a structure prediction, model, where our output is a sequence
# :math:`\hat{y}_1, \dots, \hat{y}_M`, where :math:`\hat{y}_i \in T`.
#
# To do the prediction, pass an LSTM over the sentence. Denote the hidden
# state at timestep :math:`i` as :math:`h_i`. Also, assign each tag a
# unique index (like how we had word\_to\_ix in the word embeddings
# section). Then our prediction rule for :math:`\hat{y}_i` is
#
# .. math::  \hat{y}_i = \text{argmax}_j \  (\log \text{Softmax}(Ah_i + b))_j
#
# That is, take the log softmax of the affine map of the hidden state,
# and the predicted tag is the tag that has the maximum value in this
# vector. Note this implies immediately that the dimensionality of the
# target space of :math:`A` is :math:`|T|`.
#
#
# Prepare data:

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# DEBUG: use sample training data:
#training_data = [
#    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
#    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
#]

  
unique_words, unique_tags = set([]), set([])

for sample in training_data:
    sent, tags = sample
    for word in sent:
        unique_words.add(word)
    for tag in tags:
        unique_tags.add(tag)

for sample in test_data:
    sent, tags = sample
    for word in sent:
        unique_words.add(word)
    for tag in tags:
        unique_tags.add(tag)

word_to_ix = {w: i + 1 for i, w in enumerate(list(unique_words))}
word_to_ix['-OOV-'] = 0  # The special value used for OOVs
print('word_to_ix:', word_to_ix)
print('unique words:', len(word_to_ix))

tag_to_ix = {t: i for i, t in enumerate(list(unique_tags))}
print('tag_to_ix:', tag_to_ix)
print('unique tags:', len(tag_to_ix))

ix_to_tag = {v: k for k, v in tag_to_ix.items()}
print('ix_to_tag:', ix_to_tag)
print('unique tags:', len(ix_to_tag))


# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 64
HIDDEN_DIM = 64
EPOCHS = 25

######################################################################
# Create the model:


class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, self.hidden = self.lstm(
            embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

######################################################################
# Train the model:


model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
#with torch.no_grad():
#    inputs = prepare_sequence(training_data[0][0], word_to_ix)
#    tag_scores = model(inputs)
#    print(tag_scores)

for epoch in range(EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data
    print('epoch {}/{} ...'.format(epoch+1, EPOCHS))
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
#with torch.no_grad():
#    inputs = prepare_sequence(training_data[0][0], word_to_ix)
#    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
#    print(tag_scores)


######################################################################
# Exercise: Augmenting the LSTM part-of-speech tagger with character-level features
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# In the example above, each word had an embedding, which served as the
# inputs to our sequence model. Let's augment the word embeddings with a
# representation derived from the characters of the word. We expect that
# this should help significantly, since character-level information like
# affixes have a large bearing on part-of-speech. For example, words with
# the affix *-ly* are almost always tagged as adverbs in English.
#
# To do this, let :math:`c_w` be the character-level representation of
# word :math:`w`. Let :math:`x_w` be the word embedding as before. Then
# the input to our sequence model is the concatenation of :math:`x_w` and
# :math:`c_w`. So if :math:`x_w` has dimension 5, and :math:`c_w`
# dimension 3, then our LSTM should accept an input of dimension 8.
#
# To get the character level representation, do an LSTM over the
# characters of a word, and let :math:`c_w` be the final hidden state of
# this LSTM. Hints:
#
# * There are going to be two LSTM's in your new model.
#   The original one that outputs POS tag scores, and the new one that
#   outputs a character-level representation of each word.
# * To do a sequence model over characters, you will have to embed characters.
#   The character embeddings will be the input to the character LSTM.
#


word_to_ix: {'Bewegige': 1, 'erschiesse': 2, 'Finger': 3, 'Lektion': 4, 'ermordete': 5, 'Konzert': 6, 'oggi': 7, 'verschtritte': 8, 'schimmeret': 9, 'zementiera': 10, 'Ärnöyerygsbèwègyge': 11, 'umz’stellä': 12, 'sunniger': 13, 'shoppe': 14, 'wachsändä': 15, 'verläid': 16, 'Medie': 17, 'sentimentale': 18, 'liebi': 19, 'Festival': 20, 'Hèrmes': 21, 'Plaijaade': 22, 'Aspekt': 23, 'drockt': 24, 'kei': 25, 'attraktiveri': 26, 'Rodriguez': 27, 'würded': 28, 'sämtlicha': 29, 'zerstöre': 30, 'gschlage': 31, 'musica': 32, 'Bancomat': 33, '4.10': 34, 'Mario': 35, 'abeggläit': 36, 'zwöimou': 37, 'pefäkt': 38, 'Arakawa': 39, 'Tennis-Champion': 40, 'befääiget': 41, 'spannende': 42, 'Jetzt': 43, 'Keramikteil': 44, 'aafangend': 45, 'Barbossa': 46, 'Gschmack': 47, 'zgseh': 48, 'Erfindige': 49, 'doozmaal': 50, 'redend': 51, 'naagieng': 52, 'transparänti': 53, 'dassme': 54, 'Kunsthandwärch': 55, 'M3': 56, 'show': 57, 'Phasa': 58, 'truckfèrtegi': 59, 'lange': 60, 'Italiänisch': 61, 'einzelne': 62, 'schön

save the trained model

In [50]:
torch.save(model.state_dict(), 'pytorch-model')
print('saved model as pytorch-model')

saved model as pytorch-model


load a trained model

In [0]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
model.load_state_dict(torch.load('pytorch-model'))
model.eval()

### evaluate model

In [0]:
# TODO: this is slow!!!
def probabilities_to_tag(tag_prediction_probabilities):

#  with tf.Session() as sess:
#    index_of_max = tf.argmax(tag_prediction_probabilities).eval()
#    print('index_of_max', index_of_max)
  
  index_of_max = tag_prediction_probabilities.argmax()
  with tf.Session() as sess:
    index_of_max = tf.to_int32(index_of_max).eval()
#    print('index_of_max', index_of_max)
  return ix_to_tag[index_of_max]


In [52]:
# returns predictions (tags) in human readable form and prints accuracy to console
# TODO: return predictions in the same pandas df format as with the RNN approach -> that way we can compare the two approaches
def get_predictions_human_and_evaluate(input_sentences):
  total_words = 0
  total_correct = 0
  output = []
  i = 0
  total_sentences = len(input_sentences)
  for words, tags in input_sentences:
    total_words += len(words)
    with torch.no_grad():
      inputs = prepare_sequence(words, word_to_ix)
      sen_tag_scores = model(inputs)

    pred_human = []
    for i, score in enumerate(sen_tag_scores):
      predicted_tag = probabilities_to_tag(score)
      if predicted_tag == tags[i]:
        total_correct += 1
      # TODO: df format
      pred_human.append(predicted_tag)
    i += 1
    print("accuracy after {}/{} sentences: {}".format(i, total_sentences, float(total_correct)/total_words))
    output.append(pred_human)
  print("accuracy: ", float(total_correct)/total_words)
  return output


import time
start = time.time()

preds = get_predictions_human_and_evaluate(test_data)

end = time.time()
print('time in seconds:', end - start)

accuracy after 30/511 sentences: 0.8
accuracy after 27/511 sentences: 0.7192982456140351
accuracy after 38/511 sentences: 0.7368421052631579
accuracy after 15/511 sentences: 0.7090909090909091
accuracy after 39/511 sentences: 0.697986577181208
accuracy after 15/511 sentences: 0.6951219512195121
accuracy after 32/511 sentences: 0.6938775510204082
accuracy after 27/511 sentences: 0.6771300448430493
accuracy after 31/511 sentences: 0.6929133858267716
accuracy after 17/511 sentences: 0.6900369003690037
accuracy after 15/511 sentences: 0.6888111888111889
accuracy after 32/511 sentences: 0.7012578616352201
accuracy after 39/511 sentences: 0.6862745098039216
accuracy after 7/511 sentences: 0.6868131868131868
accuracy after 30/511 sentences: 0.6776649746192893
accuracy after 8/511 sentences: 0.6791044776119403
accuracy after 16/511 sentences: 0.6794258373205742
accuracy after 19/511 sentences: 0.6796338672768879
accuracy after 3/511 sentences: 0.6795454545454546
accuracy after 22/511 sentences

KeyboardInterrupt: ignored

In [32]:
print(preds)

[['NN', 'NN', 'NN', 'NN', 'NN', '$,', 'KON', 'KON', 'NN', 'VAFIN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'ART', 'NN', 'NN', 'NN', 'APPR', 'ART', 'NN', 'NN', 'KON', 'APPRART', 'NN', 'NN', 'NN', '$.']]


### alternative evaluation algorithm

In [29]:
def get_predictions_tensors(input_sentences):
  output = []
  for words, tags in input_sentences:
    with torch.no_grad():
      inputs = prepare_sequence(words, word_to_ix)
      sen_tag_scores = model(inputs)
    output.append(sen_tag_scores)
  return output



def tensors_to_human_readable(predictions_as_tensors):
  output = []
  for sentence in predictions_as_tensors:
    sen_human = []
    for score in sentence:
        predicted_tag = probabilities_to_tag(score)
        sen_human.append(predicted_tag)
    output.append(sen_human)
  return output


print(len(test_data))
print('starting get_predictions_tensors')
preds_t = get_predictions_tensors(test_data[:1])
print('done')
print('starting tensors_to_human_readable')
preds = tensors_to_human_readable(preds_t)
print('done')
print(preds)

511
starting get_predictions_tensors
done
starting tensors_to_human_readable
done
[['NN', 'NN', 'NN', 'NN', 'NN', '$,', 'KON', 'KON', 'NN', 'VAFIN', 'NN', 'NN', 'NN', 'NN', 'NN', 'NN', 'ART', 'NN', 'NN', 'NN', 'APPR', 'ART', 'NN', 'NN', 'KON', 'APPRART', 'NN', 'NN', 'NN', '$.']]
