In [0]:
! git clone https://github.com/UniversalDependencies/UD_English-PUD.git
!ls

In [0]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
import numpy as np
from keras import backend as K

def _read_conllu(path):
    """Read a CoNLL-U file into parallel per-sentence lists of words and tags.

    The word is taken from the second tab-separated column and the tag from
    the fourth (FORM and UPOS in CoNLL-U).

    Args:
        path: Path of the ``.conllu`` file to read.

    Returns:
        A ``(sentences, sentences_tags)`` tuple. ``sentences[k]`` is the list
        of word forms of the k-th sentence and ``sentences_tags[k]`` the list
        of its tags, with the same length.
    """
    all_words, all_tags = [], []
    current_words, current_tags = [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal current_words, current_tags
        if current_words:
            all_words.append(current_words)
            all_tags.append(current_tags)
            current_words, current_tags = [], []

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                # A blank line terminates a sentence.
                flush()
                continue
            if line.startswith('#'):
                # Sentence-level comment / metadata line.
                continue
            columns = line.split('\t')
            if len(columns) <= 3:
                continue
            token_id = columns[0]
            if '-' in token_id or '.' in token_id:
                # Multiword-token ranges ("1-2") and empty nodes ("1.1") are
                # not taggable words; their tag column is usually "_".
                continue
            if token_id == '1':
                # Also treat a restart of the token numbering as a sentence
                # boundary, in case the blank separator line is missing.
                flush()
            current_words.append(columns[1])
            current_tags.append(columns[3])

    # The last sentence has no following sentence to trigger its storage.
    flush()
    return all_words, all_tags


#extract words and their tags from train file
train_sentences, train_tags = _read_conllu("UD_English-ParTUT/en_partut-ud-train.conllu")

#extract words and their tags from test file
sentences, sentences_tags = _read_conllu("UD_English-ParTUT/en_partut-ud-test.conllu")

test_sentences, test_tags = sentences, sentences_tags

# Vocabulary (lower-cased) and tag inventory come from the training data only.
words = {w.lower() for s in train_sentences for w in s}
tags = {t for ts in train_tags for t in ts}

# Sets have no stable iteration order across interpreter runs, so sort before
# numbering: the same data then always yields the same indices.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

# Integer-encode words (lower-cased; unknown words map to the -OOV- index).
oov_index = word2index['-OOV-']
train_sentences_X = [[word2index.get(w.lower(), oov_index) for w in s]
                     for s in train_sentences]
test_sentences_X = [[word2index.get(w.lower(), oov_index) for w in s]
                    for s in test_sentences]

# Integer-encode tags. Every training tag is in tag2index by construction.
train_tags_y = [[tag2index[t] for t in s] for s in train_tags]

# A test sentence may contain a tag never seen in training. Dropping such a
# sentence would leave test_tags_y shorter than test_sentences_X, so keep the
# sentence, report it, and encode the unseen tag with the padding index.
pad_tag_index = tag2index['-PAD-']
test_tags_y = []
for sentence_idx, s in enumerate(test_tags):
    unseen_tags = sorted({t for t in s if t not in tag2index})
    if unseen_tags:
        print(f"test sentence {sentence_idx}: tags unseen in training "
              f"{unseen_tags} encoded as -PAD-")
    test_tags_y.append([tag2index.get(t, pad_tag_index) for t in s])

# Use the longest sentence of either set: pad_sequences truncates longer
# sequences from the front by default, which would cut test tokens and shift
# predictions relative to the unpadded gold tags.
MAX_LENGTH = max(len(s) for s in train_sentences_X + test_sentences_X)
print(MAX_LENGTH)

train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

def to_categorical(sequences, categories):
    """One-hot encode every label of every sequence.

    Args:
        sequences: Iterable of sequences of integer class labels.
        categories: Length of each one-hot vector (number of classes).

    Returns:
        ``np.array`` built from the nested one-hot vectors; for equally long
        sequences its shape is ``(len(sequences), sequence_length, categories)``.
    """
    def one_hot(label):
        # Zero vector with a single 1.0 at the label's position.
        vector = np.zeros(categories)
        vector[label] = 1.0
        return vector

    return np.array([[one_hot(label) for label in sequence]
                     for sequence in sequences])

# One-hot encode the padded training tags, one vector of length len(tag2index) per position.
# NOTE(review): within this cell the result is not referenced again; the later
# model.fit call recomputes the same encoding inline.
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

def logits_to_tokens(sequences, index):
    """Convert per-position score vectors into tokens.

    Args:
        sequences: Iterable of sequences; each element of a sequence is a
            vector of class scores.
        index: Mapping from class index to token.

    Returns:
        A list of lists: for every position, the token that ``index`` maps
        the arg-max class of its score vector to.
    """
    return [
        [index[np.argmax(scores)] for scores in sequence]
        for sequence in sequences
    ]

def ignore_class_accuracy(to_ignore=0):
    """Build an accuracy metric that skips positions of one true class.

    Args:
        to_ignore: Class index whose positions are excluded from the score
            (0 is the padding tag in this file).

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` usable as a Keras
        metric. Both arguments hold one score/one-hot vector per position
        along the last axis.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # The mask must come from the TRUE labels. Masking on the predicted
        # class would drop every real token the model wrongly labels as the
        # ignored class, removing those errors from the score.
        keep_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * keep_mask
        # maximum(..., 1) avoids dividing by zero when everything is ignored.
        accuracy = K.sum(matches) / K.maximum(K.sum(keep_mask), 1)
        return accuracy
    return ignore_accuracy

# Tagger: embedding -> bidirectional LSTM -> per-timestep dense layer -> softmax.
model = Sequential([
    InputLayer(input_shape=(MAX_LENGTH, )),
    Embedding(len(word2index), 128),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(len(tag2index))),
    Activation('softmax'),
])

model.compile(
    optimizer=Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy', ignore_class_accuracy(0)],
)

model.summary()

# Train on one-hot encoded tags; the last 20% of the training data is held out for validation.
model.fit(
    x=train_sentences_X,
    y=to_categorical(train_tags_y, len(tag2index)),
    batch_size=128,
    epochs=50,
    validation_split=0.2,
)

# scores[0] is the loss, so index 1 is the first entry of the metrics list.
scores = model.evaluate(x=test_sentences_X, y=to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

# Show gold tags next to predicted tags for a few test sentences.
print(test_sentences[10:20])
print("test tags", test_tags[10:20])
print("prediction")
predictions = model.predict([test_sentences_X[10:20]])
print(logits_to_tokens(predictions, {idx: tag for tag, idx in tag2index.items()}))


def evaluate(sent_set, tags_set):
    """Print token-level and sentence-level tagging accuracy of ``model``.

    Uses the module-level ``model``, ``tag2index`` and ``logits_to_tokens``.

    Args:
        sent_set: Integer-encoded sentences, padded at the end, one per row;
            each row is passed to ``model.predict`` as a batch of one.
        tags_set: Gold tag strings per sentence (unpadded), parallel to
            ``sent_set``.

    Returns:
        None. The two accuracies are printed.
    """
    index2tag = {idx: tag for tag, idx in tag2index.items()}
    individual_score = 0
    sentence_score = 0
    total_word_count = 0

    for position, encoded_sentence in enumerate(sent_set):
        if (position + 1) % 100 == 0:
            print(str(position + 1) + " senteces completed.")
        tag_sequence = tags_set[position]
        result = model.predict(np.asarray([encoded_sentence]))

        # Padding sits at the end, so the first len(tag_sequence) predictions
        # line up one-to-one with the gold tags. Removing '-PAD-' predictions
        # by value would also drop PADs predicted for real tokens and shift
        # every later prediction.
        predicted = logits_to_tokens(result, index2tag)[0][:len(tag_sequence)]

        correct = sum(1 for p, gold in zip(predicted, tag_sequence) if p == gold)
        individual_score += correct
        # Count every gold token, including any without an aligned prediction.
        total_word_count += len(tag_sequence)
        if predicted == tag_sequence:
            sentence_score += 1

    token_accuracy = individual_score / total_word_count if total_word_count else 0.0
    sentence_accuracy = sentence_score / len(sent_set) if len(sent_set) else 0.0
    print("Accuracy (tokenwise): ", token_accuracy)
    print("Accuracy (sentencewise): ", sentence_accuracy)

evaluate(test_sentences_X, test_tags)


In [0]:
# Embedding with a pre-trained Word2vec model

import multiprocessing
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras import backend as K

def _read_conllu(path):
    """Read a CoNLL-U file into parallel per-sentence lists of words and tags.

    The word is taken from the second tab-separated column and the tag from
    the fourth (FORM and UPOS in CoNLL-U).

    Args:
        path: Path of the ``.conllu`` file to read.

    Returns:
        A ``(sentences, sentences_tags)`` tuple of equally long lists; entry
        k of each holds the word forms / tags of the k-th sentence.
    """
    all_words, all_tags = [], []
    current_words, current_tags = [], []

    def flush():
        # Store the sentence collected so far (if any) and start a new one.
        nonlocal current_words, current_tags
        if current_words:
            all_words.append(current_words)
            all_tags.append(current_tags)
            current_words, current_tags = [], []

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.rstrip("\n")
            if not line.strip():
                # A blank line terminates a sentence.
                flush()
                continue
            if line.startswith('#'):
                continue
            columns = line.split('\t')
            if len(columns) <= 3:
                continue
            token_id = columns[0]
            if '-' in token_id or '.' in token_id:
                # Multiword-token ranges ("1-2") and empty nodes ("1.1") are
                # not taggable words; their tag column is usually "_".
                continue
            if token_id == '1':
                # A restart of the token numbering also starts a new sentence.
                flush()
            current_words.append(columns[1])
            current_tags.append(columns[3])

    # The last sentence has no following sentence to trigger its storage.
    flush()
    return all_words, all_tags


#extract words and their tags from file
sentences, sentences_tags = _read_conllu("UD_English-ParTUT/en_partut-ud-train.conllu")

# loaded from https://code.google.com/archive/p/word2vec/
news_w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Collect the tag inventory and number it in sorted order: set iteration
# order is not stable across interpreter runs, sorted order is.
tags = {t for ts in sentences_tags for t in ts}

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding

# Integer-encode the tags of every sentence.
tags_y = [[tag2index[t] for t in s] for s in sentences_tags]
        
# Fit a lower-casing tokenizer on the already-tokenised sentences.
tokenizer = Tokenizer(oov_token='-OOV-', lower=True)
tokenizer.fit_on_texts(sentences)
# One more than the number of known words: index 0 is not in word_index at this point.
vocab_size = 1 + len(tokenizer.word_index)
sentences_X = tokenizer.texts_to_sequences(sentences)

# Pad words and tags at the end, up to the length of the longest sentence.
MAX_LENGTH = max(map(len, sentences_X))
sentences_X = pad_sequences(sentences_X, padding='post', maxlen=MAX_LENGTH)
tags_y = pad_sequences(tags_y, padding='post', maxlen=MAX_LENGTH)

# Registered only after vocab_size was computed, so vocab_size is unaffected.
tokenizer.word_index['-PAD-'] = 0

# create a weight matrix for words in training docs
# Row i holds the pre-trained vector of the word with tokenizer index i; rows
# of words missing from the word2vec vocabulary stay all-zero.
# news_w2v is already a KeyedVectors object, so it is queried directly: its
# `.wv` alias was deprecated in gensim 3.x and removed in 4.x, and inside a
# bare `except` that failure would silently leave every row zero.
embedding_matrix = np.zeros((vocab_size, news_w2v.vector_size))
for word, i in tokenizer.word_index.items():
    try:
        embedding_matrix[i] = news_w2v.get_vector(word)
    except KeyError:
        # Word has no pre-trained vector; keep the zero row.
        continue

(train_sentences_X, test_sentences_X, train_tags_y, test_tags_y) = train_test_split(sentences_X, tags_y, test_size=0.2)

def to_categorical(sequences, categories):
    """One-hot encode every label of every sequence.

    Args:
        sequences: Iterable of sequences of integer class labels.
        categories: Length of each one-hot vector (number of classes).

    Returns:
        ``np.array`` built from the nested one-hot vectors; for equally long
        sequences its shape is ``(len(sequences), sequence_length, categories)``.
    """
    def one_hot(label):
        # Zero vector with a single 1.0 at the label's position.
        vector = np.zeros(categories)
        vector[label] = 1.0
        return vector

    return np.array([[one_hot(label) for label in sequence]
                     for sequence in sequences])

# One-hot encode the padded training tags, one vector of length len(tag2index) per position.
# NOTE(review): within this cell the result is not referenced again; the later
# model.fit call recomputes the same encoding inline.
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

def ignore_class_accuracy(to_ignore=0):
    """Build an accuracy metric that skips positions of one true class.

    Args:
        to_ignore: Class index whose positions are excluded from the score
            (0 is the padding tag in this file).

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` usable as a Keras
        metric. Both arguments hold one score/one-hot vector per position
        along the last axis.
    """
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # The mask must come from the TRUE labels. Masking on the predicted
        # class would drop every real token the model wrongly labels as the
        # ignored class, removing those errors from the score.
        keep_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * keep_mask
        # maximum(..., 1) avoids dividing by zero when everything is ignored.
        accuracy = K.sum(matches) / K.maximum(K.sum(keep_mask), 1)
        return accuracy
    return ignore_accuracy

from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam
 

# Tagger with a frozen, pre-trained embedding layer followed by a
# bidirectional LSTM and a per-timestep softmax over the tags.
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
# Take the embedding width from the weight matrix itself so the layer cannot
# disagree with the loaded vectors.
model.add(Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix], input_length=MAX_LENGTH, trainable=False))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])

model.summary()

# Train on one-hot encoded tags; the last 20% of the training split is held out for validation.
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=50, validation_split=0.2)

# scores[0] is the loss, so index 1 is the first entry of the metrics list.
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")
