In [3]:
import os 
import sys 
import random 

import numpy as np 
import pandas as pd 
import pickle
import time

import nltk 
nltk.download('brown')
nltk.download('punkt')
nltk.download('universal_tagset')
brownwords = nltk.corpus.brown.tagged_words(categories='news', tagset='universal')

import tensorflow as tf

from sklearn.model_selection import train_test_split
random.seed(42)  



[nltk_data] Downloading package brown to /student/mrahbar/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /student/mrahbar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /student/mrahbar/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [4]:
with open('train.tsv', 'r') as f: 
    train_data = f.read()
    
with open('test.tsv', 'r') as f: 
    test_data = f.read()

In [5]:
splited_lines_train = train_data.strip('\n').split('\n')
splited_lines_test = test_data.strip('\n').split('\n')

In [6]:
for i in range(len(splited_lines_train)): 
    splited_lines_train[i] = splited_lines_train[i].strip(' ').split('\t')
    
for i in range(len(splited_lines_test)): 
    splited_lines_test[i] = splited_lines_test[i].strip(' ').split('\t')

In [7]:
splited_lines_train[i]

['.', 'N']

In [8]:
def get_sentence_list(splited_lines):
    temp_tokens= [] 
    sentence_list = []
    global_list_of_words = [] 
    global_list_of_chars = []
    global_list_of_tags = []
    for w,t in splited_lines: 
        global_list_of_words.append(w)
        global_list_of_chars += [c for c in w]
        global_list_of_tags.append(t)
        if w == '<S>':
            sentence_list.append(temp_tokens)
            temp_tokens = []
            temp_tokens.append(tuple((w,t)))
        else:
            temp_tokens.append(tuple((w,t)))
    sentence_list.append(temp_tokens)
    return sentence_list, global_list_of_words, global_list_of_chars, global_list_of_tags


In [9]:
sentence_list_train, global_list_of_words, global_list_of_chars, global_list_of_tags = get_sentence_list(splited_lines_train)
sentence_list_test, _, _, _= get_sentence_list(splited_lines_test)

In [10]:
print("The length of words list is: ", len(global_list_of_words))
print("The length of char list is: ", len(global_list_of_chars))
print("The length of tag list is: ", len(global_list_of_tags))

The length of words list is:  10000000
The length of char list is:  41518749
The length of tag list is:  10000000


In [11]:
global_set_of_words = set(global_list_of_words)
global_set_of_chars = set(global_list_of_chars)
global_set_of_tags = set(global_list_of_tags)

In [12]:
print("The length of words set is: ", len(global_set_of_words))
print("The length of char set is: ", len(global_set_of_chars))
print("The length of tag set is: ", len(global_set_of_tags))

The length of words set is:  171515
The length of char set is:  237
The length of tag set is:  5


In [13]:
def create_dict(input_set):
    dictionary = {}
    dictionary['[PAD]'] = 0
    dictionary['[UKN]'] = 1
    counter = 2 
    for i in input_set: 
        dictionary[i] = counter 
        counter += 1 
    return dictionary

def encode_labels(input_set):
    dictionary = {}
    dictionary['[PAD]'] = 0
    counter = 1 
    for i in input_set: 
        dictionary[i] = counter 
        counter += 1 
    return dictionary
        

In [14]:
word_dict = create_dict(global_set_of_words)
char_dict = create_dict(global_set_of_chars)
tag_dict = encode_labels(global_set_of_tags)


In [15]:
# X = [] 
# y = [] 
# for i in range(len(sentence_list)):
#     temp_X = []
#     temp_y = [] 
#     for j in range(len(sentence_list[i])):
#         temp_X.append(sentence_list[i][j][0])
#         temp_y.append(sentence_list[i][j][1])
#     X.append(temp_X)
#     y.append(temp_y)
    
def split_feature_label(sentence_list): 
    X = [] 
    y = [] 
    for i in range(len(sentence_list)):
        temp_X = []
        temp_y = [] 
        for j in range(len(sentence_list[i])):
            temp_X.append(sentence_list[i][j][0])
            temp_y.append(sentence_list[i][j][1])
        X.append(temp_X)
        y.append(temp_y)
    return X, y                   

In [16]:
X_train, y_train = split_feature_label(sentence_list_train)
X_test, y_test = split_feature_label(sentence_list_test)

In [17]:
len(X_test)

39718

In [18]:
get_max_seq_len = 0 
for i in range(len(sentence_list_train)):
    if len(sentence_list_train[i]) > get_max_seq_len: 
        get_max_seq_len = len(sentence_list_train[i])
print(get_max_seq_len)

147


In [19]:
MAX_SEQ_PAD = 150 

In [20]:
def pad_trim_seq(sequence, max_len = MAX_SEQ_PAD):
    sequence = sequence.copy()
    if len(sequence)> max_len: 
        sequence = sequence[:max_len]
    elif len(sequence)< max_len: 
        seq_len = len(sequence)
        for _ in range(max_len - seq_len): 
            sequence.append('[PAD]')
    return sequence 
        

In [21]:
def pad_list_sequences(X,y):
    padded_X = [] 
    padded_y = [] 
    for i in range(len(X)):
        padded_X.append(pad_trim_seq(X[i]))
        padded_y.append(pad_trim_seq(y[i]))
    return padded_X, padded_y


In [23]:
padded_X_train, padded_y_train = pad_list_sequences(X_train, y_train)
padded_X_test, padded_y_test = pad_list_sequences(X_test, y_test)

In [24]:
padded_X_train[0]

['ansin',
 ')',
 'tá',
 'níos',
 'lú',
 'gaeilge',
 'ag',
 'na',
 'gardaí',
 'ná',
 'bí',
 'ariamh',
 'ainneoin',
 'na',
 'cearta',
 '.',
 'níl',
 'sé',
 'ach',
 'roinnt',
 'seachtainí',
 'ó',
 'sin',
 'a',
 'tógadh',
 'fear',
 'bocht',
 'a',
 'tug',
 'ainm',
 'gaeilge',
 'dóibh',
 '.',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '

In [25]:
def encode_sequence(sequence, dictionary):
    encoded_seq = [] 
    for i in range(len(sequence)):
        try: 
            encoded_seq.append(dictionary[sequence[i]])
        except:
            encoded_seq.append(dictionary['[UKN]'])
    return encoded_seq 

In [26]:
encoded_X_train = [] 
encoded_y_train = []
for i in range(len(padded_X_train)):
    encoded_X_train.append(encode_sequence(padded_X_train[i], word_dict))
    encoded_y_train.append(encode_sequence(padded_y_train[i], tag_dict))
    
encoded_X_test = [] 
encoded_y_test = []
for i in range(len(padded_X_test)):
    encoded_X_test.append(encode_sequence(padded_X_test[i], word_dict))
    encoded_y_test.append(encode_sequence(padded_y_test[i], tag_dict))

## Character level encoding 

In [27]:
max_char_num = 0 
for i in range(len(X_train)):
    temp_encoded_chars = [] 
    for j in range(len(X_train[i])):
        if len(X_train[i][j])> max_char_num: 
            max_char_num = len(X_train[i][j])
            max_word = X_train[i][j]
#             print(max_word)
            
print(max_char_num)

69


In [28]:
MAX_CHAR_NUM = 20 

In [108]:
# padded_char_X = [] 
# for i in range(len(padded_X)):
#     temp_list = [] 
#     for j in range(len(padded_X[i])):
#         temp_char_list = [c for c in padded_X[i][j]]
#         temp_char_list = pad_trim_seq(temp_char_list, MAX_CHAR_NAME)
#         temp_list.append(temp_char_list)
#     padded_char_X.append(temp_list)
    
def pad_list_chars(padded_X):
    padded_char_X = [] 
    for i in range(len(padded_X)):
        temp_list = [] 
        for j in range(len(padded_X[i])):
            temp_char_list = [c for c in padded_X[i][j]]
            temp_char_list = pad_trim_seq(temp_char_list, MAX_CHAR_NUM)
            temp_list.append(temp_char_list)
        padded_char_X.append(temp_list)
    return padded_char_X
        


In [109]:
padded_char_X_train = pad_list_chars(padded_X_train)
padded_char_X_test = pad_list_chars(padded_X_test)

In [62]:
# padded_char_X = [] 
# for i in range(len(padded_X)):
#     temp_list = [] 
#     for j in range(len(padded_X[i])):
#         temp_char_list = [c for c in padded_X[i][j]]
# #         temp_char_list = pad_trim_seq(temp_char_list, MAX_CHAR_NAME)
#         temp_list.append(temp_char_list)
#     padded_char_X.append(temp_list)
    
def unpad_list_chars(padded_X):
    padded_char_X = [] 
    for i in range(len(padded_X)):
        temp_list = [] 
        for j in range(len(padded_X[i])):
            temp_char_list = [c for c in padded_X[i][j]]
    #         temp_char_list = pad_trim_seq(temp_char_list, MAX_CHAR_NAME)
            temp_list.append(temp_char_list)
        padded_char_X.append(temp_list)
    return padded_char_X

In [63]:
padded_char_X_train = unpad_list_chars(padded_X_train)
padded_char_X_test = unpad_list_chars(padded_X_test)

In [110]:
len_set_train = []
for i in range(len(padded_char_X_train)):
    for j in range(len(padded_char_X_train[i])):
        len_set_train.append(len(padded_char_X_train[i][j]))
        
        
len_set_test = []
for i in range(len(padded_char_X_test)):
    for j in range(len(padded_char_X_test[i])):
        len_set_test.append(len(padded_char_X_test[i][j]))

In [111]:
len(padded_char_X_train[0])

150

In [35]:
# Encoding characters for character level model training 

# encoded_X_chars = [] 
# for i in range(len(padded_char_X)):
#     temp_encoded_chars = [] 
#     for j in range(len(padded_char_X[i])):
#         temp_encoded_chars.append(encode_sequence(padded_char_X[i][j], char_dict))
#     encoded_X_chars.append(temp_encoded_chars)
    
def encode_chars_list(padded_char_X):
    encoded_X_chars = [] 
    for i in range(len(padded_char_X)):
        temp_encoded_chars = [] 
        for j in range(len(padded_char_X[i])):
            temp_encoded_chars.append(encode_sequence(padded_char_X[i][j], char_dict))
        encoded_X_chars.append(temp_encoded_chars)
    return encoded_X_chars


In [112]:
encoded_X_chars_train =  encode_chars_list(padded_char_X_train)
encoded_X_chars_test =  encode_chars_list(padded_char_X_test)

In [113]:
for i in range(len(encoded_X_chars_train)):
    for j in range(len(encoded_X_chars_train[i])):
        encoded_X_chars_train[i][j] = np.array(encoded_X_chars_train[i][j])

for i in range(len(encoded_X_chars_test)):
    for j in range(len(encoded_X_chars_test[i])):
        encoded_X_chars_test[i][j] = np.array(encoded_X_chars_test[i][j])
    

In [94]:
def one_hot_encoder(input_list, dictionary = char_dict):
    encoded_data = [] 
    char_len = len(char_dict)
    for i in range(len(input_list)):
        temp_char_list = []
        for j in range(len(input_list[i])): 
            temp_vec = np.zeros(char_len)
            temp_vec[input_list[i][j]] = 1 
            temp_char_list.append(temp_vec)
#         temp_char_list = np.array(temp_char_list)
        encoded_data.append(temp_char_list)
    return encoded_data

In [96]:
one_hot_X_chars_train = one_hot_encoder(encoded_X_chars_train[:3])
one_hot_X_chars_test = one_hot_encoder(encoded_X_chars_test[:3])

In [71]:
# encoded_X_chars_tensor_train = np.array(encoded_X_chars_train)
# encoded_X_chars_tensor_test = np.array(encoded_X_chars_test)

In [84]:
encoded_X_chars_test[0][9]

array([210,   1, 151,   1, 147])

In [38]:
encoded_X_tensor_train = np.array(encoded_X_train)
encoded_X_tensor_test = np.array(encoded_X_test)

In [39]:
encoded_y_tensor_train = np.array(encoded_y_train)
encoded_y_tensor_test = np.array(encoded_y_test)

In [40]:
# encoded_X_chars = [] 
# for i in range(len(padded_char_X)):
#     temp_encoded_chars = [] 
#     for j in range(len(padded_char_X[i])):
#         temp_encoded_chars.append(encode_sequence(padded_char_X[i][j], char_dict))
#     encoded_X_chars.append(temp_encoded_chars)

In [115]:
encoded_X_chars_train[1][:5][:5][:5]

[array([ 51, 115, 236,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 array([209,   3, 169,  42,  47,  85,   3,  50, 171,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 array([65,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0]),
 array([ 42,  47,  21, 125, 171,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]),
 array([65, 41,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0])]

In [42]:
# with open('encoded_X_chars_unpadded.lis', 'wb') as fp:
#     pickle.dump(encoded_X_chars, fp)
    

In [43]:
# with open('encoded_X_chars.list', 'wb') as fp:
#     pickle.dump(encoded_X_chars, fp)
    
# with open('encoded_X.list', 'wb') as fp:
#     pickle.dump(encoded_X, fp)
    
# with open('encoded_y.list', 'wb') as fp:
#     pickle.dump(encoded_y, fp)

In [44]:
# with open('encoded_X_chars.list', 'rb') as f:
#     encoded_X_chars = pickle.load(f)
    
# with open('encoded_X.list', 'rb') as f:
#     encoded_X = pickle.load(f)
    
# with open('encoded_y.list', 'rb') as f:
#     encoded_y = pickle.load(f)

In [45]:
# encoded_X_chars_tensor = np.array(encoded_X_chars) 
# with open('encoded_X_chars.ten', 'wb') as fp:
#     pickle.dump(encoded_X_chars_tensor, fp)
    
# del encoded_X_chars

# encoded_X_tensor = np.array(encoded_X)
# with open('encoded_X.ten', 'wb') as fp:
#     pickle.dump(encoded_X_tensor, fp)
    
# del encoded_X

# encoded_y_tensor = np.array(encoded_y)
# with open('encoded_y.ten', 'wb') as fp:
#     pickle.dump(encoded_y_tensor, fp)
# del encoded_y


# ----------------------------------------------


# with open('encoded_X_chars.ten', 'wb') as fp:
#     pickle.dump(encoded_X_chars_tensor, fp)
    
# with open('encoded_X.ten', 'wb') as fp:
#     pickle.dump(encoded_X_tensor, fp)
    
# with open('encoded_y.ten', 'wb') as fp:
#     pickle.dump(encoded_y_tensor, fp)

In [46]:
with open('encoded_X_chars.ten', 'rb') as f:
    encoded_X_chars_tensor = pickle.load(f)
    
with open('encoded_X.ten', 'rb') as f:
    encoded_X_tensor = pickle.load(f)
    
with open('encoded_y.ten', 'rb') as f:
    encoded_y_tensor = pickle.load(f)

# with open('encoded_X_chars_unpadded.lis', 'rb') as f:
#     encoded_X_chars_tensor_unpadded = pickle.load(f)
    

In [47]:
print(encoded_X_chars_tensor.shape)
print(encoded_X_tensor.shape)

(395923, 150, 20)
(395923, 150)


In [48]:
# print(encoded_y_chars_tensor.shape)
print(encoded_y_tensor.shape)

(395923, 150)


In [49]:
X_train, X_test, y_train, y_test = (encoded_X_tensor_train, encoded_X_tensor_test,
                                    encoded_y_tensor_train, encoded_y_tensor_test)

In [50]:
X_train.shape

(395923, 150)

In [52]:
input_words_input = len(word_dict)
embedding_vector_length = 32

inputs = tf.keras.Input(shape=(encoded_X_tensor.shape[-1],), name="word_input")
x = tf.keras.layers.Embedding(input_words_input, embedding_vector_length, input_length = encoded_X_tensor.shape[-1])(inputs)

x = tf.keras.layers.Dropout(0.2)(x)

# x = tf.keras.layers.LSTM(128,return_sequences=True)(x) # 128
# x = tf.keras.layers.Dropout(0.2)(x)

x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True))(x) # 128
x = tf.keras.layers.Dropout(0.2)(x)

# dense_layer = tf.keras.layers.Dense(64, activation="tanh", name="dense_1")
# x = tf.keras.layers.TimeDistributed(dense_layer)(x)

output_layer = tf.keras.layers.Dense(len(tag_dict), activation="softmax", name="predictions")
outputs = tf.keras.layers.TimeDistributed(output_layer)(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)


# recall = tf.keras.metrics.Recall(class_id=4)
scce = tf.keras.metrics.SparseCategoricalCrossentropy()

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy',scce]) # recall, sparse_categorical_cross_entropy
print(model.summary())

model.fit(X_train, y_train, validation_split =0.2, epochs=6, batch_size=32)

scores = model.evaluate(X_test, y_test, verbose=0)
print("The final accuracy in test set is: %.2f%%" % (scores[1]*100))


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_input (InputLayer)     [(None, 150)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 150, 32)           5488544   
                                                                 
 dropout_2 (Dropout)         (None, 150, 32)           0         
                                                                 
 bidirectional_1 (Bidirectio  (None, 150, 256)         164864    
 nal)                                                            
                                                                 
 dropout_3 (Dropout)         (None, 150, 256)          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 150, 6)           1542      
 tributed)                                                 

In [53]:
print()
print('Acc score: %.2f%%' % (scores[1]*100))
# print('Recall score: %.2f' % (scores[2]))
print('Cross entropy loss: %.4f' % (scores[2]))


Acc score: 99.66%
Cross entropy loss: 0.0107


## Character level

In [54]:
X_train, X_test, y_train, y_test = (encoded_X_tensor_train, encoded_X_tensor_test,
                                    encoded_y_tensor_train, encoded_y_tensor_test)

In [61]:
encoded_X_chars_tensor[0][5].shape

(20,)

In [118]:
X_train, X_test, y_train, y_test = (encoded_X_tensor_train, encoded_X_tensor_test, encoded_y_tensor_train, encoded_y_tensor_test)
X_train_chars_one_hot, X_test_chars_one_hot, y_train, y_test = (one_hot_X_chars_train, one_hot_X_chars_test, encoded_y_tensor_train, encoded_y_tensor_test)
X_train_chars, X_test_chars, y_train, y_test = (np.array(encoded_X_chars_train), np.array(encoded_X_chars_test), encoded_y_tensor_train, encoded_y_tensor_test)


# X_train_chars_e5, X_test_chars_e5, y_train, y_test = train_test_split(encoded_X_chars_tensor_unpadded[:,:,-5:], encoded_y_tensor, test_size=0.2, random_state=42)

In [119]:
# X_train_chars_b5, X_test_chars_b5
# X_train_chars_e5, X_test_chars_e5

# X_train_chars = np.concatenate([X_train_chars_b5, X_train_chars_e5], -1)
# X_test_chars = np.concatenate([X_test_chars_b5, X_test_chars_e5], -1)

In [120]:
X_train_chars.shape

(395923, 150, 20)

150

In [122]:
input_words_input = len(word_dict)
input_chars_input = len(char_dict)

embedding_vector_length = 32

inputs1 = tf.keras.Input(shape=(encoded_X_tensor.shape[-1],), name="word_input")
inputs2 = tf.keras.Input(shape=(X_train_chars.shape[-2], X_train_chars.shape[-1],), name="char_input")
x1 = tf.keras.layers.Embedding(input_words_input, embedding_vector_length, input_length = encoded_X_tensor.shape[-1])(inputs1)
x2 = tf.keras.layers.Embedding(input_chars_input, embedding_vector_length, input_length = encoded_X_chars_tensor.shape[-1])(inputs2)
x2 = tf.reshape(x2, [tf.shape(x2)[0], tf.shape(x2)[1], tf.shape(x2)[2]*tf.shape(x2)[3]])
x = tf .concat([x1,x2], 2)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences=True))(x) # 128
x = tf.keras.layers.Dropout(0.2)(x)
output_layer = tf.keras.layers.Dense(len(tag_dict), activation="softmax", name="predictions")
outputs = tf.keras.layers.TimeDistributed(output_layer)(x)

model = tf.keras.Model(inputs=(inputs1, inputs2), outputs=outputs)

# recall = tf.keras.metrics.Recall(class_id=4)
scce = tf.keras.metrics.SparseCategoricalCrossentropy()

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy',scce]) # recall, sparse_categorical_cross_entropy
print(model.summary())

model.fit((X_train, X_train_chars), y_train, validation_split =0.2, epochs=5, batch_size=32)

scores = model.evaluate((X_test, X_test_chars), y_test, verbose=0)
print("The final accuracy in test set is: %.2f%%" % (scores[1]*100))


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, 150, 20)]    0           []                               
                                                                                                  
 embedding_13 (Embedding)       (None, 150, 20, 32)  7648        ['char_input[0][0]']             
                                                                                                  
 tf.compat.v1.shape_18 (TFOpLam  (4,)                0           ['embedding_13[0][0]']           
 bda)                                                                                             
                                                                                                  
 tf.compat.v1.shape_19 (TFOpLam  (4,)                0           ['embedding_13[0][0]']     

In [123]:
print()
print('Acc score: %.2f%%' % (scores[1]*100))
print('Cross entropy loss: %.4f' % (scores[2]))


Acc score: 99.69%
Cross entropy loss: 0.0097


In [72]:
X_train_chars.shape

(316738, 150, 10)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(encoded_X_tensor, encoded_y_tensor, test_size=0.2, random_state=42)
X_train_chars, X_test_chars, y_train, y_test = train_test_split(encoded_X_chars_tensor_unpadded, encoded_y_tensor, test_size=0.2, random_state=42)
# X_train_chars_e5, X_test_chars_e5, y_train, y_test = train_test_split(encoded_X_chars_tensor_unpadded[:,:,-5:], encoded_y_tensor, test_size=0.2, random_state=42)

In [103]:
type(X_train_chars[0][0])

numpy.ndarray

In [126]:
encoded_X_chars_tensor.shape[-2]

150

In [127]:
input_words_input = len(word_dict)
input_chars_input = len(char_dict)

embedding_vector_length = 32

inputs1 = tf.keras.Input(shape=(encoded_X_tensor.shape[-1],), name="word_input")
x1 = tf.keras.layers.Embedding(input_words_input, embedding_vector_length, input_length = encoded_X_tensor.shape[-1])(inputs1)


### ---- 

# inputs1 = tf.keras.Input(shape=(encoded_X_tensor.shape[-1],), name="word_input")
# inputs2 = tf.keras.Input(shape=(len(char_dict)), name="char_input")

# x1 = tf.keras.layers.Embedding(input_words_input, embedding_vector_length, input_length = encoded_X_tensor.shape[-1])(inputs1)
# x2 = tf.keras.layers.Embedding(input_chars_input, embedding_vector_length, input_length = encoded_X_chars_tensor.shape[-1])(inputs2)


# ------

inputs2 = tf.keras.Input(shape=(len(char_dict)), name="char_input")
x2 = tf.keras.layers.Embedding(input_chars_input, embedding_vector_length, input_length = encoded_X_chars_tensor.shape[-2])(inputs2)
print(x2.shape)
x2 = tf.keras.layers.TimeDistributed(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))(x2) # 128
print(x2.shape)
x2 = tf.keras.layers.Dropout(0.2)(x2)
x2_dense_2 = tf.keras.layers.Dense(8, activation="tanh", name="x2_dense")
x2 = x2_dense_2(x2)


# x2 = tf.reshape(x2, [tf.shape(x2)[0], tf.shape(x2)[1], tf.shape(x2)[2]*tf.shape(x2)[3]])
x = tf.keras.layers.concatenate([x1,x2],axis=-1)

x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x) # 128
x = tf.keras.layers.Dropout(0.2)(x)



output_layer = tf.keras.layers.Dense(len(tag_dict), activation="softmax", name="predictions")
outputs = tf.keras.layers.TimeDistributed(output_layer)(x)

model = tf.keras.Model(inputs=(inputs1, inputs2), outputs=outputs)


# recall = tf.keras.metrics.Recall(class_id=4)
scce = tf.keras.metrics.SparseCategoricalCrossentropy()

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy',scce]) # recall, sparse_categorical_cross_entropy
print(model.summary())

model.fit((X_train, X_train_chars_one_hot), y_train, validation_split =0.2, epochs=10, batch_size=16)

scores = model.evaluate((X_test, X_test_chars), y_test, verbose=0)
print("The final accuracy in test set is: %.2f%%" % (scores[1]*100))


In [54]:
input_words_input = len(word_dict)
input_chars_input = len(char_dict)

embedding_vector_length = 32

inputs1 = tf.keras.Input(shape=(encoded_X_tensor.shape[-1],), name="word_input")
x1 = tf.keras.layers.Embedding(input_words_input, embedding_vector_length, input_length = encoded_X_tensor.shape[-1])(inputs1)
x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True))(x1) # 128
x1 = tf.keras.layers.Dropout(0.2)(x1)



inputs2 = tf.keras.Input(shape=(encoded_X_chars_tensor.shape[-2], encoded_X_chars_tensor.shape[-1],), name="char_input")
x2 = tf.keras.layers.Embedding(input_chars_input, embedding_vector_length, input_length = encoded_X_chars_tensor.shape[-1])(inputs2)
print(x2.shape)
x2 = tf.keras.layers.TimeDistributed(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))(x2) # 128
print(x2.shape)
x2 = tf.keras.layers.Dropout(0.2)(x2)

### ----- Start Experimental -------
x2 = tf.reshape(x2, [tf.shape(x2)[0], tf.shape(x2)[1], tf.shape(x2)[2]*tf.shape(x2)[3]])

# x2_dense_2 = tf.keras.layers.Dense(8, activation="tanh", name="x2_dense")
# x2 = x2_dense_2(x2)

### ----- End Experimental -------


# x2 = tf.reshape(x2, [tf.shape(x2)[0], tf.shape(x2)[1], tf.shape(x2)[2]*tf.shape(x2)[3]])
x = tf.keras.layers.concatenate([x1,x2],axis=-1)

x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x) # 128
x = tf.keras.layers.Dropout(0.2)(x)



output_layer = tf.keras.layers.Dense(len(tag_dict), activation="softmax", name="predictions")
outputs = tf.keras.layers.TimeDistributed(output_layer)(x)

model = tf.keras.Model(inputs=(inputs1, inputs2), outputs=outputs)


# recall = tf.keras.metrics.Recall(class_id=4)
scce = tf.keras.metrics.SparseCategoricalCrossentropy()

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy',scce]) # recall, sparse_categorical_cross_entropy
print(model.summary())

model.fit((X_train, X_train_chars), y_train, validation_split =0.2, epochs=10, batch_size=16)

scores = model.evaluate((X_test, X_test_chars), y_test, verbose=0)
print("The final accuracy in test set is: %.2f%%" % (scores[1]*100))


(None, 150, 20, 32)
(None, 150, 64)
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 char_input (InputLayer)        [(None, 150, 20)]    0           []                               
                                                                                                  
 word_input (InputLayer)        [(None, 150)]        0           []                               
                                                                                                  
 embedding_23 (Embedding)       (None, 150, 20, 32)  7648        ['char_input[0][0]']             
                                                                                                  
 embedding_22 (Embedding)       (None, 150, 32)      5488544     ['word_input[0][0]']             
                                                        

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
print()
print('Acc score: %.2f%%' % (scores[1]*100))
# print('Recall score: %.2f' % (scores[2]))
print('Cross entropy loss: %.4f%%' % (scores[2]))

## From Scratch Implementation 

## From Scratch Implementation

The code from this section is mainly borrowed from the following sources: 
1. https://github.com/kscanne/5755/tree/master/mutations
2. https://www.mygreatlearning.com/blog/pos-tagging/

In [1]:
import os 
import sys 
import random 

import numpy as np 
import pandas as pd 
import pickle
import time

import nltk 
nltk.download('punkt')


import tensorflow as tf

from sklearn.model_selection import train_test_split


[nltk_data] Downloading package punkt to /student/mrahbar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2022-09-23 22:53:54.472448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-23 22:53:54.692126: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-09-23 22:53:54.732474: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-23 22:53:59.

In [2]:
with open('train.tsv', 'r') as f: 
    train_data = f.read()
    
with open('test.tsv', 'r') as f: 
    test_data = f.read()

In [3]:
splited_lines_train = train_data.strip('\n').split('\n')
splited_lines_test = test_data.strip('\n').split('\n')

for i in range(len(splited_lines_train)): 
    splited_lines_train[i] = splited_lines_train[i].strip(' ').split('\t')
    
for i in range(len(splited_lines_test)): 
    splited_lines_test[i] = splited_lines_test[i].strip(' ').split('\t')

In [15]:
splited_lines_train[0]

['ansin', 'N']

In [5]:
def get_sentence_list(splited_lines):
    temp_tokens= [] 
    sentence_list = []
    global_list_of_words = [] 
    global_list_of_chars = []
    global_list_of_tags = []
    for w,t in splited_lines: 
        global_list_of_words.append(w)
        global_list_of_chars += [c for c in w]
        global_list_of_tags.append(t)
        if w == '<S>':
            sentence_list.append(temp_tokens)
            temp_tokens = []
            temp_tokens.append(tuple((w,t)))
        else:
            temp_tokens.append(tuple((w,t)))
    sentence_list.append(temp_tokens)
    return sentence_list, global_list_of_words, global_list_of_chars, global_list_of_tags


In [6]:
sentence_list_train, global_list_of_words, global_list_of_chars, global_list_of_tags = get_sentence_list(splited_lines_train)
sentence_list_test, _, _, _= get_sentence_list(splited_lines_test)

In [7]:
Xy_train, Xy_test = (sentence_list_train, sentence_list_test)

In [8]:
train_tagged_words = [ tup for sent in Xy_train for tup in sent]
test_tagged_words = [ tup for sent in Xy_test for tup in sent]
print(len(train_tagged_words))
print(len(test_tagged_words))

10000000
1000000


In [9]:
train_tagged_words[3]

('níos', 'N')

In [10]:
#use set datatype to check how many unique tags are present in training data
tags = {tag for word,tag in train_tagged_words}
print(len(tags))
print(tags)
 
# check total words in vocabulary
vocab = {word for word,tag in train_tagged_words}

5
{'S', 'H', 'N', 'U', 'T'}


In [11]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)
 
     
    return (count_w_given_tag, count_tag)

In [12]:
# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

In [13]:
# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)
 
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)): 
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]
 
print(tags_matrix)

[[5.1699642e-02 1.5994346e-03 9.4544768e-01 1.1905875e-03 6.2662497e-05]
 [3.9538410e-02 2.3849746e-03 9.5707047e-01 8.3225680e-04 1.7390441e-04]
 [1.0547736e-01 9.1363927e-03 8.4339511e-01 3.7937343e-02 4.0536942e-03]
 [3.8271528e-02 6.9395616e-04 9.6029162e-01 6.9089909e-04 5.1970284e-05]
 [5.7859290e-02 2.8943976e-03 9.3898839e-01 8.5972206e-05 1.7194441e-04]]


In [14]:
# convert the matrix to a df for better readability
#the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns = list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,S,H,N,U,T
S,0.0517,0.001599,0.945448,0.001191,6.3e-05
H,0.039538,0.002385,0.95707,0.000832,0.000174
N,0.105477,0.009136,0.843395,0.037937,0.004054
U,0.038272,0.000694,0.960292,0.000691,5.2e-05
T,0.057859,0.002894,0.938988,8.6e-05,0.000172


In [66]:
trained_flat = [w for w,t in train_tagged_words]  

In [67]:
trained_flat[:10]

['ansin', ')', 'tá', 'níos', 'lú', 'gaeilge', 'ag', 'na', 'gardaí', 'ná']

In [None]:
probablity_dictionary = {}
for w in trained_flat: 
    if w not in probablity_dictionary.keys(): 
        probablity_dictionary[w] = {}
    for t in tags: 
        if t not in probablity_dictionary[w].keys():
            probablity_dictionary[w][t] = word_given_tag(w, t)

In [None]:
with open('probability.dict', 'wb') as fp:
    pickle.dump(probablity_dictionary, fp)

In [56]:
word_given_tag('bí', 'N',)

(824, 8584022)

In [49]:
def Viterbi(words, train_bag = train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))
    print(T)
    for key, word in enumerate(words):
        #initialise list of probability column for a given observation
        p = [] 
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['N', tag]
                try:
                    transition_p = tags_df.loc[state[-1], tag]
                except:
                    pass
                 
            # compute emission and state probabilities
            emission_p = probablity_dictionary[words[key]][tag][0]/probablity_dictionary[words[key]][tag][1]
            state_probability = emission_p * transition_p    
            p.append(state_probability)
             
        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)] 
        state.append(state_max)
    return list(zip(words, state))

In [50]:
empty_list = [] 
for i in range(len(Xy_test)):
    if len(Xy_test)==0:
        empty_list.append(i)

In [51]:
empty_list

[]

In [105]:
accuracy_list = []
counter = 0
for i in range(len(Xy_test)): 
    test_run = [Xy_test[i]]
    # list of tagged words
    test_run_base = [tup for sent in test_run for tup in sent]

    # list of untagged words
    test_tagged_words = [tup[0] for sent in test_run for tup in sent]
    
    
    #Here We will only test 10 sentences to check the accuracy
    #as testing the whole training set takes huge amount of time
    start = time.time()
    tagged_seq = Viterbi(test_tagged_words)
    end = time.time()
    difference = end-start

    # accuracy
    check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
    try:
        accuracy = len(check)/len(tagged_seq)
        accuracy_list.append(accuracy*100)
    except:
        pass
    counter += 1 
    if counter%10==0 : 
        print("Sentence %d is processed."%(counter))



In [37]:
test_tagged_words

['<S>', "d'fhás", 'martin', 'walser', 'aníos', 'faoi']

In [16]:
rndom = [random.randint(1,len(Xy_test)) for x in range(10)]
test_run = [Xy_test[i] for i in rndom]
 
# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]
 
# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [28]:
test_tagged_words[10]

'amárach'

In [None]:
#Here We will only test 10 sentences to check the accuracy
#as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start
 
print("Time taken to run viterbi on test set is in seconds: ", difference)
 
# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy on the test set is: ',accuracy*100)


In [17]:
cfd = nltk.ConditionalFreqDist(splited_lines_train)

In [20]:
best_tags = dict((w, cfd[w].max()) for w in global_list_of_words)

In [24]:
flipped = [(t,w) for sent in sentence_list_train for (w,t) in sent]
wordgiventag = nltk.ConditionalFreqDist(flipped)

In [25]:
# this is P(w|t), unsmoothed!
def P(w,t):
    return wordgiventag[t][w] / wordgiventag[t].N()

In [26]:
tag_bigrams = [(x,y) for sent in sentence_list_train for x,y in nltk.bigrams([t for (w,t) in sent])]

In [28]:
tag_bigrams[0]

('N', 'N')

In [29]:
tag_bigram_counts = nltk.ConditionalFreqDist(tag_bigrams)
# this is count of noun tags following adjective tags (normal order in English)
print(tag_bigram_counts['N']['N'])


6849694


In [31]:
# this is P(t2|t1), unsmoothed again!
def tagP(t2,t1):
    return tag_bigram_counts[t1][t2] / tag_bigram_counts[t1].N()

In [32]:
sentence_start = nltk.FreqDist(sent[0][1] for sent in sentence_list_train)
def initP(t):
    return sentence_start[t] / sentence_start.N()

In [36]:
def argmax(V,tag_list,t,i):
    ans=-1
    best=None
    for s in tag_list:
        temp=V[(s,i-1)]*tagP(t,s)
        if temp > ans:
            ans = temp
            best = s
    return (best,ans)

In [37]:
def printV(sentence,tag_list,V,B):
    for i in range(len(sentence)):
        print('i='+str(i)+' ['+sentence[i]+']')
        for t in tag_list:
            if V[(t,i)] != 0:
                toprint='  '+t+'='+str(V[(t,i)])
                if i>0:
                    toprint += ' (from '+B[(t,i)]+')'
                print(toprint)
    

In [118]:
def viterbi(sentence, labels ):
    V = dict()    # keys are (t,i) where t is a tag (row label) and i is position in sentence (column label)
    B = dict()    # same keys as V; this stores the "backpointers" to remember best tag sequence
    tag_list = sentence_start.keys()
    for t in tag_list:
        V[(t,0)] = initP(t)*P(sentence[0],t)
    for i in range(1,len(sentence)):
        for t in tag_list:
            pair = argmax(V,tag_list,t,i)
            B[(t,i)] = pair[0]
            V[(t,i)] = pair[1]*P(sentence[i],t)
    counter = 0
    for i in range(len(list(V.keys()))-1):
        
        if labels[i] == B[list(B.keys())[i]]:
            counter += 1 
    return 100*counter / (len(labels)-1)

In [122]:
accuracy_list = []
counter = 0
total = 0
for i in range(len(Xy_test)): 
    test_run = [Xy_test[i]]
    # list of tagged words
    test_run_base = [tup for sent in test_run for tup in sent]

    # list of untagged words
    test_tagged_words = [tup[0] for sent in test_run for tup in sent]
    test_tagged_labels = [tup[1] for sent in test_run for tup in sent]
    
    try: 
        total += viterbi(test_tagged_words, test_tagged_labels)
        counter += 1 
    except: 
        pass
    
average = total/counter


In [123]:
average

85.87601429414087