## Importing Required Packages

In [1]:
import pickle
import numpy as np

from ner_utils.Sentence         import Sentence
from ner_utils.Data             import Data
from ner_utils.DataInformation  import DataInformation
from ner_utils.DataPreparer     import DataPreparer


import tensorflow as tf
from tensorflow.keras.models        import Model
from tensorflow.keras.layers        import Input, Conv2D, MaxPooling2D, Dropout, Dense, LSTM, Embedding, Concatenate, Reshape, Permute, Lambda, Bidirectional
from tensorflow.keras.optimizers    import SGD

from tensorflow_addons.layers import CRF

## Calulating Data Information 
#### This information will be used in defining hyper parameters

In [2]:
dataset_path     = 'data/ner_datasetreference.csv'
dataset_encoding = 'latin'

In [3]:
data_info = DataInformation(dataset_path, dataset_encoding)

In [11]:
word_len_info       = data_info.get_word_length_info()
sentence_len_info   = data_info.get_sentence_length_info()

## Defining Initial Hyper Parameters

In [12]:
#Word Level Hyper Parameters
model_word_len     = int(np.ceil(word_len_info['avg'] + 2*word_len_info['std']))


#Sentence Level Hyper Parameters
model_sentence_len = int(np.ceil(sentence_len_info['avg'] + 2*sentence_len_info['std']))


## Preparing Data to Input to Model

## Defining Remaining Hyper Parameters

In [None]:
#Word Level Hyper Parameters
char_embedding_dim = 35
char_vocab_size    = len(char_to_idx) + 1 #add 1 for padding token
window_size        = 3
conv_filters       = 30 


#Sentence Level Hyper Parameters
word_embedding_dim = 100
word_vocab_size    = len(word_to_idx) + 1 #add 1 for padding token
lstm_units         = 300

#Other Hyper Pararmeters
dropout_rate    = 0.5
n_tags          = len(tags) 
initial_lr      = 0.015
momentum        = 0.9
decay_rate      = 0.05
grad_clip       = 5

model_word_len, model_sentence_len, word_vocab_size

### Reading Dataset

In [4]:
data = Data('data/ner_datasetreference.csv', encoding='latin1')
sentence = Sentence(data.dataset)

In [5]:
tags = sentence.tags
tags

['n', 'm', 'v', 'B', '-', 'g', 'p', 'I', 't', 'i', 'o', 'e', 'a', 'O', 'r']

## Data Features

In [6]:
# getting char voacb
# with open('unique_chars.pkl', 'wb') as f:
#     pickle.dump(sentence.get_unique_chars(), f)

# loading char vocab   
with open('unique_chars.pkl', 'rb') as f:
    chars = pickle.load(f)

In [43]:
# Word info to pass to the model
max_word_len, min_word_len, avg_word_len, std_word_len = sentence.get_word_info()
avg_word_len, std_word_len,  max_word_len, min_word_len

(4.773359082564433, 2.8246976648249014, 64, 1)

In [8]:
# Sentence info to pass to the model
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len = sentence.get_sentence_info()
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len

(104, 1, 21.863987989741236, 7.963596820721575)

## Tokens

In [9]:
word_list, sentence_list = sentence.get_words_and_sentences()
word_sequences, sentence_sequences = data.get_tokenized_sequences(word_list, sentence_list, max_word_len, max_sentence_len)

In [10]:
char_to_idx, word_to_idx, idx_to_char, idx_to_word = data.get_tokens(word_list, sentence_list)

In [16]:
char_to_idx

{'UNK': 1,
 'e': 2,
 'a': 3,
 't': 4,
 'i': 5,
 'n': 6,
 'o': 7,
 's': 8,
 'r': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'c': 13,
 'u': 14,
 'm': 15,
 'f': 16,
 'p': 17,
 'g': 18,
 'y': 19,
 '.': 20,
 'w': 21,
 'b': 22,
 'v': 23,
 ',': 24,
 'k': 25,
 'S': 26,
 'T': 27,
 'A': 28,
 'M': 29,
 'I': 30,
 '0': 31,
 '-': 32,
 "'": 33,
 'P': 34,
 'C': 35,
 'U': 36,
 'B': 37,
 '1': 38,
 'z': 39,
 'N': 40,
 'H': 41,
 'x': 42,
 '2': 43,
 'q': 44,
 'F': 45,
 'R': 46,
 'E': 47,
 'K': 48,
 'j': 49,
 'O': 50,
 'W': 51,
 'D': 52,
 'G': 53,
 'J': 54,
 'L': 55,
 '9': 56,
 '"': 57,
 '5': 58,
 '3': 59,
 'V': 60,
 '8': 61,
 '4': 62,
 '6': 63,
 '7': 64,
 'Y': 65,
 '$': 66,
 'Z': 67,
 'Q': 68,
 ')': 69,
 '(': 70,
 '%': 71,
 'X': 72,
 ';': 73,
 ':': 74,
 '/': 75,
 '?': 76,
 '&': 77,
 '!': 78,
 '~': 79,
 'é': 80,
 '[': 81,
 ']': 82,
 '#': 83,
 '\x92': 84,
 '\x96': 85,
 '\x85': 86,
 '\x94': 87,
 'ö': 88,
 '+': 89,
 '@': 90,
 '\x93': 91,
 '°': 92,
 '\x97': 93,
 '_': 94,
 'ë': 95,
 'ü': 96,
 '\x91': 97,
 '`': 98,
 '\xa0

# Embeddings

In [17]:
def generate_embedding_matrix_from_file(embedding_filename, vocab_size, embedding_dim, key_to_idx):
    
    with open(embedding_filename, 'r', encoding='latin') as f:
        # char_embeddings = pickle.load(f)
        
        pop = []
        embedding_matrix = np.zeros((vocab_size, embedding_dim))
        for line in f.readlines():
            try:      
                embedding_matrix[key_to_idx[line.split()[0]]] = np.array(line.split()[1:], dtype=np.float32)
            except KeyError: pass
            except ValueError: pass
                
    print(pop)       
    return embedding_matrix  

In [20]:
char_embedding_matrix = generate_embedding_matrix_from_file('char embeddings/char_embeddings_with_features.txt', char_vocab_size, char_embedding_dim, char_to_idx)


[]


In [21]:
word_embedding_martrix = generate_embedding_matrix_from_file('../glove.6B/glove.6B.100d.txt', word_vocab_size, word_embedding_dim, word_to_idx)

[]


### Model Input and Labels

In [27]:
# word and sentence inputs
word_inputs     = word_sequences
sentence_inputs = sentence_sequences

# target labels
y = sentence.get_labels()

In [42]:
np.shape(word_sequences)

  return array(a, dtype, copy=False, order=order)


(47959,)

# Hyper Parameters

(11, 38, 31819)

### Defining the Model

In [1]:
#Word Part of Model
char_inputs = Input(shape=(model_sentence_len, model_word_len))
x           = Lambda(lambda x: tf.reshape(x, (model_sentence_len, model_word_len)))(char_inputs)
x           = Embedding(char_vocab_size,
                            char_embedding_dim,
                            weights=[char_embedding_matrix],
                            input_length=max_word_len,
                            trainable=True)(x)
x           = Permute((2, 1), input_shape=(model_word_len, char_embedding_dim))(x)
x           = Dropout(dropout_rate)(x)
x           = Reshape((char_embedding_dim, model_word_len, 1))(x)
x           = Conv2D(conv_filters, input_shape=(model_sentence_len, char_embedding_dim, model_word_len, 1), kernel_size=(1, window_size), padding='same')(x)
x           = MaxPooling2D(pool_size=(1, model_word_len))(x)
x           = Dense(1, input_shape=(char_embedding_dim, 1, conv_filters))(x)
x           = Reshape((char_embedding_dim, ))(x)
x           = Lambda(lambda x: tf.expand_dims(x, axis=0))(x)


#Sentence Part of Model
word_inputs = Input(shape=(model_sentence_len, ))
y           = Embedding(word_vocab_size,
                            word_embedding_dim,
                            weights=[word_embedding_martrix],
                            input_length=max_sentence_len,
                            trainable=False)(word_inputs)


#Joining the two parts
z = Concatenate(axis=2)([y, x])
z = Dropout(dropout_rate)(z)
z = Bidirectional(LSTM(lstm_units, return_sequences=True, recurrent_dropout=dropout_rate))(z)
z = Dropout(dropout_rate)(z)
z = CRF(n_tags)(z)
# outputs, _,  _, _ = z
outputs = z


NameError: name 'Input' is not defined

In [2]:
tf.random.set_seed(10)
model = Model(inputs=[char_inputs, word_inputs], outputs=outputs)

NameError: name 'tf' is not defined

In [24]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 38, 11)]     0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (38, 11)             0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (38, 11, 35)         3500        lambda[0][0]                     
__________________________________________________________________________________________________
permute (Permute)               (38, 35, 11)         0           embedding[0][0]                  
______________________________________________________________________________________________

In [29]:
def lr_scheduler(epoch, lr): return initial_lr / (1 + decay_rate * epoch)

In [33]:
opt         = SGD(learning_rate=initial_lr, momentum=0.9, clipvalue=grad_clip)
callback    = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

In [None]:
model.compile(optimizer=opt, loss="mse", metrics=["mae"])

In [28]:
history = model.fit([word_sequences, sentence_sequences], y, callbacks=[callback])



ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).