In [1]:
import pickle
import pandas as pd
import numpy as np

from ner_utils.Sentence import Sentence
from ner_utils.Data     import Data

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, LSTM, Embedding, Concatenate, Reshape, Permute, Lambda, Bidirectional

### Character Tokenization

In [2]:
data = Data('data/ner_datasetreference.csv', encoding='latin1')
sentence = Sentence(data.dataset)

In [3]:
word_list, sentence_list = sentence.get_words_and_sentences()
word_sequences, sentence_sequences = data.get_tokenized_sequences(word_list, sentence_list)

In [4]:
char_to_idx, word_to_idx, idx_to_char, idx_to_word = data._get_tokens(word_list, sentence_list)

### Char Embeddings

In [5]:
# getting char voacb
# with open('unique_chars.pkl', 'wb') as f:
#     pickle.dump(sentence.get_unique_chars(), f)

# loading char vocab   
with open('unique_chars.pkl', 'rb') as f:
    chars = pickle.load(f)

In [6]:
# Word info to pass to the model
max_word_len, min_word_len, avg_word_len, std_word_len = sentence.get_word_info()
avg_word_len, std_word_len

(4.773359082564433, 2.8246976648249014)

In [7]:
# Sentence info to pass to the model
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len = sentence.get_sentence_info()
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len

(104, 1, 21.863987989741236, 7.963596820721575)

In [8]:
#Word Level Hyper Parameters
model_word_len     = int(np.ceil(avg_word_len + 2*std_word_len))
char_embedding_dim = 35
char_vocab_size    = len(char_to_idx) + 1 #add 1 for padding token
window_size        = 3
conv_filters       = 30 
dropout_rate       = 0.5

#Sentence Level Hyper Parameters
model_sentence_len = int(np.ceil(avg_sentence_len + 2*std_sentence_len))
word_embedding_dim = 100
word_vocab_size    = len(word_to_idx) + 1 #add 1 for padding token


model_word_len, model_sentence_len, word_vocab_size

(11, 38, 31819)

In [9]:
def get_char_embedding_matrix(embedding_filename, vocab_size, embedding_dim, char_to_idx):
    
    with open(embedding_filename, 'rb') as f:
        char_embeddings = pickle.load(f)
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for char, embedding in char_embeddings.items():
        embedding_matrix[char_to_idx[char]] = embedding
        
    return embedding_matrix  

In [10]:
char_embedding_matrix = get_char_embedding_matrix('char_embeddings_with_features.pkl', char_vocab_size, char_embedding_dim, char_to_idx)


In [12]:
#word_embedding_martrix = get_char_embedding_matrix()

### Defining the Model

In [12]:
#Word Part of Model
char_inputs = Input(shape=(model_sentence_len, model_word_len))
x           = Lambda(lambda x: tf.reshape(x, (model_sentence_len, model_word_len)))(char_inputs)
x           = Embedding(char_vocab_size,
                            char_embedding_dim,
                            weights=[char_embedding_matrix],
                            input_length=max_word_len,
                            trainable=True)(x)
x           = Permute((2, 1), input_shape=(11, 35))(x)
x           = Dropout(dropout_rate)(x)
x           = Reshape((35, 11, 1))(x)
x           = Conv2D(conv_filters, input_shape=(38, 35, 11, 1), kernel_size=(1, window_size), padding='same')(x)
x           = MaxPooling2D(pool_size=(1, model_word_len))(x)
x           = Dense(1, input_shape=(35, 1, 30))(x)
x           = Reshape((35, ))(x)


#Sentence Part of Model
word_inputs = Input(shape=(model_sentence_len, ))
y           = Embedding(word_vocab_size,
                            word_embedding_dim,
                            #weights=[word_embedding_matrix],
                            input_length=max_sentence_len,
                            trainable=False)(word_inputs)


#Joining the two parts
x = Lambda(lambda x: tf.expand_dims(x, axis=0))(x)
z = Concatenate(axis=2)([y, x])
z = Dropout(dropout_rate)(z)
z = Bidirectional(LSTM())

outputs = z

# outputs = x


In [13]:
tf.random.set_seed(10)
model = Model(inputs=[char_inputs, word_inputs], outputs=outputs)

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 38, 11)]     0                                            
__________________________________________________________________________________________________
lambda_2 (Lambda)               (38, 11)             0           input_3[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (38, 11, 35)         3500        lambda_2[0][0]                   
__________________________________________________________________________________________________
permute_1 (Permute)             (38, 35, 11)         0           embedding_2[0][0]                
______________________________________________________________________________________________

In [292]:
np.random.seed(10)
char_inputs_1 = np.random.randint(10, size=(1, 38, 11))
word_inputs_1 = np.random.randint(10, size=(1, 38))

print(f"Char Inputs Shape: {char_inputs.shape}\nWord Inputs Shape: {word_inputs.shape}")



Char Inputs Shape: (1, 38, 11)
Word Inputs Shape: (None, 38)


In [300]:
output = model.predict([char_inputs_1, word_inputs_1], batch_size=1)
output.shape

(1, 38, 135)