In [12]:
import pickle
import pandas as pd
import numpy as np

from Sentence import Sentence
from Data     import Data

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Embedding, Reshape, Conv1D

### Character Tokenization

In [13]:
# Load the NER dataset CSV (read as latin1) through the project's Data helper.
data = Data('../ner_dataset.csv', encoding='latin1')
# Wrap the loaded dataset; the Sentence object is queried below for words,
# sentences and length statistics.
sentence = Sentence(data.dataset)

In [14]:
# Pull the words and sentences out of the dataset, then convert both into
# integer sequences (the sample output further down shows trailing zeros,
# presumably padding).
word_list, sentence_list = sentence.get_words_and_sentences()
word_sequences, sentence_sequences = data.get_tokenized_sequences(word_list, sentence_list)

In [19]:
# Character/word <-> index mappings (presumably dict-like; char_to_idx is
# indexed by character and measured with len() further down).
# NOTE(review): _get_tokens is underscore-prefixed, i.e. private by convention;
# consider exposing a public accessor on Data instead.
char_to_idx, word_to_idx, idx_to_char, idx_to_word = data._get_tokens(word_list, sentence_list)

In [20]:
# Display the first tokenized sentence as a quick sanity check.
sentence_sequences[0]

array([ 254,    6,  967,   16, 1795,  238,  468,    7,  523,    2,  129,
          5,   61,    9,  571,    2,  833,    6,  186,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

### Char Embeddings

In [21]:
# One-off step that created the character vocabulary file (kept for reference):
# with open('unique_chars.pkl', 'wb') as f:
#     pickle.dump(sentence.get_unique_chars(), f)

# Load the cached character vocabulary.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('unique_chars.pkl', 'rb') as f:
    chars = pickle.load(f)

In [22]:
# Word-length statistics; the mean and standard deviation are used further
# down to choose the word length the model works with.
max_word_len, min_word_len, avg_word_len, std_word_len = sentence.get_word_info()
avg_word_len, std_word_len

(4.773359082564433, 2.8246976648249014)

In [23]:
# Sentence-length statistics; the mean and standard deviation are used further
# down to choose the sentence length the model works with.
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len = sentence.get_sentence_info()
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len

(104, 1, 21.863987989741236, 7.963596820721575)

In [38]:
# --- Word-level hyperparameters ---
# Word length = mean + two standard deviations of the observed word lengths,
# rounded up to the next integer.
model_word_len = int(np.ceil(avg_word_len + 2 * std_word_len))
char_embedding_dim = 35
char_vocab_size = len(char_to_idx) + 1  # +1 for the padding token
window_size = 3
conv_filters = 30
dropout_rate = 0.5

# --- Sentence-level hyperparameters ---
# Same mean + 2*std rule, applied to the sentence lengths.
model_sentence_len = int(np.ceil(avg_sentence_len + 2 * std_sentence_len))
word_embedding_dim = 35
word_vocab_size = len(word_to_idx) + 1  # +1 for the padding token

# Show the derived sizes.
model_word_len, model_sentence_len, word_vocab_size

(11, 38, 31819)

In [25]:
def get_char_embedding_matrix(embedding_filename, vocab_size, embedding_dim, char_to_idx):
    """Build the weight matrix for the character embedding layer.

    Parameters
    ----------
    embedding_filename : str
        Path to a pickle file containing a ``{char: embedding_vector}`` mapping.
    vocab_size : int
        Number of rows of the returned matrix.
    embedding_dim : int
        Number of columns of the returned matrix (length of each vector).
    char_to_idx : mapping
        Maps a character to the row index it occupies in the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``. Rows for which the
        file provides no vector stay all-zero.

    Notes
    -----
    Characters that appear in the embedding file but not in ``char_to_idx``
    are skipped instead of raising ``KeyError``: they have no row in the
    matrix, so there is nothing to fill in for them.
    """
    # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
    with open(embedding_filename, 'rb') as f:
        char_embeddings = pickle.load(f)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for char, embedding in char_embeddings.items():
        if char not in char_to_idx:
            # Embedding exists for a character outside the vocabulary: ignore it.
            continue
        embedding_matrix[char_to_idx[char]] = embedding

    return embedding_matrix

In [26]:
# Build the character-embedding weight matrix from the vectors stored on disk.
char_embedding_matrix = get_char_embedding_matrix(
    embedding_filename='char_embeddings_with_features.pkl',
    vocab_size=char_vocab_size,
    embedding_dim=char_embedding_dim,
    char_to_idx=char_to_idx,
)


In [None]:
# Word-embedding weights consumed by the word Embedding layer in the model cell.
# The variable was previously misspelled `word_embedding_martrix`, which left
# `word_embedding_matrix` undefined when the model was built.
# NOTE(review): get_embedding_matrix is neither defined nor imported in the
# visible notebook -- TODO define/import it before running this cell.
word_embedding_matrix = get_embedding_matrix()

In [28]:
# Dummy character-level batch for smoke-testing the model: a single sentence of
# model_sentence_len words, every word being the character indices
# 0 .. model_word_len - 1.
# The shape is derived from the hyperparameters (instead of a hard-coded
# 11-element word) so it keeps matching the model's character input when the
# hyperparameters change.
input_sentence = np.tile(np.arange(model_word_len), (1, model_sentence_len, 1))
input_sentence.shape

(1, 38, 11)

### Defining the Model

In [39]:
# --- Word part of the model: character-level CNN applied to every word ---
char_inputs = Input(shape=(model_sentence_len, model_word_len))
# input_length is deliberately not passed: the Input layer already fixes the
# shape, and the former value (max_word_len) contradicted model_word_len.
x = Embedding(char_vocab_size,
              char_embedding_dim,
              weights=[char_embedding_matrix],
              trainable=False)(char_inputs)
x = Dropout(dropout_rate)(x)
# Kernel of height 1: the convolution only spans window_size positions along
# the last (per-word character) axis, never across neighbouring words.
x = Conv2D(conv_filters, kernel_size=(1, window_size), padding='same')(x)
# Pool over the full character axis of each word.
x = MaxPooling2D(pool_size=(1, model_word_len))(x)
x = Flatten()(x)

# --- Sentence part of the model: word embeddings ---
word_inputs = Input(shape=(model_sentence_len, ))
# Same reasoning as above for input_length (was max_sentence_len, which
# contradicted model_sentence_len). The layer is now applied to `word_inputs`;
# the previous `words_inputs` was an undefined name.
y = Embedding(word_vocab_size,
              word_embedding_dim,
              weights=[word_embedding_matrix],
              trainable=False)(word_inputs)

# NOTE(review): only the character branch `x` feeds the output; the word
# branch `y` is not connected to anything yet.
outputs = Dense(char_embedding_dim)(x)


NameError: name 'word_embedding_matrix' is not defined

In [35]:
# Assemble the functional model with two inputs: the character-level tensor
# and the word-level tensor.
model = Model(inputs=[char_inputs,  word_inputs], outputs=outputs)

In [36]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 38, 11)]     0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 38, 11, 35)   3500        input_4[0][0]                    
__________________________________________________________________________________________________
dropout_2 (Dropout)             (None, 38, 11, 35)   0           embedding_2[0][0]                
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 38, 11, 30)   3180        dropout_2[0][0]                  
____________________________________________________________________________________________

In [None]:
# Scratch cell: 11 random integers drawn from [0, 10).
array = np.random.randint(10, size=11)
array

In [None]:
# The model is declared with two inputs (characters and words), so predict
# needs one array per input rather than the character batch alone.
# An all-zero word-index sequence (presumably the padding index) stands in for
# the word-level input of this dummy sentence.
dummy_word_input = np.zeros((1, model_sentence_len), dtype=int)
model.predict([input_sentence, dummy_word_input])