In [1]:
import pickle
import pandas as pd
import numpy as np

from Sentence import Sentence
from Data     import Data

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Embedding, Reshape, Conv1D

### Loading Dataset

In [2]:
# Build the project-local Data object from '../ner_dataset.csv' (passing
# encoding='latin1'), then wrap its `dataset` attribute in the project-local
# Sentence helper that the cells below query for words, sentences and statistics.
data = Data('../ner_dataset.csv', encoding='latin1')
sentence = Sentence(data.dataset)

In [7]:
# Unique words and the sentences of the dataset, as returned by the Sentence
# helper. Both lists are passed to data.get_tokens and
# data.get_tokenized_sequences in later cells.
word_list, sentence_list = sentence.get_words_and_sentences()

In [8]:
# Character- and word-level lookup tables in both directions
# (token -> index and index -> token).
# NOTE(review): index 0 appears to be reserved for padding -- the
# hyper-parameter cell adds 1 to each vocabulary size for a padding token.
# Confirm against Data.get_tokens.
char_to_idx, word_to_idx, idx_to_char, idx_to_word = data.get_tokens(word_list, sentence_list)

### Char Embeddings

In [21]:
# One-off step that wrote the character vocabulary file (kept for reference):
# with open('unique_chars.pkl', 'wb') as f:
#     pickle.dump(sentence.get_unique_chars(), f)

# Load the cached character vocabulary.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only open pickle files that were produced locally by the step above.
with open('unique_chars.pkl', 'rb') as f:
    chars = pickle.load(f)

In [3]:
# Word-length statistics reported by the Sentence helper; the mean and standard
# deviation are used in the hyper-parameter cell to derive model_word_len.
max_word_len, min_word_len, avg_word_len, std_word_len = sentence.get_word_info()
# Bare tuple as the last expression so the notebook displays mean and std.
avg_word_len, std_word_len

(4.773359082564433, 2.8246976648249014)

In [4]:
# Sentence-length statistics reported by the Sentence helper; the mean and
# standard deviation are used in the hyper-parameter cell to derive
# model_sentence_len.
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len = sentence.get_sentence_info()
# Bare tuple as the last expression so the notebook displays all four values.
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len

(104, 1, 21.863987989741236, 7.963596820721575)

In [9]:
# --- Word-level hyper-parameters ---
# Fixed word length fed to the model: mean + 2 standard deviations, rounded up.
model_word_len     = int(np.ceil(avg_word_len + 2*std_word_len))
char_embedding_dim = 35
char_vocab_size    = len(char_to_idx) + 1 # +1 row for the padding token
window_size        = 3 # width of the Conv2D kernel in the model cell
conv_filters       = 30 # number of Conv2D filters in the model cell
dropout_rate       = 0.5

# --- Sentence-level hyper-parameters ---
# Fixed sentence length fed to the model, derived the same way as model_word_len.
model_sentence_len = int(np.ceil(avg_sentence_len + 2*std_sentence_len))
word_embedding_dim = 35
word_vocab_size    = len(word_to_idx) + 1 # +1 row for the padding token


# Display the derived sizes as the cell output.
model_word_len, model_sentence_len, word_vocab_size

(11, 38, 31819)

In [23]:
def get_char_embedding_matrix(embedding_filename, vocab_size, embedding_dim, char_to_idx):
    """Build the initial weight matrix for the character Embedding layer.

    Parameters
    ----------
    embedding_filename : str or path-like
        Pickle file containing a mapping ``{character: embedding vector}``.
    vocab_size : int
        Number of rows of the returned matrix.
    embedding_dim : int
        Number of columns of the returned matrix; every stored vector must
        contain exactly this many values.
    char_to_idx : mapping
        Maps a character to the row index it occupies in the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``. Row ``char_to_idx[c]``
        holds the stored vector of ``c``; every other row is all zeros.

    Raises
    ------
    ValueError
        If a stored vector does not contain exactly ``embedding_dim`` values.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only pass files that come from a trusted source.
    with open(embedding_filename, 'rb') as f:
        char_embeddings = pickle.load(f)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for char, embedding in char_embeddings.items():
        if char not in char_to_idx:
            # A stored vector for a character outside the vocabulary has no
            # row to go to; skip it instead of failing with a KeyError.
            continue

        vector = np.asarray(embedding, dtype=float).ravel()
        if vector.size != embedding_dim:
            # Without this check a length-1 vector would be silently broadcast
            # across the whole row.
            raise ValueError(
                'embedding for %r has %d values, expected %d'
                % (char, vector.size, embedding_dim)
            )
        embedding_matrix[char_to_idx[char]] = vector

    return embedding_matrix

In [24]:
# Weight matrix for the character Embedding layer, filled from the pickled
# character embeddings.
char_embedding_matrix = get_char_embedding_matrix(
    embedding_filename='char_embeddings_with_features.pkl',
    vocab_size=char_vocab_size,
    embedding_dim=char_embedding_dim,
    char_to_idx=char_to_idx,
)

In [25]:
# FIXME(review): `get_embedding_matrix` is not defined or imported anywhere in
# the visible notebook, so this cell raises NameError (see the output below).
# The target name is also spelled `word_embedding_martrix`, while the
# commented-out sentence branch of the model reads `word_embedding_matrix`.
word_embedding_martrix = get_embedding_matrix()

NameError: name 'get_embedding_matrix' is not defined

In [36]:
# Dummy batch holding one sentence, used to smoke-test the model: every word is
# the character-index sequence 0 .. model_word_len-1. The word length is taken
# from model_word_len (instead of a hand-typed 11-element list) so the batch
# always matches the model's Input shape (model_sentence_len, model_word_len).
input_sentence = np.tile(np.arange(model_word_len), (1, model_sentence_len, 1))
np.shape(input_sentence)

(1, 38, 11)

### Tokenized Sentences and Word Sequences

In [10]:
# Index-encoded model inputs produced by the project-local Data helper from the
# word and sentence lists, given the two fixed model lengths.
# NOTE(review): presumably sequences are padded/truncated to model_word_len and
# model_sentence_len -- confirm against Data.get_tokenized_sequences.
word_sequences, sentence_sequences = data.get_tokenized_sequences(word_list, sentence_list, model_word_len, model_sentence_len)

In [20]:
# Inspect the first entry: in the output below it is a 2-D integer array whose
# rows are right-padded with zeros.
word_sequences[0]

array([[27, 10,  7, 14,  8,  3,  6, 12,  8,  0,  0],
       [ 7, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  2, 15,  7,  6,  8,  4,  9,  3,  4,  7],
       [10,  3, 23,  2,  0,  0,  0,  0,  0,  0,  0],
       [15,  3,  9, 13, 10,  2, 12,  0,  0,  0,  0],
       [ 4, 10,  9,  7, 14, 18, 10,  0,  0,  0,  0],
       [55,  7,  6, 12,  7,  6,  0,  0,  0,  0,  0],
       [ 4,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [17,  9,  7,  4,  2,  8,  4,  0,  0,  0,  0],
       [ 4, 10,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  3,  9,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 5,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [30,  9,  3, 44,  0,  0,  0,  0,  0,  0,  0],
       [ 3,  6, 12,  0,  0,  0,  0,  0,  0,  0,  0],
       [12,  2, 15,  3,  6, 12,  0,  0,  0,  0,  0],
       [ 4, 10,  2,  0,  0,  0,  0,  0,  0,  0,  0],
       [21,  5,  4, 10, 12,  9,  3, 21,  3, 11,  0],
       [ 7, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [37,  9,  5,  4,  5,  8, 10,  0,  0,  0

### Defining the Model

In [28]:
# Word part of the model: a character-level CNN applied to every word of a sentence.
# Each sample is a (model_sentence_len, model_word_len) grid of character indices.
char_inputs = Input(shape=(model_sentence_len, model_word_len))
# `input_length` is deliberately not passed: the Input layer above already fixes
# the shape, and max_word_len is a dataset statistic rather than the padded
# word length (model_word_len) that this input actually uses.
x           = Embedding(char_vocab_size,
                            char_embedding_dim,
                            weights=[char_embedding_matrix],
                            trainable=False)(char_inputs)
x           = Dropout(dropout_rate)(x)
# Kernel height 1 keeps the words of a sentence independent; the kernel width
# slides over window_size neighbouring characters.
x           = Conv2D(conv_filters, kernel_size=(1, window_size), padding='same')(x)
# Max over all character positions of a word -> one conv_filters vector per word.
x           = MaxPooling2D(pool_size=(1,model_word_len))(x)
# Flatten merges the per-word vectors of the whole sentence into one vector.
x           = Flatten()(x)

# Sentence part of the model -- disabled until word_embedding_matrix is available.
# word_inputs = Input(shape=(model_sentence_len, ))
# y           = Embedding(word_vocab_size,
#                             word_embedding_dim,
#                             weights=[word_embedding_matrix],
#                             trainable=False)(word_inputs)

# One char_embedding_dim-sized output vector per input sentence.
outputs     = Dense(char_embedding_dim)(x)


In [38]:
# Assemble the functional model: character-index grid in, Dense projection out.
# Only the character branch is connected; the sentence branch is commented out
# in the model-definition cell.
model = Model(inputs=[char_inputs], outputs=outputs)

In [39]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 38, 11)]          0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 38, 11, 35)        3500      
_________________________________________________________________
dropout_1 (Dropout)          (None, 38, 11, 35)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 38, 11, 30)        3180      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 38, 1, 30)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1140)              0         
_________________________________________________________________
dense (Dense)                (None, 35)                3993

In [40]:
# Random integer grid with values in [0, 10) for ad-hoc experiments. A seeded
# Generator makes the values identical on every Restart & Run All.
# NOTE(review): the shape (11, 38) is the transpose of the model's per-sample
# input shape (38, 11) -- confirm this is intended before feeding it to the model.
array = np.random.default_rng(0).integers(10, size=(11, 38))
array

array([[3, 7, 5, 1, 3, 9, 5, 7, 7, 2, 0, 0, 5, 0, 4, 2, 6, 0, 5, 2, 0, 5,
        2, 0, 8, 5, 7, 5, 8, 6, 8, 4, 5, 5, 5, 5, 5, 3],
       [7, 7, 4, 0, 1, 2, 2, 2, 7, 2, 4, 1, 0, 9, 5, 0, 4, 7, 7, 2, 2, 1,
        2, 7, 5, 1, 6, 2, 6, 6, 8, 8, 4, 4, 7, 3, 2, 2],
       [4, 6, 1, 2, 0, 1, 1, 4, 8, 9, 1, 0, 8, 5, 3, 6, 0, 8, 8, 6, 9, 0,
        9, 0, 3, 8, 7, 2, 2, 1, 1, 5, 1, 4, 1, 4, 5, 6],
       [1, 9, 7, 0, 8, 8, 3, 0, 4, 3, 9, 3, 3, 6, 1, 4, 7, 1, 5, 5, 4, 7,
        8, 8, 5, 6, 9, 5, 3, 3, 3, 3, 0, 9, 4, 0, 2, 0],
       [3, 9, 2, 9, 1, 2, 0, 3, 4, 5, 5, 8, 5, 4, 1, 4, 0, 7, 0, 7, 0, 0,
        6, 9, 3, 1, 2, 6, 6, 8, 1, 7, 6, 1, 2, 8, 4, 5],
       [3, 4, 4, 0, 7, 8, 4, 9, 7, 5, 7, 7, 6, 4, 1, 4, 2, 3, 9, 4, 1, 4,
        6, 0, 6, 4, 0, 2, 4, 1, 9, 2, 2, 9, 7, 5, 5, 7],
       [9, 0, 5, 0, 4, 8, 9, 7, 8, 5, 2, 0, 8, 5, 7, 3, 0, 5, 9, 6, 9, 5,
        5, 6, 9, 6, 6, 0, 2, 5, 5, 3, 7, 8, 5, 5, 4, 3],
       [6, 8, 7, 1, 8, 2, 7, 8, 0, 1, 4, 5, 2, 2, 9, 9, 7, 2, 8, 0, 9, 8,
        1

In [41]:
# Smoke test: run the untrained model on the dummy one-sentence batch. The
# output below is a single row of 35 floats, matching Dense(char_embedding_dim).
model.predict(input_sentence)

array([[-0.588582  , -0.31788087,  0.06420205,  0.11311965,  0.04510457,
         0.48612192, -0.8989942 , -0.00648756,  0.36246017, -0.08838062,
         0.46495324, -0.324118  , -0.7427248 , -0.1971462 ,  0.63165516,
         0.0421341 ,  0.0324133 ,  0.34811825, -0.16643837, -1.0018299 ,
        -0.3986591 , -0.194302  , -0.20608908, -0.10410573, -0.603616  ,
         0.13091409,  0.40794054, -0.5922547 , -0.1245733 ,  0.01020445,
         0.30914104, -0.42015412,  0.6095438 , -0.42226735, -0.37890208]],
      dtype=float32)