In [177]:
import pickle
import pandas as pd
import numpy as np

from ner_utils.Sentence import Sentence
from ner_utils.Data     import Data

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Embedding, Concatenate, Reshape, Permute

### Character Tokenization

In [6]:
data = Data('data/ner_datasetreference.csv', encoding='latin1')
sentence = Sentence(data.dataset)

In [7]:
word_list, sentence_list = sentence.get_words_and_sentences()
word_sequences, sentence_sequences = data.get_tokenized_sequences(word_list, sentence_list)

In [8]:
char_to_idx, word_to_idx, idx_to_char, idx_to_word = data._get_tokens(word_list, sentence_list)

In [9]:
sentence_sequences[0]

array([ 254,    6,  967,   16, 1795,  238,  468,    7,  523,    2,  129,
          5,   61,    9,  571,    2,  833,    6,  186,   90,   22,   15,
         56,    3,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

### Char Embeddings

In [10]:
# getting char voacb
# with open('unique_chars.pkl', 'wb') as f:
#     pickle.dump(sentence.get_unique_chars(), f)

# loading char vocab   
with open('unique_chars.pkl', 'rb') as f:
    chars = pickle.load(f)

In [11]:
# Word info to pass to the model
max_word_len, min_word_len, avg_word_len, std_word_len = sentence.get_word_info()
avg_word_len, std_word_len

(4.773359082564433, 2.8246976648249014)

In [12]:
# Sentence info to pass to the model
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len = sentence.get_sentence_info()
max_sentence_len, min_sentence_len, avg_sentence_len, std_sentence_len

(104, 1, 21.863987989741236, 7.963596820721575)

In [23]:
#Word Level Hyper Parameters
model_word_len     = int(np.ceil(avg_word_len + 2*std_word_len))
char_embedding_dim = 35
char_vocab_size    = len(char_to_idx) + 1 #add 1 for padding token
window_size        = 3
conv_filters       = 30 
dropout_rate       = 0.5

#Sentence Level Hyper Parameters
model_sentence_len = int(np.ceil(avg_sentence_len + 2*std_sentence_len))
word_embedding_dim = 100
word_vocab_size    = len(word_to_idx) + 1 #add 1 for padding token


model_word_len, model_sentence_len, word_vocab_size

(11, 38, 31819)

In [14]:
def get_char_embedding_matrix(embedding_filename, vocab_size, embedding_dim, char_to_idx):
    
    with open(embedding_filename, 'rb') as f:
        char_embeddings = pickle.load(f)
    
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for char, embedding in char_embeddings.items():
        embedding_matrix[char_to_idx[char]] = embedding
        
    return embedding_matrix  

In [15]:
char_embedding_matrix = get_char_embedding_matrix('char_embeddings_with_features.pkl', char_vocab_size, char_embedding_dim, char_to_idx)


In [17]:
#word_embedding_martrix = get_char_embedding_matrix()

In [18]:
input_sentence = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] ] * model_sentence_len
input_sentence = np.expand_dims(input_sentence, axis=0)
np.shape(input_sentence)

(1, 38, 11)

### Defining the Model

In [173]:
#Word Part of Model
char_inputs = Input(shape=(model_word_len), batch_size=model_sentence_len)
x           = Embedding(char_vocab_size,
                            char_embedding_dim,
                            weights=[char_embedding_matrix],
                            input_length=max_word_len,
                            trainable=True)(char_inputs)
x           = Permute((2, 1), input_shape=(11, 35))(x)
x           = Dropout(dropout_rate)(x)
x           = Reshape((35, 11, 1))(x)
x           = Conv2D(conv_filters, input_shape=(38, 35, 11, 1), kernel_size=(1, window_size), padding='same')(x)
x           = MaxPooling2D(pool_size=(1,model_word_len ))(x)
# x           = Reshape((35, 30))(x)
# x           = Flatten()(x)
x           = Dense(1, input_shape=(35, 1, 30))(x)
x           = Reshape((35, ))(x)


#Sentence Part of Model
word_inputs = Input(shape=(model_sentence_len, ))
y           = Embedding(word_vocab_size,
                            word_embedding_dim,
                            #weights=[word_embedding_matrix],
                            input_length=max_sentence_len,
                            trainable=False)(word_inputs)

#Joining the two parts

#outputs = Concatenate(axis=0)([y, x])
outputs = x


In [174]:
model = Model(inputs=[char_inputs, word_inputs], outputs=outputs)

In [175]:
model.summary()

Model: "model_38"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_147 (InputLayer)          [(38, 11)]           0                                            
__________________________________________________________________________________________________
embedding_146 (Embedding)       (38, 11, 35)         3500        input_147[0][0]                  
__________________________________________________________________________________________________
permute_1 (Permute)             (38, 35, 11)         0           embedding_146[0][0]              
__________________________________________________________________________________________________
dropout_77 (Dropout)            (38, 35, 11)         0           permute_1[0][0]                  
___________________________________________________________________________________________

In [176]:
array = np.random.randint(10, size=(11, ))
array

array([1, 9, 1, 3, 7, 4, 8, 1, 9, 8, 9])

In [None]:
model.predict(input_sentence)

In [192]:
a = Input((3, 2))
b = Permute((2, 1), input_shape=(3, 2))(a)

test = Model(inputs=a, outputs=b)

In [199]:
pop = np.array([[1, 2], [3, 4], [5, 6]])
pop = np.expand_dims(pop,  axis=0)
pop.shape

(1, 3, 2)

In [200]:
test_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
permute_2 (Permute)          (None, 2, 3)              0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [201]:
test.predict([pop])

array([[[1., 3., 5.],
        [2., 4., 6.]]], dtype=float32)