In [36]:
import pickle
import pandas as pd
import numpy as np

from ner_utils.Sentence import Sentence
from ner_utils.Data     import Data

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dropout, Dense, Flatten, Embedding, Concatenate
from tensorflow              import convert_to_tensor

### Character Tokenization

In [16]:
# Load the NER corpus (latin1-encoded CSV) and build the Sentence helper from it.
data = Data('data/ner_datasetreference.csv', encoding='latin1')
sentence = Sentence(data.dataset)

# Character-level tokenization of every entry in the 'Word' column;
# lower=False keeps upper- and lower-case characters distinct.
word_list = data.dataset['Word'].values
char_tokens = data.tokenize_data(word_list, lower=False, char_level=True)

# Forward lookup (char -> index) and its inverse (index -> char).
char_to_idx = char_tokens.word_index
idx_to_char = dict(zip(char_to_idx.values(), char_to_idx.keys()))


In [17]:
# Display the full char -> index mapping (indices start at 1).
char_to_idx

{'UNK': 1,
 'e': 2,
 'a': 3,
 't': 4,
 'i': 5,
 'n': 6,
 'o': 7,
 's': 8,
 'r': 9,
 'h': 10,
 'l': 11,
 'd': 12,
 'c': 13,
 'u': 14,
 'm': 15,
 'f': 16,
 'p': 17,
 'g': 18,
 'y': 19,
 '.': 20,
 'w': 21,
 'b': 22,
 'v': 23,
 ',': 24,
 'k': 25,
 'S': 26,
 'T': 27,
 'A': 28,
 'M': 29,
 'I': 30,
 '0': 31,
 '-': 32,
 "'": 33,
 'P': 34,
 'C': 35,
 'U': 36,
 'B': 37,
 '1': 38,
 'z': 39,
 'N': 40,
 'H': 41,
 'x': 42,
 '2': 43,
 'q': 44,
 'F': 45,
 'R': 46,
 'E': 47,
 'K': 48,
 'j': 49,
 'O': 50,
 'W': 51,
 'D': 52,
 'G': 53,
 'J': 54,
 'L': 55,
 '9': 56,
 '"': 57,
 '5': 58,
 '3': 59,
 'V': 60,
 '8': 61,
 '4': 62,
 '6': 63,
 '7': 64,
 'Y': 65,
 '$': 66,
 'Z': 67,
 'Q': 68,
 ')': 69,
 '(': 70,
 '%': 71,
 'X': 72,
 ';': 73,
 ':': 74,
 '/': 75,
 '?': 76,
 '&': 77,
 '!': 78,
 '~': 79,
 'é': 80,
 '[': 81,
 ']': 82,
 '#': 83,
 '\x92': 84,
 '\x96': 85,
 '\x85': 86,
 '\x94': 87,
 'ö': 88,
 '+': 89,
 '@': 90,
 '\x93': 91,
 '°': 92,
 '\x97': 93,
 '_': 94,
 'ë': 95,
 'ü': 96,
 '\x91': 97,
 '`': 98,
 '\xa0': 99}

In [18]:
# Display the inverse index -> char mapping.
idx_to_char

{1: 'UNK',
 2: 'e',
 3: 'a',
 4: 't',
 5: 'i',
 6: 'n',
 7: 'o',
 8: 's',
 9: 'r',
 10: 'h',
 11: 'l',
 12: 'd',
 13: 'c',
 14: 'u',
 15: 'm',
 16: 'f',
 17: 'p',
 18: 'g',
 19: 'y',
 20: '.',
 21: 'w',
 22: 'b',
 23: 'v',
 24: ',',
 25: 'k',
 26: 'S',
 27: 'T',
 28: 'A',
 29: 'M',
 30: 'I',
 31: '0',
 32: '-',
 33: "'",
 34: 'P',
 35: 'C',
 36: 'U',
 37: 'B',
 38: '1',
 39: 'z',
 40: 'N',
 41: 'H',
 42: 'x',
 43: '2',
 44: 'q',
 45: 'F',
 46: 'R',
 47: 'E',
 48: 'K',
 49: 'j',
 50: 'O',
 51: 'W',
 52: 'D',
 53: 'G',
 54: 'J',
 55: 'L',
 56: '9',
 57: '"',
 58: '5',
 59: '3',
 60: 'V',
 61: '8',
 62: '4',
 63: '6',
 64: '7',
 65: 'Y',
 66: '$',
 67: 'Z',
 68: 'Q',
 69: ')',
 70: '(',
 71: '%',
 72: 'X',
 73: ';',
 74: ':',
 75: '/',
 76: '?',
 77: '&',
 78: '!',
 79: '~',
 80: 'é',
 81: '[',
 82: ']',
 83: '#',
 84: '\x92',
 85: '\x96',
 86: '\x85',
 87: '\x94',
 88: 'ö',
 89: '+',
 90: '@',
 91: '\x93',
 92: '°',
 93: '\x97',
 94: '_',
 95: 'ë',
 96: 'ü',
 97: '\x91',
 98: '`',
 99: '\xa0'}

### Char Embeddings

In [19]:
# Character vocabulary.
# The pickle is presumably produced once with the snippet below (kept for reference):
#     with open('unique_chars.pkl', 'wb') as f:
#         pickle.dump(sentence.get_unique_chars(), f)

# Load the stored vocabulary.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('unique_chars.pkl', mode='rb') as f:
    chars = pickle.load(f)

In [20]:
# Word-length statistics used to size the model input; unpacked in the order
# returned by sentence.get_word_info().
(max_word_len,
 min_word_len,
 avg_word_len,
 std_word_len) = sentence.get_word_info()

# Show the average and standard deviation.
(avg_word_len, std_word_len)

(4.773359082564433, 2.8246976648249014)

In [22]:
# --- Input geometry ---------------------------------------------------------
# Words per sentence fed to the model.
model_sentence_len = 10
# Characters per word: average length plus two standard deviations, rounded up.
model_word_len     = int(np.ceil(avg_word_len + 2 * std_word_len))

# --- Character embedding table ----------------------------------------------
# One extra row for the padding token (indices in char_to_idx start at 1).
char_vocab_size    = 1 + len(char_to_idx)
char_embedding_dim = 35

# --- Convolution / regularisation -------------------------------------------
conv_filters       = 30   # number of Conv2D filters
window_size        = 3    # kernel width along the character axis
dropout_rate       = 0.5

In [21]:
def get_char_embedding_matrix(embedding_filename, vocab_size, embedding_dim, char_to_idx):
    """Build the weight matrix for a character ``Embedding`` layer.

    Parameters
    ----------
    embedding_filename : str
        Path to a pickle file containing a mapping ``{char: vector}`` where
        each vector has ``embedding_dim`` entries.
    vocab_size : int
        Number of rows of the returned matrix.
    embedding_dim : int
        Number of columns of the returned matrix.
    char_to_idx : dict
        Mapping from character to its row index in the matrix.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(vocab_size, embedding_dim)``. Row ``char_to_idx[c]``
        holds the stored vector of ``c``; every other row is zero.

    Notes
    -----
    Characters present in the pickle but missing from ``char_to_idx``, or
    mapped to an index outside ``[0, vocab_size)``, are skipped instead of
    raising ``KeyError`` / ``IndexError``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(embedding_filename, 'rb') as f:
        char_embeddings = pickle.load(f)

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for char, embedding in char_embeddings.items():
        idx = char_to_idx.get(char)
        # The embedding file and the tokenizer vocabulary come from separate
        # sources, so tolerate entries that have no usable row in the matrix.
        if idx is None or not 0 <= idx < vocab_size:
            continue
        embedding_matrix[idx] = embedding

    return embedding_matrix

In [23]:
# Build the (char_vocab_size, char_embedding_dim) weight matrix from the stored vectors.
char_embedding_matrix = get_char_embedding_matrix(
    embedding_filename='char_embeddings_with_features.pkl',
    vocab_size=char_vocab_size,
    embedding_dim=char_embedding_dim,
    char_to_idx=char_to_idx,
)


In [24]:
# Row count of the embedding matrix; it is allocated with vocab_size rows,
# so this should equal char_vocab_size.
len(char_embedding_matrix)

100

In [25]:
# Dummy batch holding a single sentence: every word is the character-index
# sequence 0, 1, ..., model_word_len - 1. The word length is derived from
# model_word_len rather than a hard-coded 11-element list, so the batch keeps
# matching the model input if the word-length statistics change.
dummy_word     = np.arange(model_word_len)
input_sentence = np.tile(dummy_word, (1, model_sentence_len, 1))

# Expected shape: (1, model_sentence_len, model_word_len)
np.shape(input_sentence)

(1, 10, 11)

### Defining the Model

In [43]:
# Character-level CNN encoder.
# Input: one sentence as a (model_sentence_len, model_word_len) grid of character indices.
inputs1 = Input(shape=(model_sentence_len, model_word_len))

# Frozen lookup of the pre-built character vectors; appends a
# char_embedding_dim axis to the input grid.
# `input_length` is omitted on purpose: the sequence shape is already fixed by
# `inputs1`, and passing max_word_len would contradict the model_word_len axis
# declared above (the argument is also deprecated in recent Keras releases).
x = Embedding(char_vocab_size,
              char_embedding_dim,
              weights=[char_embedding_matrix],
              trainable=False)(inputs1)
x = Dropout(dropout_rate)(x)

# A (1, window_size) kernel slides along the character axis of each word
# separately; padding='same' keeps the word length unchanged.
x = Conv2D(conv_filters, kernel_size=(1, window_size), padding='same')(x)

# Max over all character positions of a word -> one conv_filters vector per word.
x = MaxPooling2D(pool_size=(1, model_word_len))(x)

# Concatenate the per-word vectors and project linearly (no activation).
x = Flatten()(x)
outputs = Dense(char_embedding_dim)(x)

In [44]:
# The graph has a single input tensor (`inputs1`); `inputs2` is never defined
# and would raise NameError on a fresh kernel.
model = Model(inputs=inputs1, outputs=outputs)

In [45]:
# Print layer-by-layer output shapes and parameter counts to verify the wiring.
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_22 (InputLayer)           [(None, 10, 11)]     0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 10, 11, 35)   3500        input_22[0][0]                   
__________________________________________________________________________________________________
dropout_11 (Dropout)            (None, 10, 11, 35)   0           embedding_11[0][0]               
__________________________________________________________________________________________________
conv2d_11 (Conv2D)              (None, 10, 11, 30)   3180        dropout_11[0][0]                 
____________________________________________________________________________________________

In [81]:
# Draw 11 random integers in [0, 10) as a 1-D array.
# NOTE(review): no seed is set, so the values differ on every run. The 2-D
# output shown below does not match size=(11,) and looks stale from an earlier
# version of this cell — confirm and re-run.
array = np.random.randint(10, size=(11, ))
array

array([[5, 8, 6, 4, 1, 8, 8, 9, 6, 1, 0],
       [5, 5, 0, 8, 6, 6, 2, 3, 9, 0, 1],
       [1, 6, 2, 4, 7, 3, 6, 4, 8, 9, 1],
       [2, 9, 3, 1, 3, 9, 0, 1, 1, 6, 3],
       [8, 6, 3, 1, 0, 4, 6, 0, 8, 6, 8],
       [9, 8, 2, 3, 6, 1, 6, 0, 6, 1, 1],
       [5, 1, 9, 1, 5, 8, 2, 5, 9, 8, 7],
       [6, 1, 4, 5, 9, 6, 3, 5, 2, 5, 1],
       [9, 4, 2, 5, 7, 0, 5, 3, 6, 6, 4],
       [6, 3, 7, 2, 4, 0, 5, 2, 8, 1, 8],
       [8, 7, 7, 4, 9, 7, 2, 7, 7, 9, 1],
       [2, 6, 3, 2, 0, 6, 9, 9, 3, 5, 2],
       [6, 1, 4, 8, 7, 4, 5, 8, 2, 0, 6],
       [0, 0, 6, 4, 3, 4, 1, 8, 1, 5, 8],
       [4, 2, 6, 0, 7, 0, 5, 9, 2, 0, 7],
       [0, 5, 8, 4, 7, 0, 3, 4, 6, 5, 5],
       [0, 1, 8, 4, 7, 6, 2, 7, 8, 6, 1],
       [3, 4, 8, 4, 2, 8, 3, 4, 2, 0, 1],
       [7, 5, 3, 6, 5, 1, 7, 5, 7, 4, 6],
       [5, 0, 5, 8, 3, 6, 8, 0, 0, 9, 9],
       [2, 3, 5, 9, 1, 1, 7, 6, 5, 6, 1],
       [8, 3, 6, 3, 9, 3, 0, 2, 5, 4, 3],
       [6, 5, 0, 1, 7, 2, 9, 3, 5, 9, 3],
       [7, 7, 9, 2, 5, 3, 6, 2, 1,

In [97]:
# Forward pass on the dummy sentence batch; returns one vector of
# char_embedding_dim values per sentence in the batch.
model.predict(input_sentence)

array([[-0.24038328,  0.4033629 , -0.32530367, -0.41237628, -0.24126104,
         0.65062016, -0.43435034, -0.8574492 , -0.09381327,  0.254031  ,
        -0.39190325, -0.3600513 ,  0.3980916 , -0.40338573, -0.29359978,
        -0.37955457, -0.6862881 ,  0.20838639, -1.0729555 , -0.8090278 ,
         0.0095225 , -0.19786382,  0.35451126,  0.00508714,  0.11782737,
        -0.3889305 ,  0.40416017,  0.35667178,  0.49268103, -0.01485226,
         0.35816324,  0.19061807, -0.46540278, -0.1806159 ,  0.6091532 ]],
      dtype=float32)