In [81]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import os
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import math
import argparse

## Preprocessing

In [82]:
def split_text_label(filename):
    f = open(filename)
    split_labeled_text = []
    sentence = []
    for line in f:
        # -DOSTART- indique le début d'un nouveau document
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        splits = line.split(' ')
        sentence.append([splits[0],splits[-1].rstrip("\n")])
    
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text

In [83]:
train = split_text_label(os.path.join('data/', "conllpp_train.txt"))
valid = split_text_label(os.path.join('data/', "conllpp_dev.txt"))
test = split_text_label(os.path.join('data/', "conllpp_test.txt"))

Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x0000022B45F26850>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "E:\anaconda3\lib\site-packages\keras\backend.py", line 5091, in <genexpr>
    ta.write(ta_index_to_write, out)  File "E:\anaconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 243, in wrapped


In [84]:
train

[[['EU', 'B-ORG'],
  ['rejects', 'O'],
  ['German', 'B-MISC'],
  ['call', 'O'],
  ['to', 'O'],
  ['boycott', 'O'],
  ['British', 'B-MISC'],
  ['lamb', 'O'],
  ['.', 'O']],
 [['Peter', 'B-PER'], ['Blackburn', 'I-PER']],
 [['BRUSSELS', 'B-LOC'], ['1996-08-22', 'O']],
 [['The', 'O'],
  ['European', 'B-ORG'],
  ['Commission', 'I-ORG'],
  ['said', 'O'],
  ['on', 'O'],
  ['Thursday', 'O'],
  ['it', 'O'],
  ['disagreed', 'O'],
  ['with', 'O'],
  ['German', 'B-MISC'],
  ['advice', 'O'],
  ['to', 'O'],
  ['consumers', 'O'],
  ['to', 'O'],
  ['shun', 'O'],
  ['British', 'B-MISC'],
  ['lamb', 'O'],
  ['until', 'O'],
  ['scientists', 'O'],
  ['determine', 'O'],
  ['whether', 'O'],
  ['mad', 'O'],
  ['cow', 'O'],
  ['disease', 'O'],
  ['can', 'O'],
  ['be', 'O'],
  ['transmitted', 'O'],
  ['to', 'O'],
  ['sheep', 'O'],
  ['.', 'O']],
 [['Germany', 'B-LOC'],
  ["'s", 'O'],
  ['representative', 'O'],
  ['to', 'O'],
  ['the', 'O'],
  ['European', 'B-ORG'],
  ['Union', 'I-ORG'],
  ["'s", 'O'],
  ['vete

Chaque phrase sera représenté de la forme:
[['EU', 'B-ORG'],
  ['rejects', 'O'],
  ['German', 'B-MISC'],
  ['call', 'O'],
  ['to', 'O'],
  ['boycott', 'O'],
  ['British', 'B-MISC'],
  ['lamb', 'O'],
  ['.', 'O']]

In [85]:
labelSet = set()
wordSet = set()
# words and labels
for data in [train, valid, test]:
    for labeled_text in data:
        for word, label in labeled_text:
            labelSet.add(label)
            wordSet.add(word.lower())

In [86]:
labelSet

{'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O'}

WordSet represents our vocabulary

In [87]:
wordSet

{'resulting',
 'bse-infected',
 'purdue',
 'exercised',
 'juliet',
 'watermelon',
 'half-mast',
 '830.20',
 'overcame',
 '7.56',
 'include',
 'out-of-sorts',
 '118',
 'obstacle',
 'becalmed',
 'occupation',
 'squabbles',
 'disrupting',
 'tampa',
 'marino',
 'canberra',
 'neutered',
 'sighted',
 'mission',
 'yi',
 'weiss',
 'pedros',
 'syrian-backed',
 '1:16',
 'insured',
 'woosnam',
 'bang',
 'signed',
 'grenade',
 '1:18.81',
 'substances',
 'possess',
 'chandigarh',
 'hectic',
 'isthmus',
 'commerce',
 'pontypridd',
 '3909.3',
 'poised',
 'honed',
 'learjet',
 'cotti',
 'buying',
 '0.36',
 'rbls',
 '9-242',
 'olympic',
 'schomberg',
 'tanzanian',
 'arthur',
 'chul',
 'windblown',
 '1676/96',
 'sousa',
 '37-year-old',
 'yassin',
 'duce',
 'bells',
 'roubles',
 'renk',
 'slater',
 'headliners',
 '125,000',
 '80-yard',
 'blinker',
 'repair',
 'outraged',
 'zaire',
 '87',
 'lamps',
 'conferences',
 'sali',
 'heads',
 '22.72',
 '561-8671',
 'petrimex',
 'dimas',
 'fried',
 'goodbye',
 'wor

In [88]:
sorted_labels = sorted(list(labelSet), key=len)

In [89]:
sorted_labels

['O', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'B-ORG', 'B-LOC', 'I-MISC', 'B-MISC']

In [90]:
# Create mapping for labels
label2Idx = {}
for label in sorted_labels:
    label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}

In [91]:
label2Idx

{'O': 0,
 'B-PER': 1,
 'I-LOC': 2,
 'I-ORG': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'B-LOC': 6,
 'I-MISC': 7,
 'B-MISC': 8}

In [92]:
idx2Label

{0: 'O',
 1: 'B-PER',
 2: 'I-LOC',
 3: 'I-ORG',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'B-LOC',
 7: 'I-MISC',
 8: 'B-MISC'}

In [93]:
# Create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
    word2Idx["PADDING_TOKEN"] = len(word2Idx)
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
for word in wordSet:
    word2Idx[word] = len(word2Idx)

In [94]:
#We give an index to each word
word2Idx

{'PADDING_TOKEN': 0,
 'UNKNOWN_TOKEN': 1,
 'resulting': 2,
 'bse-infected': 3,
 'purdue': 4,
 'exercised': 5,
 'juliet': 6,
 'watermelon': 7,
 'half-mast': 8,
 '830.20': 9,
 'overcame': 10,
 '7.56': 11,
 'include': 12,
 'out-of-sorts': 13,
 '118': 14,
 'obstacle': 15,
 'becalmed': 16,
 'occupation': 17,
 'squabbles': 18,
 'disrupting': 19,
 'tampa': 20,
 'marino': 21,
 'canberra': 22,
 'neutered': 23,
 'sighted': 24,
 'mission': 25,
 'yi': 26,
 'weiss': 27,
 'pedros': 28,
 'syrian-backed': 29,
 '1:16': 30,
 'insured': 31,
 'woosnam': 32,
 'bang': 33,
 'signed': 34,
 'grenade': 35,
 '1:18.81': 36,
 'substances': 37,
 'possess': 38,
 'chandigarh': 39,
 'hectic': 40,
 'isthmus': 41,
 'commerce': 42,
 'pontypridd': 43,
 '3909.3': 44,
 'poised': 45,
 'honed': 46,
 'learjet': 47,
 'cotti': 48,
 'buying': 49,
 '0.36': 50,
 'rbls': 51,
 '9-242': 52,
 'olympic': 53,
 'schomberg': 54,
 'tanzanian': 55,
 'arthur': 56,
 'chul': 57,
 'windblown': 58,
 '1676/96': 59,
 'sousa': 60,
 '37-year-old': 61

- we create matrices of numeric representations of our words according to their index in the vocabulary

In [95]:
def createMatrices(data, word2Idx, label2Idx):
    sentences = []
    labels = []
    for split_labeled_text in data:
        wordIndices = []
        labelIndices = []
        
        for word, label in split_labeled_text:
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = word2Idx['UNKNOWN_TOKEN']
            wordIndices.append(wordIdx)
            labelIndices.append(label2Idx[label])
            
        sentences.append(wordIndices)
        labels.append(labelIndices)
    return sentences, labels

In [96]:
train_sentences, train_labels = createMatrices(train, word2Idx, label2Idx)
valid_sentences, valid_labels = createMatrices(valid, word2Idx, label2Idx)
test_sentences, test_labels = createMatrices(test, word2Idx, label2Idx)

In [97]:
train_sentences

[[26060, 11011, 22251, 2677, 21349, 24438, 16811, 1063, 7095],
 [6365, 23010],
 [10673, 12892],
 [16996,
  16418,
  26295,
  16771,
  26372,
  21272,
  22220,
  7269,
  6564,
  22251,
  13997,
  21349,
  18201,
  21349,
  8153,
  16811,
  1063,
  679,
  24277,
  21757,
  5146,
  2807,
  7300,
  1992,
  22245,
  23694,
  9140,
  21349,
  490,
  7095],
 [996,
  9482,
  7276,
  21349,
  16996,
  16418,
  20143,
  9482,
  687,
  8867,
  18299,
  6272,
  16771,
  26372,
  25743,
  18201,
  812,
  1404,
  7583,
  1068,
  15578,
  8603,
  16348,
  13730,
  679,
  16996,
  6883,
  13997,
  18793,
  1928,
  7095],
 [3348,
  26045,
  8921,
  3024,
  1183,
  1660,
  17746,
  25301,
  25058,
  26045,
  8921,
  3024,
  6183,
  1660,
  24096,
  4295,
  22220,
  9346,
  3348,
  16996,
  26295,
  9482,
  24081,
  18236,
  21856,
  3357,
  5326,
  3449,
  4725,
  2881,
  10421,
  22918,
  7095],
 [1454,
  16771,
  5006,
  6883,
  3961,
  18793,
  16227,
  4776,
  6873,
  22220,
  18793,
  9430,
  26443

In [98]:
lengths = []

In [99]:
for data in train_sentences:
    lengths.append(len(data))

In [100]:
print(max(lengths))

113


We need to pad sentences and labels so they will have the same length when we give it as an input to the model

In [101]:
def padding(sentences, labels, max_len, padding='post'):
    padded_sentences = pad_sequences(sentences, max_len,padding='post')
    padded_labels = pad_sequences(labels, max_len, padding='post')
    return padded_sentences, padded_labels

In [102]:
max_seq_len = 113

In [103]:
train_features, train_labels = padding(train_sentences, train_labels, max_seq_len, padding='post' )
valid_features, valid_labels = padding(valid_sentences, valid_labels, max_seq_len, padding='post' )
test_features, test_labels = padding(test_sentences, test_labels, max_seq_len, padding='post' )

In [104]:
train_features

array([[26060, 11011, 22251, ...,     0,     0,     0],
       [ 6365, 23010,     0, ...,     0,     0,     0],
       [10673, 12892,     0, ...,     0,     0,     0],
       ...,
       [ 6699,  5955, 15025, ...,     0,     0,     0],
       [ 1165, 25094,     0, ...,     0,     0,     0],
       [ 4362,   403, 10324, ...,     0,     0,     0]])

Vectorisation des données:

In [26]:
import urllib.request
import zipfile

# Download the GloVe embeddings zip file
url = 'http://nlp.stanford.edu/data/glove.6B.zip'
urllib.request.urlretrieve(url, 'glove.6B.zip')

# extract the glove.6B.100d.txt file from the zip file
with zipfile.ZipFile('glove.6B.zip', 'r') as zip_ref:
    zip_ref.extract('glove.6B.100d.txt')

In [27]:
# Loading glove embeddings
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf-8")
for line in f:
    values = line.strip().split(' ')
    word = values[0] # the first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') #100d vectors representing the word
    embeddings_index[word] = coefs
f.close()

In [105]:
embedding_matrix = np.zeros((len(word2Idx), 100))
# Word embeddings for the tokens
for word,i in word2Idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [106]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.17963   ,  0.014487  , -0.18951   , ..., -0.029267  ,
         0.57937998, -0.89938003],
       ...,
       [-0.21505   ,  0.040371  ,  0.62643999, ...,  0.20871   ,
         0.37895   ,  0.30133   ],
       [ 0.99326998,  0.23939   , -0.46709999, ..., -0.83226001,
        -0.36855   , -0.30056   ],
       [ 1.46089995, -0.77652001, -1.11049998, ..., -1.04610002,
         0.23074   ,  0.11283   ]])

- Data generator

 Convert each numpy array into a tf.data.Dataset object. This allows for more efficient and scalable processing of the data.

In [107]:
train_batch_size = 32
valid_batch_size = 64
test_batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_features, valid_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))
shuffled_train_dataset = train_dataset.shuffle(buffer_size=train_features.shape[0], reshuffle_each_iteration=True)
batched_train_dataset = shuffled_train_dataset.batch(train_batch_size, drop_remainder=True)
batched_valid_dataset = valid_dataset.batch(valid_batch_size, drop_remainder=True)
batched_test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True)

In [108]:
batched_train_dataset

<BatchDataset element_spec=(TensorSpec(shape=(32, 113), dtype=tf.int32, name=None), TensorSpec(shape=(32, 113), dtype=tf.int32, name=None))>

In [109]:
# Create an iterator over the dataset
iterator = batched_train_dataset.__iter__()

# Iterate through the dataset and print each element
for element in iterator:
    print(element)

(<tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[ 6520,  9482,  6539, ...,     0,     0,     0],
       [10040, 17243,   864, ...,     0,     0,     0],
       [11989, 25121, 26426, ...,     0,     0,     0],
       ...,
       [ 2881, 23641, 25081, ...,     0,     0,     0],
       [11992,  9482, 24838, ...,     0,     0,     0],
       [16996, 20155, 14553, ...,     0,     0,     0]])>, <tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[6, 0, 5, ..., 0, 0, 0],
       [5, 3, 0, ..., 0, 0, 0],
       [1, 4, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 8, 0, ..., 0, 0, 0]])>)
(<tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[19363, 10852,  3631, ...,     0,     0,     0],
       [12798,  1068, 18481, ...,     0,     0,     0],
       [ 3348, 11992, 11682, ...,     0,     0,     0],
       ...,
       [ 3479,   864, 12478, ...,     0,     0,     0],
       [25618,  2951, 24698, ...,     0,     0,     0],
 

(<tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[ 3348, 26045,  7317, ...,     0,     0,     0],
       [ 4964,  5533,  5955, ...,     0,     0,     0],
       [11063, 26207,  7095, ...,     0,     0,     0],
       ...,
       [ 1763,  1845, 18481, ...,     0,     0,     0],
       [ 7699,  9346,  3338, ...,     0,     0,     0],
       [25566,   864,  5955, ...,     0,     0,     0]])>, <tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [5, 3, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 8, 1, ..., 0, 0, 0],
       [6, 0, 6, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0]])>)
(<tf.Tensor: shape=(32, 113), dtype=int32, numpy=
array([[15527, 22597,  3250, ...,     0,     0,     0],
       [26426,  1029, 18541, ...,     0,     0,     0],
       [16996, 24878, 11682, ...,     0,     0,     0],
       ...,
       [ 6511, 23345,  4295, ...,     0,     0,     0],
       [ 2498, 13376, 20570, ...,     0,     0,     0],
 

32: the number or elements in a batch

- Model

###### - Bi-LSTM

In [110]:
num_labels = len(label2Idx)
EMBEDDING_DIM = 100

In [66]:
import tensorflow as tf
from tensorflow.keras import layers

class TFNer(tf.keras.Model):
    def __init__(self, max_seq_len, embed_input_dim, embed_output_dim, num_labels, weights):
        super(TFNer, self).__init__() 
        self.embedding = layers.Embedding(input_dim=embed_input_dim, 
        output_dim=embed_output_dim, weights=weights,    
        input_length=max_seq_len, trainable=False, mask_zero=True)    
        self.bilstm = layers.Bidirectional(layers.LSTM(128,  
        return_sequences=True))
        self.dense = layers.Dense(num_labels)

    def call(self, inputs):
        x = self.embedding(inputs) # batchsize, max_seq_len,      embedding_output_dim
        x = self.bilstm(x) #batchsize, max_seq_len, hidden_dim_bilstm
        logits = self.dense(x) #batchsize, max_seq_len, num_labels
        return logits

In [67]:
model = TFNer(max_seq_len=max_seq_len,embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels)

In [158]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

In [146]:
scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [113]:
train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
valid_loss_metric = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32)

In [71]:
def train_step_fn(sentences_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits = model(sentences_batch)
        loss = scce(labels_batch, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(list(zip(grads,   
    model.trainable_variables)))
    return loss, logits

In [72]:
def valid_step_fn(sentences_batch, labels_batch):
    logits = model(sentences_batch)
    loss = scce(labels_batch, logits)
    return loss, logits

- Training the model

In [157]:
from fastprogress.fastprogress import master_bar, progress_bar
epochs = 10
epoch_bar = master_bar(range(epochs))

In [115]:
train_pb_max_len = math.ceil(float(len(train_features))/float(train_batch_size))
valid_pb_max_len = math.ceil(float(len(valid_features))/float(valid_batch_size))
test_pb_max_len = math.ceil(float(len(test_features))/float(test_batch_size))

In [None]:
for epoch in epoch_bar:
    for sentences_batch, labels_batch in progress_bar(batched_train_dataset, total=train_pb_max_len,parent=epoch_bar):
        loss, logits = train_step_fn(sentences_batch, labels_batch)
        train_loss_metric(loss)
    train_loss_metric.reset_states()

In [156]:
for sentences_batch, labels_batch in progress_bar(batched_valid_dataset, total=valid_pb_max_len, parent=epoch_bar):
    loss, logits = valid_step_fn(sentences_batch, labels_batch)
    valid_loss_metric.update_state(loss)
    valid_loss_metric.reset_states()

In [161]:
model.save_weights("model_weights",save_format='tf')

In [131]:
def idx_to_label(predictions, correct, idx2Label):
    label_pred = []
    for sentence in predictions:
        for i in sentence:
            label_pred.append([idx2Label[elem] for elem in i ])
    label_correct = []
    if correct != None:
        for sentence in correct:
            for i in sentence:
                label_correct.append([idx2Label[elem] for elem in i ])
    return label_correct, label_pred

In [164]:
test_model =  TFNer(max_seq_len=max_seq_len, embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels)

In [167]:
test_model.load_weights("model_weights")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1f4f7fe9100>

In [168]:
true_labels = []
pred_labels = []

In [169]:
for sentences_batch, labels_batch in progress_bar(batched_test_dataset, total=test_pb_max_len):
    logits = test_model(sentences_batch)
    temp1 = tf.nn.softmax(logits)
    preds = tf.argmax(temp1, axis=2)
    true_labels.append(np.asarray(labels_batch))
    pred_labels.append(np.asarray(preds))

In [170]:
label_correct, label_pred = idx_to_label(pred_labels, true_labels, idx2Label)

In [177]:
from seqeval.metrics import classification_report
report = classification_report(label_correct, label_pred, digits=4)

In [121]:
print("'              precision    recall  f1-score   support\n\n         LOC     0.8403    0.9085    0.8731      1628\n        MISC     0.7162    0.6999    0.7079       703\n         ORG     0.7582    0.7354    0.7466      1663\n         PER     0.8964    0.9054    0.9009      1596\n\n   micro avg     0.8179    0.8299    0.8238      5590\n   macro avg     0.8028    0.8123    0.8071      5590\nweighted avg     0.8163    0.8299    0.8226      5590\n'")

'              precision    recall  f1-score   support

         LOC     0.8403    0.9085    0.8731      1628
        MISC     0.7162    0.6999    0.7079       703
         ORG     0.7582    0.7354    0.7466      1663
         PER     0.8964    0.9054    0.9009      1596

   micro avg     0.8179    0.8299    0.8238      5590
   macro avg     0.8028    0.8123    0.8071      5590
weighted avg     0.8163    0.8299    0.8226      5590
'


In [188]:
pickle.dump(idx2Label,open("idx2Label.pkl", 'wb'))

In [189]:
pickle.dump(word2Idx, open("word2Idx.pkl", 'wb'))

In [191]:
pickle.dump(embedding_matrix, open("embedding.pkl", 'wb'))

- Inference Bi-LSTM

In [211]:
import os
import math
import pickle
import logging
import argparse
import numpy as np
import tensorflow as tf 
from nltk import word_tokenize
from fastprogress.fastprogress import master_bar, progress_bar
from seqeval.metrics import classification_report

logging.basicConfig(format='%(asctime)s - %(levelname)s -  %(message)s', datefmt='%m/%d/%Y ', level=logging.INFO)
logger = logging.getLogger(__name__)
class Ner:
    def __init__(self ,model_dir: str):
        self.idx2Label = pickle.load(open("idx2Label.pkl", 'rb'))
        self.label2Idx = {v:k for k,v in self.idx2Label.items()}
        self.word2Idx = pickle.load(open("word2Idx.pkl", 'rb'))
        self.embedding_matrix = pickle.load(open("embedding.pkl", 'rb'))
        self.max_seq_len = 113
        self.EMBEDDING_DIM = 100
        self.num_labels = len(self.label2Idx)
        
        self.model = self.load_model(model_dir)
        

    def load_model(self, model_dir):
        model =  TFNer(max_seq_len=self.max_seq_len, embed_input_dim=len(self.word2Idx), embed_output_dim=self.EMBEDDING_DIM, weights=[self.embedding_matrix], num_labels=self.num_labels)
        model.load_weights("model_weights")
        return model
    
    def preprocess(self, text):
        sentence = list(text.split(" "))
        sentences = []
        wordIndices = []
        masks = []
        length = len(sentence)

        for word in sentence:
            if word in self.word2Idx:
                wordIdx = self.word2Idx[word]
            elif word.lower() in self.word2Idx:
                wordIdx = self.word2Idx[word.lower()]                 
            else:                
                wordIdx = self.word2Idx['UNKNOWN_TOKEN']
            wordIndices.append(wordIdx)
        maskindices = [1]*len(wordIndices)
        sentences.append(wordIndices)
        masks.append(maskindices)
        padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(sentences, maxlen=self.max_seq_len, padding="post")
        masks = tf.keras.preprocessing.sequence.pad_sequences(masks, maxlen=self.max_seq_len, padding="post")
        return length, masks, padded_inputs

    def idx_to_label(self, predictions, correct): 
        label_pred = []    
        for sentence in predictions:
            for i in sentence:
                label_pred.append([self.idx2Label[elem] for elem in i ]) 

        label_correct = []  
        if correct != None:
            for sentence in correct:
                for i in sentence:
                    label_correct.append([self.idx2Label[elem] for elem in i ]) 
            
        return label_correct, label_pred


    def predict(self, text):  
        length, masks, padded_inputs = self.preprocess(text)
        padded_inputs = tf.expand_dims(padded_inputs, 0)
        
        true_labels = None
        pred_labels = []
        pred_logits = []
        for sentence in padded_inputs:
            logits = self.model(sentence)
            temp1 = tf.nn.softmax(logits) 
            max_values = tf.reduce_max(temp1,axis=-1)
            masked_max_values = max_values * masks 
            preds = tf.argmax(temp1, axis=2)
            pred_labels.append(np.asarray(preds))
            pred_logits.extend(np.asarray(masked_max_values))
        _,label_pred  = self.idx_to_label(pred_labels, true_labels)
        label_pred = label_pred[0][:length] 
        pred_logits = pred_logits[0][:length]
        words = word_tokenize(text)
        assert len(label_pred) == len(words)
        zip_val = zip(words, label_pred)
        
        output = [{"word":word,"tag":label} for  word, label in zip_val]

        logger.info(f"Labels predicted are {label_pred}")
        logger.info(f"with a confidence of {pred_logits}")
        return output

In [210]:
text = "Meriem is in New york"
model_dir = ""
Nermodel = Ner(model_dir)
output = Nermodel.predict(text)

04/29/2023  - INFO -  Model weights restored
04/29/2023  - INFO -  Labels predicted are ['B-PER', 'O', 'O', 'B-LOC', 'I-LOC']
04/29/2023  - INFO -  with a confidence of [0.9979875  0.9996989  0.9999825  0.99991417 0.9973127 ]


In [212]:
print(output)

[{'word': 'Sara', 'tag': 'B-PER'}, {'word': 'is', 'tag': 'O'}, {'word': 'in', 'tag': 'O'}, {'word': 'New', 'tag': 'B-LOC'}, {'word': 'york', 'tag': 'I-LOC'}]


- Dropout + L2 regularization

In [135]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.feature_selection import SelectKBest, mutual_info_classif

class TFNerAm(tf.keras.Model):
    def __init__(self, max_seq_len, embed_input_dim, embed_output_dim, num_labels, weights):
        super(TFNerAm, self).__init__() 
        self.embedding = layers.Embedding(input_dim=embed_input_dim, 
        output_dim=embed_output_dim, weights=weights,    
        input_length=max_seq_len, trainable=False, mask_zero=True)
        #On ajoute le dropout
        self.dropout = layers.Dropout(0.2)
        #L2 regularization
        self.bilstm = layers.Bidirectional(layers.LSTM(128,kernel_regularizer=tf.keras.regularizers.l2(0.1),
        return_sequences=True))
        self.dense = layers.Dense(num_labels)

    def call(self, inputs):
        x = self.embedding(inputs) # batchsize, max_seq_len, embedding_output_dim
        x = self.dropout(x)
        x = self.bilstm(x) #batchsize, max_seq_len, hidden_dim_bilstm
        logits = self.dense(x) #batchsize, max_seq_len, num_labels
        return logits

In [136]:
model2 = TFNerAm(max_seq_len=max_seq_len,embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels)

In [159]:
def train_step_fn2(sentences_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits = model2(sentences_batch)
        loss = scce(labels_batch, logits)
    grads = tape.gradient(loss, model2.trainable_variables)
    optimizer.apply_gradients(list(zip(grads,   
    model2.trainable_variables)))
    return loss, logits

In [160]:
def valid_step_fn2(sentences_batch, labels_batch):
    logits = model2(sentences_batch)
    loss = scce(labels_batch, logits)
    return loss, logits

In [161]:
for epoch in epoch_bar:
    for sentences_batch, labels_batch in progress_bar(batched_train_dataset, total=train_pb_max_len,parent=epoch_bar):
        loss2, logits2 = train_step_fn2(sentences_batch, labels_batch)
        train_loss_metric(loss2)
    train_loss_metric.reset_states()

In [162]:
for sentences_batch, labels_batch in progress_bar(batched_valid_dataset, total=valid_pb_max_len, parent=epoch_bar):
    loss2, logits2 = valid_step_fn2(sentences_batch, labels_batch)
    valid_loss_metric.update_state(loss2)
    valid_loss_metric.reset_states()

In [163]:
model2.save_weights("model_weights2",save_format='tf')

In [164]:
test_model2 =  TFNerAm(max_seq_len=max_seq_len, embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels)

In [165]:
test_model2.load_weights("model_weights2")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x22b483d2970>

In [166]:
true_labels = []
pred_labels = []

In [167]:
for sentences_batch, labels_batch in progress_bar(batched_test_dataset, total=test_pb_max_len):
    logits = test_model2(sentences_batch)
    temp1 = tf.nn.softmax(logits)
    preds = tf.argmax(temp1, axis=2)
    true_labels.append(np.asarray(labels_batch))
    pred_labels.append(np.asarray(preds))

In [168]:
label_correct, label_pred = idx_to_label(pred_labels, true_labels, idx2Label)

In [169]:
from seqeval.metrics import classification_report
report = classification_report(label_correct, label_pred, digits=4)

In [170]:
report

'              precision    recall  f1-score   support\n\n         LOC     0.8280    0.8753    0.8510      1628\n        MISC     0.6741    0.6913    0.6826       703\n         ORG     0.7114    0.7781    0.7433      1663\n         PER     0.8770    0.8665    0.8717      1596\n\n   micro avg     0.7859    0.8208    0.8029      5590\n   macro avg     0.7726    0.8028    0.7871      5590\nweighted avg     0.7879    0.8208    0.8037      5590\n'