# Loading data from Masakhane folder

In [None]:
!pip install pandas keras plot_keras_history 

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
from auglib import augment_ner_iob_data
from auglib import read_format_iob_data, list_to_pd_format


In [None]:
# Locate the MasakhaNER 2.0 files stored under the `bbj` folder.
# Context: the corpus comes from the Masakhane community and follows the
# IOB data format of Ramshaw and Marcus (1995).
bbj_ner_path = Path('../data_source/masakhane-ner/MasakhaNER2.0/data/bbj')

# One text file per split, all living in the same folder.
dev_data_path, train_data_path, test_data_path = (
    bbj_ner_path.joinpath(f'{split}.txt') for split in ('dev', 'train', 'test')
)


In [None]:
# Parse the three IOB files. Each call returns four values; in the visible
# part of this notebook only the pandas table (`pd_*_data`) and the
# `only_*_ner_data` list are used afterwards.
(extracted_train_data, pd_train_data,
 only_train_ner_data, o_train_ner_data) = read_format_iob_data(train_data_path)
(extracted_test_data, pd_test_data,
 only_test_ner_data, o_test_ner_data) = read_format_iob_data(test_data_path)
(extracted_dev_data, pd_dev_data,
 only_dev_ner_data, o_dev_ner_data) = read_format_iob_data(dev_data_path)

# A few preliminary statistics: sentence count and row (word) count per split.
for split_name, split_frame in (
    ("train", pd_train_data),
    ("test", pd_test_data),
    ("dev", pd_dev_data),
):
    sentence_total = split_frame["sentence_id"].nunique()
    word_total = split_frame.shape[0]
    print(f"Total number of sentences in the {split_name} dataset: {sentence_total:,}")
    print(f"Total words in the {split_name} dataset: {word_total:,}")



In [None]:
# Run the augmentation helper on the training sentences, then compare the
# number of items before and after.
augmented_train_ner_data = augment_ner_iob_data(only_train_ner_data)

train_size = len(only_train_ner_data)
augmented_size = len(augmented_train_ner_data)
print(f' Train data: {train_size} \n Augmented data: {augmented_size}')

In [None]:
# Convert the augmented sentence list to the pandas layout used in the rest
# of the notebook (the `word`, `iob_tag` and `sentence_id` columns are read
# further down).
pd_augment_ner_iob_data = list_to_pd_format(augmented_train_ner_data)
# Last expression of the cell: shows the resulting table in the notebook.
pd_augment_ner_iob_data

# Analyzing data 

In [None]:
# quantification des tags sur le corpus d'entrainement
from matplotlib import pyplot as plt
from collections import Counter
import numpy as np

In [None]:

# Bar chart of how often each entity tag occurs, leaving out the 'O' tag.
entity_rows = pd_augment_ner_iob_data.iob_tag != 'O'
entity_tag_counts = pd_augment_ner_iob_data.loc[entity_rows, "iob_tag"].value_counts()
entity_tag_counts.plot(kind="bar", figsize=(20, 15))

# Original author's observation (not re-checked here): the date tag accounts
# for about 40% of the tags, counting both the I and B positions.

In [None]:

# Number of words per sentence, one row per sentence_id, plus a histogram.
word_counts = pd_augment_ner_iob_data.groupby("sentence_id")["word"].agg(["count"])
word_counts = word_counts.rename(columns={"count": "Word count"})
word_counts.hist(bins=15, figsize=(8,6))

# Original author's observation (not re-checked here): sentences contain
# about 15 words on average.

# Select the column by label. The previous `word_counts.max()[0]` indexed a
# label-indexed Series by position, which pandas has deprecated. The value is
# cast to a plain int because it is reused later as a padding length and as
# the model input shape.
MAX_LENGTH_SENTENCE = int(word_counts["Word count"].max())
print("La phrase la plus longue contient {} mots.".format(MAX_LENGTH_SENTENCE))

# First sentence (in index order) that reaches the maximum length.
longest_sentence_id = word_counts[word_counts["Word count"]==MAX_LENGTH_SENTENCE].index[0]
print("ID de la plus longue phrase est: {}.".format(longest_sentence_id))

# Rebuild the sentence text by joining its words with single spaces.
longest_sentence = pd_augment_ner_iob_data[pd_augment_ner_iob_data["sentence_id"]==longest_sentence_id]["word"].str.cat(sep=' ')
print(f"\nLa phrase la plus longue du corpus est: \n {longest_sentence}")


In [None]:
# Distinct words and tags of the augmented training data.
# They are sorted because the position in these lists becomes the integer id
# of each word/tag in the index dictionaries built from them. A bare
# `list(set(...))` has no stable order for strings across interpreter runs,
# so the ids would change from one run to the next.
# NOTE(review): sorting assumes every value is a string — confirm that the
# `word` and `iob_tag` columns contain no missing values.
all_words = sorted(set(pd_augment_ner_iob_data["word"].values))
all_tags = sorted(set(pd_augment_ner_iob_data["iob_tag"].values))

print("Nombre de mots uniques: {}".format(pd_augment_ner_iob_data["word"].nunique()))
print("Nombre de tags uniques : {}".format(pd_augment_ner_iob_data["iob_tag"].nunique()))

# Feature engineering

In [None]:
# Word <-> index maps. Ids 0 and 1 are reserved for the unknown-word and
# padding markers, so real words start at 2.
word2index = {word: idx + 2 for idx, word in enumerate(all_words)}
word2index["--UNKNOWN_WORD--"] = 0
word2index["--PADDING--"] = 1
index2word = {idx: word for word, idx in word2index.items()}

# Tag <-> index maps. Real tags start at 1 and padding takes the free id 0.
# Padding used to be assigned 1 as well, which is also the id of the first
# real tag: padded positions were then indistinguishable from that tag and
# `index2tag` lost it.
tag2index = {tag: idx + 1 for idx, tag in enumerate(all_tags)}
tag2index["--PADDING--"] = 0
index2tag = {idx: tag for tag, idx in tag2index.items()}

# Round-trip check on one sample word. A word that is not in the vocabulary
# falls back to the unknown-word id instead of raising KeyError.
test_word = "André"
test_word_idx = word2index.get(test_word, word2index["--UNKNOWN_WORD--"])
test_word_lookup = index2word[test_word_idx]
print("L'index du mot {} est {}.".format(test_word, test_word_idx))
print("Le mot avec l'index {} est {}.".format(test_word_idx, test_word_lookup))

In [None]:
# Peek at one augmented sentence (position 5) to inspect its structure.
augmented_train_ner_data[5]

In [None]:
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences


# Step 1 - split every sentence into its words (item[0]) and tags (item[1]).
X_words = [[token[0] for token in sentence] for sentence in augmented_train_ner_data]
y_tags = [[token[1] for token in sentence] for sentence in augmented_train_ner_data]
print("X_words[10]:", X_words[10])
print("y_tags[10]:", y_tags[10])

# Step 2 - replace words and tags by their integer ids.
X_words = [[word2index[word] for word in sentence] for sentence in X_words]
y_tags = [[tag2index[tag] for tag in sentence] for sentence in y_tags]
print("\nword2index - X_words[10]:", X_words[10])
print("tag2index - y_tags[10]:", y_tags[10])

# Step 3 - right-pad every sequence to the longest sentence length, using the
# dedicated padding ids.
X_words = pad_sequences(X_words, maxlen=MAX_LENGTH_SENTENCE, padding='post', value=word2index["--PADDING--"])
y_tags = pad_sequences(y_tags, maxlen=MAX_LENGTH_SENTENCE, padding='post', value=tag2index["--PADDING--"])


print("\npadding - X_words[10]: ", len(X_words[10]),  X_words[10])
print("padding - y_tags[10]: ", len(y_tags[10]), y_tags[10])

# Step 4 - one-hot encode the tag ids. The identity matrix is built once and
# each sentence picks its rows from it (it used to be rebuilt per sentence).
TAG_COUNT = len(tag2index)
one_hot_rows = np.eye(TAG_COUNT)
y_tags = [one_hot_rows[sentence] for sentence in y_tags]

print("to categorical - y_tags[10]:", y_tags[10])


In [None]:
# Sanity check: there should be as many label sequences as input sequences.
print(len(y_tags), len(X_words))

# Materialise inputs and labels as arrays for training, then show their shapes.
X_train = np.array(X_words)
y_train = np.array(y_tags)

print(X_train.shape, y_train.shape)


# Modelling

In [None]:
!pip uninstall keras tensorflow -y

In [None]:
!pip3 install plot_keras_history keras tensorflow_addons tensorflow
!pip3 install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!pip3 install keras_preprocessing 

In [None]:
# NOTE(review): this removes the keras-contrib package that an earlier cell
# installs from GitHub, yet the import cell further down still imports
# `keras_contrib`. Presumably left over from troubleshooting — confirm the
# intended cell order before running it.
!pip3 uninstall keras-contrib -y

In [None]:
!export TF_CPP_MIN_LOG_LEVEL="2"
!sudo apt-get install -y --no-install-recommends libnvinfer6=6.0.1-1+cuda11.0 \
    libnvinfer-dev=6.0.1-1+cuda11.0 \
    libnvinfer-plugin6=6.0.1-1+cuda11.0

In [None]:
import pickle
import operator
import re
import string
import matplotlib.pyplot as plt

#from plot_keras_history import plot_history
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from keras_contrib.utils import save_load_utils

from keras import layers
from keras import optimizers

from keras.models import Model
from keras.losses import CategoricalCrossentropy
from keras import Input

from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

from keras_contrib.layers import CRF
#from tensorflow_addons.layers import CRF
from keras_contrib import losses
from keras_contrib import metrics

In [None]:
# Hyperparameters of the tagger defined in the next cells.
# Vocabulary size, including the two reserved entries (unknown word, padding);
# passed as the first argument of the Embedding layer.
WORD_COUNT = len(index2word)
# Second argument of the Embedding layer (size of each word vector).
DENSE_EMBEDDING = 50
# Units of the LSTM wrapped in the Bidirectional layer.
LSTM_UNITS = 10
# Passed as `recurrent_dropout` of that LSTM.
LSTM_DROPOUT = 0.1
# Units of the time-distributed Dense layer placed before the CRF.
DENSE_UNITS = 20
# `batch_size` and `epochs` arguments of `fit`.
BATCH_SIZE = 20
MAX_EPOCHS = 50

In [None]:
# Bi-LSTM + CRF tagger: padded word ids -> embedding -> bidirectional LSTM
# -> per-timestep dense layer -> CRF layer with TAG_COUNT units.
input_layer = layers.Input(shape=(MAX_LENGTH_SENTENCE,))

model = layers.Embedding(WORD_COUNT, DENSE_EMBEDDING, embeddings_initializer="uniform", 
                                input_length=MAX_LENGTH_SENTENCE)(input_layer)
# return_sequences=True keeps one output per timestep, as required for
# token-level tagging.
model = layers.Bidirectional(layers.LSTM(LSTM_UNITS, recurrent_dropout=LSTM_DROPOUT, 
                            return_sequences=True))(model)
model = layers.TimeDistributed(layers.Dense(DENSE_UNITS, activation="relu"))(model)

# The labels prepared earlier in this notebook are one-hot encoded, so the CRF
# must not be declared sparse. With sparse_target=True the keras-contrib loss
# and accuracy read only channel 0 of each label vector as if it were the
# class id, and the model trains on wrong targets.
crf_layer = CRF(units=TAG_COUNT, sparse_target=False)
output_layer = crf_layer(model)

ner_model = Model(input_layer, output_layer)

# CRF-specific loss and accuracy from keras-contrib; Adam optimizer.
loss = losses.crf_loss
acc_metric = metrics.crf_accuracy
opt = optimizers.Adam(learning_rate=1e-3)

ner_model.compile(optimizer=opt, loss=loss, metrics=[acc_metric])
ner_model.summary()


In [None]:
# Save the weights whenever the monitored accuracy improves.
# `monitor` has to be the *name* under which Keras logs the metric, not the
# metric function itself; a function metric is logged under its function name.
# The filename template must use that same key: `{val_accuracy}` never appears
# in the logs here (training runs without validation data and the metric is
# not called "accuracy"), so formatting the filename could not succeed.
monitored_metric = acc_metric.__name__
filepath = "ner-bi-lstm-td-model-{" + monitored_metric + ":.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor=monitored_metric, verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [None]:
# Train the tagger on the prepared arrays. The checkpoint callback runs at the
# end of every epoch, and verbose=2 prints one line per epoch.
history = ner_model.fit(
    x=X_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    verbose=2,
    callbacks=callbacks_list,
)
