# 1, Data preprocessing

### 1, Load and extract the data

In [1]:
### extracting the entities and tags

import os 
from keras.preprocessing.sequence import pad_sequences


def split_text_label(filename):
    '''
    Reads a file named filename, extracts the text and the labels and stores
    them in an array.
     
    returns [ ['EU', 'B-ORG'], ['rejects', 'O'], ['German', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['British', 'B-MISC'], ['lamb', 'O'], ['.', 'O'] ] 
    '''
    
    # open file
    f = open(filename)
    
    # initializing
    split_labeled_text = []
    sentence = []
    
    # processing line by line 
    for line in f:
        if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\t":
            if len(sentence) > 0:
                split_labeled_text.append(sentence)
                sentence = []
            continue
        
        # split by tab
        splits = line.split('	')
        # rstrip: strip from the right 
        sentence.append([splits[0],splits[-1].rstrip("\n")])
        
        
    if len(sentence) > 0:
        split_labeled_text.append(sentence)
        sentence = []
    return split_labeled_text



### load in the train, valid and test set

In [None]:
### load in the train, valid and test set

# ===== train_10_25.txt =====
# ===== valid_10_25.txt =====
# ===== test_10_25.txt =====
split_train = split_text_label(os.path.join("train_10_25.txt"))
split_valid = split_text_label(os.path.join("valid_10_25.txt"))
split_test = split_text_label(os.path.join("test_10_25.txt"))

In [2]:
# print some list to see 

print("train.txt: ", split_train[0:2])
print()

print("valid.txt: ", split_valid[0:1])
print()

print("test.txt: ", split_test[0:1])
print()

train.txt:  [[['literature', 'O'], ['linking', 'O'], ['the', 'O'], ['effects', 'O'], ['of', 'O'], ['prepartum', 'B-PER'], ['nutrition', 'B-NUT'], ['and', 'O'], ['subsequent', 'O'], ['fertility', 'O'], ['is', 'O'], ['scarce', 'O']], [['most', 'O'], ['of', 'O'], ['what', 'O'], ['is', 'O'], ['suggested', 'O'], ['to', 'O'], ['optimize', 'O'], ['future', 'O'], ['fertility', 'O'], ['is', 'O'], ['related', 'O'], ['to', 'O'], ['relationships', 'O'], ['between', 'O'], ['metabolic', 'B-DIS'], ['disorders', 'I-DIS'], ['and', 'O'], ['risk', 'O'], ['for', 'O'], ['delayed', 'B-DIS'], ['conception', 'I-DIS']]]

valid.txt:  [[['we', 'O'], ['will', 'O'], ['see', 'O'], ['in', 'B-PER'], ['the', 'O'], ['next', 'O'], ['sections', 'O'], ['some', 'O'], ['of', 'O'], ['the', 'O'], ['etiology', 'O'], ['and', 'O'], ['preventive', 'O'], ['measures', 'O'], ['for', 'O'], ['these', 'O'], ['problems', 'O'], ['and', 'O'], ['the', 'O'], ['nutritional', 'O'], ['considerations', 'O'], ['related', 'O'], ['to', 'O'], ['the

### 2, Builing Vocabulary

### build a vocabulary for the text, so we can assign a unique index for each word

In [3]:
### use labelSet() as label dictionary 
### use wordSet() as word dictionary 

labelSet = set()
wordSet = set()
# words and labels
for data in [split_train, split_valid, split_test]:
    for labeled_text in data:
        for word, label in labeled_text:
            
            # modifying unwanted mistag
            if label == 'i-NUT':
                label = 'I-NUT'
            if label == '':
                label = 'O'
            
            
            labelSet.add(label)
            wordSet.add(word.lower())
            


# modifying the label set 
print()
#empty = labelSet.pop()
#labelSet.remove('')
#labelSet.remove('BI-DIS')
#labelSet.remove('i-NUT')

# check for the len 
print("labelSet: ", len(labelSet))
print("labelSet: ", labelSet)
print()
print("wordSet: ", len(wordSet))



labelSet:  7
labelSet:  {'I-NUT', 'B-DIS', 'O', 'B-NUT', 'I-DIS', 'B-PER', 'I-PER'}

wordSet:  2012


### 3, Assiging index 

In [8]:
### label2Idx  word2Idx


# sort the set to ensure '0' is assigned to 0
sorted_labels = sorted(list(labelSet), key=len)


# create mapping for labels
label2Idx = {}
for label in sorted_labels:
    label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}


# create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
    word2Idx["PADDING_TOKEN"] = len(word2Idx) # as 0
    word2Idx["UNKNOWN_TOKEN"] = len(word2Idx) # as 1
for word in wordSet:
    word2Idx[word] = len(word2Idx)
    
# print some mapping 

print("idx2Label: ", len(idx2Label))
print("idx2Label: ", idx2Label)
print()
print("word2Idx: ", len(word2Idx))
print("word2Idx: ", word2Idx)

idx2Label:  7
idx2Label:  {0: 'O', 1: 'I-NUT', 2: 'B-DIS', 3: 'B-NUT', 4: 'I-DIS', 5: 'B-PER', 6: 'I-PER'}

word2Idx:  2014
word2Idx:  {'PADDING_TOKEN': 0, 'UNKNOWN_TOKEN': 1, 'depression': 2, 'aims': 3, 'canada': 4, 'hours': 5, 'contents': 6, 'consumed': 7, 'lactation': 8, 'outcomes': 9, 'wk': 10, 'prior': 11, 'begin': 12, 'acquired': 13, 'scientific': 14, 'calving': 15, 'yield': 16, 'house': 17, 'facing': 18, 'stressors': 19, 'quality': 20, '4000': 21, 'occurs': 22, 'given': 23, 'time': 24, 'british': 25, 'avoiding': 26, 'et': 27, 'offer': 28, '1968': 29, 'linked': 30, 'dangerous': 31, 'be': 32, 'utilized': 33, 'fermentation': 34, '28': 35, 'trace': 36, 'primigravid': 37, '4': 38, 'with': 39, 'coming': 40, 'ferguson': 41, 'represent': 42, 'calculate': 43, '2021': 44, 'delivery': 45, '2009': 46, 'cleaning': 47, 'statusoriented': 48, 'identification': 49, 'ketones': 50, 'barley': 51, 'passage': 52, 'inflammation': 53, 'concentrates': 54, 'represents': 55, 'systemic': 56, 'nonfiber': 57

### 4, Change words into representive index

In [9]:
def createMatrices(data, word2Idx, label2Idx):
    sentences = []
    labels = []
    
    # get data 
    for split_labeled_text in data:
        wordIndices = []
        labelIndices = []
        
        # get word and data
        for word, label in split_labeled_text:
            
            # if is in the vocabulary
            if word in word2Idx:
                wordIdx = word2Idx[word]
                
            # if the lower case version is in the vocabulary
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()] 
                
            # if not, assign to the unknown token 
            else:                
                wordIdx = word2Idx['UNKNOWN_TOKEN']
                
            # assign to the corrsponding index
            wordIndices.append(wordIdx)
            
            # fixing a bug of '' 
            if label != '':
                labelIndices.append(label2Idx[label])
    
        # append the index to sentences
        sentences.append(wordIndices)
        labels.append(labelIndices)
    return sentences, labels

train_sentences, train_labels = createMatrices(split_train, word2Idx, label2Idx)
valid_sentences, valid_labels = createMatrices(split_valid, word2Idx, label2Idx)
test_sentences, test_labels = createMatrices(split_test, word2Idx, label2Idx)

# print to check 

print("train_sentences, train_labels: ", train_sentences[0:5], train_labels[0:5])
print()
print("valid_sentences, valid_labels: ", valid_sentences[0:5], valid_labels[0:5])
print()
print("test_sentences, test_labels: ", test_sentences[0:5], test_labels[0:5])

train_sentences, train_labels:  [[1098, 371, 63, 812, 163, 450, 166, 1241, 1333, 1759, 961, 957], [1179, 163, 1748, 961, 1820, 1703, 1207, 1471, 1759, 961, 416, 1703, 890, 1677, 1202, 1236, 1241, 998, 1408, 224, 1046], [1754, 1202, 683, 834, 1015, 1558, 181, 1540, 1425, 404, 1631, 730, 856, 369, 307, 1638, 1241, 511, 1090, 1629, 931, 1703, 623, 63, 188, 163, 88, 1348, 413, 1241, 1871, 182, 163, 2008, 1678], [313, 163, 63, 1348, 1079, 163, 63, 196, 450, 140, 1868, 1585, 1703, 1015, 80, 1194, 1962, 859, 27, 1865], [1913, 1241, 181, 846, 310]] [[0, 0, 0, 0, 0, 5, 3, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 2, 4], [0, 2, 4, 0, 0, 0, 5, 0, 0, 0, 2, 4, 4, 2, 4, 4, 0, 2, 4, 0, 0, 0, 0, 0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0], [0, 0, 5, 0, 0]]

valid_sentences, valid_labels:  [[1231, 663, 394, 1919, 63, 268, 731, 1442, 163, 63, 116, 1241, 1595, 586, 1408, 735, 683, 1241, 63, 516, 1639, 416, 1703, 735, 11

### 5, Pad the sentence into the same length

### for fast computation

In [10]:
### padding with packages 'pad_sequences'

# ========== set the max length ========== 
max_seq_len = 128

def padding(sentences, labels, max_len, padding='post'):
    padded_sentences = pad_sequences(sentences, max_len,       
    padding='post')
    padded_labels = pad_sequences(labels, max_len, padding='post')
    return padded_sentences, padded_labels

train_features, train_labels = padding(train_sentences, train_labels, max_seq_len, padding='post' )
valid_features, valid_labels = padding(valid_sentences, valid_labels, max_seq_len, padding='post' )
test_features, test_labels = padding(test_sentences, test_labels, max_seq_len, padding='post' )

# check for the results 
print(len(train_features[1]) == len(train_features[2]))
print()
print("train_features, train_labels: ", train_features[0:1], train_labels[0:1])
print()
print("valid_features, valid_labels: ", valid_features[0:1], valid_labels[0:1])
print()
print("test_features, test_labels: ", test_features[0:1], test_labels[0:1])

True

train_features, train_labels:  [[1098  371   63  812  163  450  166 1241 1333 1759  961  957    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]] [[0 0 0 0 0 5 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

valid_features, valid_labels:  [[1

# 2, Using the Global Vector Word Embedding

### Glove is a vector for word embedding, the machine understand the words by the location of them in the vector space

![image.png](attachment:image.png)

In [11]:
import numpy as np

# set the dimension for each word 
EMBEDDING_DIM = 100

# loading glove embeddings
embeddings_index = {}
f = open('embeddings/glove.6B.100d.txt', encoding="utf-8")
for line in f:
    values = line.strip().split(' ')
    word = values[0] # the first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') #100d vectors  representing the word
    embeddings_index[word] = coefs
f.close()
embedding_matrix = np.zeros((len(word2Idx), EMBEDDING_DIM))

# word embeddings for the tokens
for word,i in word2Idx.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# 3, Training with tensorflow

### 1, Batching and shuffling the dataset

In [12]:
### using tf.data.Dataset.from_tensor_slices 

import tensorflow as tf
from tensorflow.keras import layers

train_batch_size = 32
valid_batch_size = 64
test_batch_size = 64


train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_features, valid_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

# shuffling the training dataset 
shuffled_train_dataset = train_dataset.shuffle(buffer_size=train_features.shape[0], reshuffle_each_iteration=True)

# batching the three datasets
batched_train_dataset = shuffled_train_dataset.batch(train_batch_size, drop_remainder=True) # drop_remiainder: ignore the last batch if nesscery
batched_valid_dataset = valid_dataset.batch(valid_batch_size, drop_remainder=True)
batched_test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True)

# checking 
print("batched_train_dataset: ", batched_train_dataset)
print("batched_valid_dataset: ", batched_valid_dataset)
print("batched_test_dataset: ", batched_test_dataset)

batched_train_dataset:  <BatchDataset shapes: ((32, 128), (32, 128)), types: (tf.int32, tf.int32)>
batched_valid_dataset:  <BatchDataset shapes: ((64, 128), (64, 128)), types: (tf.int32, tf.int32)>
batched_test_dataset:  <BatchDataset shapes: ((64, 128), (64, 128)), types: (tf.int32, tf.int32)>


### 2, Bi-direction Long-Short-Term Memory Neural Network (Bi-LSTM)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

In [13]:
### class TFNer

class TFNer(tf.keras.Model):
    
    def __init__(self, max_seq_len, embed_input_dim, embed_output_dim, num_labels, weights):
        super(TFNer, self).__init__()
        
        # Embedding layer
        self.embedding = layers.Embedding(input_dim=embed_input_dim, 
                                          output_dim=embed_output_dim, 
                                          weights=weights, 
                                          input_length=max_seq_len, 
                                          trainable=False, 
                                          mask_zero=True)
        
        # Bidrectional layer
        self.bilstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))
        
        # Dense layer
        self.dense = layers.Dense(num_labels)

    def call(self, inputs):
        x = self.embedding(inputs) # batchsize, max_seq_len, embedding_output_dim
        x = self.bilstm(x) # batchsize, max_seq_len, hidden_dim_bilstm
        logits = self.dense(x) # batchsize, max_seq_len, num_labels
        
        # return a logist score
        return logits

### 3, Define the optimizer: Adam 
### and losses: SparseCategoricalCrossentropy

In [14]:
num_labels = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}

# model
model = TFNer(max_seq_len=max_seq_len,embed_input_dim=len(word2Idx), embed_output_dim=EMBEDDING_DIM, weights=[embedding_matrix], num_labels=num_labels)

# optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# losses: scce
scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [15]:
# !pip install fastprogress
# !pip install seqeval

# 3, Training 

In [120]:
# os
import os

# math
import math

# pickle: serializes objects so they can be saved to a file, 
# and loaded in a program again later on
import pickle

# logging: use 'logger' to log messages 
# have 5 levels: DEBUG, INFO, WARNING, ERROR, CRITICAL
import logging

# to create the the command line interface
import argparse

# itertoole
import itertools

# numpy
import numpy as np

# tf
import tensorflow as tf 

# from model import TFNer

# pad_sequences
from keras.preprocessing.sequence import pad_sequences

# master_bar, progress_bar 
from fastprogress.fastprogress import master_bar, progress_bar

# from preprocess import split_text_label, padding, createMatrices

# seqeval: for sequence labeling 
from seqeval.metrics import classification_report

### 1, Set the logging and progress bar 

In [122]:
### fastprogress

# ========== num of epochs ==========
epochs = 10

# train_pd_max_len, valid_pb_max_len, test_pb_max_len
epoch_bar = master_bar(range(epochs))
train_pb_max_len = math.ceil(float(len(train_features))/float(train_batch_size))
valid_pb_max_len = math.ceil(float(len(valid_features))/float(valid_batch_size))
test_pb_max_len = math.ceil(float(len(test_features))/float(test_batch_size))

In [123]:
### tf.summary.create_file_writer: a trianing log 

train_log_dir = f"/logs"
valid_log_dir = f"/logs"
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
valid_summary_writer = tf.summary.create_file_writer(valid_log_dir)

### 2, Custom training loop 

In [124]:
# training loss
train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)

# validing loss
valid_loss_metric = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32)

# customize training loop
def train_step_fn(sentences_batch, labels_batch):
    with tf.GradientTape() as tape:
        logits = model(sentences_batch) # batchsize, max_seq_len, num_labels
        loss = scce(labels_batch, logits) # batchsize,max_seq_len
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(list(zip(grads, model.trainable_variables)))
    return loss, logits

# customize validing loop 
def valid_step_fn(sentences_batch, labels_batch):
    logits = model(sentences_batch)
    loss = scce(labels_batch, logits)
    return loss, logits


for epoch in epoch_bar:
    
    # customize training epoch bar 
    with train_summary_writer.as_default():
        for sentences_batch, labels_batch in progress_bar(
            batched_train_dataset, 
            total=train_pb_max_len, 
            parent=epoch_bar):

            loss, logits = train_step_fn(sentences_batch, labels_batch)
            
            train_loss_metric(loss)
            
            epoch_bar.child.comment = f'training loss : {train_loss_metric.result()}'
        
        tf.summary.scalar('training loss', train_loss_metric.result(), step=epoch)
        train_loss_metric.reset_states()
    
    # customize validing epoch bar 
    with valid_summary_writer.as_default():
        for sentences_batch, labels_batch in progress_bar(
            batched_valid_dataset, 
            total=valid_pb_max_len, 
            parent=epoch_bar):
            
            loss, logits = valid_step_fn(sentences_batch, labels_batch)
            
            valid_loss_metric.update_state(loss)

            epoch_bar.child.comment = f'validation loss : {valid_loss_metric.result()}'

        # logging after each epoch !
        tf.summary.scalar('valid loss', valid_loss_metric.result(), step=epoch)
        valid_loss_metric.reset_states()

model.save_weights(f"./model_weights",save_format='tf')  
logger.info(f"Model weights saved")

10/25/2021  - INFO -   Model weights saved


# 4, Evaluating model performance on the test dataset

### 1, two functions

In [125]:

### assgining the label to each word 

def idx_to_label(predictions, correct, idx2Label):
    """
    input: 
        predictions: predicitons of the model (idx)
        correct: targets from the text (idx)
    output: 
        label_correct: predicitons of the model (label)
        label_pred: targets from the text (label)
    """
    label_pred = []    
    for sentence in predictions:
        for i in sentence:
            label_pred.append([idx2Label[elem] for elem in i ]) 

    label_correct = []  
    if correct != None:
        for sentence in correct:
            for i in sentence:
                label_correct.append([idx2Label[elem] for elem in i ]) 
        
    return label_correct, label_pred

### predict each sentence: using pad_sequences

def predict_single_sentence(sentence, word2Idx, max_seq_len):
    """
    input: 
        sentence: string
        word2Idx: function
        max_seq_len: int
    output: 
        length: int
        masks: array of 1 and 0
        padded_inputs: array of padded index for the sentence 
    """
    sentence = list(sentence.split(" "))
    sentences = []
    wordIndices = []
    masks = []
    length = len(sentence)

    # assining index to the sentence
    for word in sentence:
        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]                 
        else:                
            wordIdx = word2Idx['UNKNOWN_TOKEN']
        wordIndices.append(wordIdx)

    # len of mask = len of sentence 
    maskindices = [1]*len(wordIndices)

    # append
    sentences.append(wordIndices)
    masks.append(maskindices)

    padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
        sentences, 
        maxlen=max_seq_len, 
        padding="post")

    masks = tf.keras.preprocessing.sequence.pad_sequences(
        masks, 
        maxlen=max_seq_len, 
        padding="post")

    return length, masks, padded_inputs

    

### 2, Loading the model 

In [127]:

test_batch_size = 64

# padding sentences and labels to max_length of 128
max_seq_len = 128
EMBEDDING_DIM = 100


# idx2Label = pickle.load(open(os.path.join("idx2Label.pkl"), 'rb'))
label2Idx = {v:k for k,v in idx2Label.items()}
num_labels = len(label2Idx)


# word2Idx = pickle.load(open(os.path.join("word2Idx.pkl"), 'rb'))
# Embedding_matrix = pickle.load(open(os.path.join("embedding.pkl"), 'rb'))
logger.info("Loaded idx2Label, word2Idx and Embedding matrix pickle files")

# loading the model
testmodel =  TFNer(
    max_seq_len=max_seq_len, 
    embed_input_dim=len(word2Idx), 
    embed_output_dim=EMBEDDING_DIM, 
    weights=[embedding_matrix], 
    num_labels=num_labels)

# testmodel.load_weights(f"{args.model_dir}/model_weights")
testmodel.load_weights("./model_weights")
logger.info("Model weights restored")


    



10/25/2021  - INFO -   Loaded idx2Label, word2Idx and Embedding matrix pickle files
10/25/2021  - INFO -   Model weights restored


### 3, Evaluation on the test set 

In [128]:
### processing the test set 

# extract from the txt
split_test = split_text_label("test.txt")

# indexing 
test_sentences, test_labels = createMatrices(split_test, word2Idx, label2Idx)

# padding
test_features, test_labels = padding(test_sentences, test_labels, max_seq_len, padding='post' )

# logger
logger.info(f"Test features shape is {test_features.shape} and labels shape is{test_labels.shape}")

# input to tesorflow
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))

# batching
batched_test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True)

###  checking 

# print(test_features[0:5], test_labels[0:5])
print()
print(batched_test_dataset)

# print(test_sentences[0:5])

10/25/2021  - INFO -   Test features shape is (172, 128) and labels shape is(172, 128)



<BatchDataset shapes: ((64, 128), (64, 128)), types: (tf.int32, tf.int32)>


In [129]:
### output the test set evaluation results 


from seqeval.metrics import f1_score


### epoch_bar = master_bar(range(epochs))

test_pb_max_len = math.ceil(float(len(test_features))/float(test_batch_size))

true_labels = []
pred_labels = []

#for sentences_batch, labels_batch in progress_bar(batched_test_dataset, total=test_pb_max_len):

logits = testmodel(sentences_batch)
temp1 = tf.nn.softmax(logits)   
preds = tf.argmax(temp1, axis=2)
true_labels.append(np.asarray(labels_batch))
pred_labels.append(np.asarray(preds))

label_correct, label_pred = idx_to_label(pred_labels, true_labels, idx2Label)

#print(preds)
#print(pred_labels, true_labels)
#print(label_correct, label_pred)
# f1_score(pred_labels, true_labels)
print(classification_report(label_correct, label_pred))
logger.info(f"\nResults for the test dataset") 
# logger.info(f"\n{report}")

print()
# print(label_pred[0:5])
print("The length of label_pred: ", len(label_pred))

10/25/2021  - INFO -   
Results for the test dataset


              precision    recall  f1-score   support

         DIS       0.68      0.72      0.70        54
         NUT       0.67      0.53      0.59        76
         PER       0.78      0.83      0.80        87

   micro avg       0.72      0.70      0.71       217
   macro avg       0.71      0.69      0.70       217
weighted avg       0.72      0.70      0.70       217


[['O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-PER', 'B-NUT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NUT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-DIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

### 4, Predictions

In [130]:
# import nltk
# ! pip install nltk 
# nltk.download()
# nltk.download('punkt')

### ========== load in the prediction text ==========
### tokenizing the prediction dataset as sentence

In [131]:
### tokenizing the prediction dataset 

from nltk import sent_tokenize, word_tokenize

# ========== load in the prediction text ==========
with open("prediction.txt", 'r', encoding = 'utf-8') as f:
    prediction_corpus = f.readlines()

# print(training_corpus[0:5])
# print(len(training_corpus))
# print()


# remove some specical objects 

import re

prediction_corpus = str(prediction_corpus)
prediction_corpus = re.sub(r"[^a-zA-Z0-9.?! ]+", "", prediction_corpus)
prediction_corpus = prediction_corpus.lower()

prediction_sentences = sent_tokenize(prediction_corpus)
prediction_words = word_tokenize(prediction_corpus)

# check and see 
print("prediction_sentences: ", prediction_sentences)
print("prediction_words: ", prediction_words)

['theres a lot that can go wrong during the transition phase said dr. mark van der list senior professional services veterinarian with boehringer ingelheim.',
 'their body undergoes many metabolic changes.',
 'its a highrisk period for dairy cows.',
 'diligent management techniques proper nutrition and monitoring can help mitigate potential problems.',
 'cows that undergo a successful transition may experience higher milk production a reduction in postcalving disorders and improved reproductive performance.1 n n consider including the following protocols on your operation for a successful transition period the closeup dry cow diet should be wellformulated and include quality feed ingredients.',
 'dry cows need a sufficient amount of protein vitamins and minerals in their diet to meet energy requirements without increasing their body condition score bcs dr. van der list stated.',
 'overconditioned cows are more likely to develop metabolic problems.',
 'n n we also want to supplement dry

### making prediction on the prediction dataset 

In [132]:
### make prediction on the prediction dataset 

# test_sentence = "Clinical CM is always associated with a drop in the dry matter intake of the cows in transition period"

def predict_text(test_sentence):
    
    length, masks, padded_inputs = predict_single_sentence(test_sentence, word2Idx, max_seq_len)
    padded_inputs = tf.expand_dims(padded_inputs, 0)

    true_labels = None
    pred_labels = []
    pred_logits = []

    for sentence in padded_inputs:
        logits = testmodel(sentence)
        temp1 = tf.nn.softmax(logits) 
        max_values = tf.reduce_max(temp1,axis=-1)

        masked_max_values = max_values * masks 
        preds = tf.argmax(temp1, axis=2)
        pred_labels.append(np.asarray(preds))
        pred_logits.extend(np.asarray(masked_max_values))

    _,label_pred  = idx_to_label(pred_labels, true_labels, idx2Label)

    # logger.info(f"Results for - \"{test_sentence}\"")

    label_pred = label_pred[0][:length] 
    pred_logits = pred_logits[0][:length]
    # logger.info(f"Labels predicted are {label_pred}")
    # logger.info(f"with a confidence of {pred_logits}")
    
    return label_pred

label_preds = []
for sentence in prediction_sentences: 
    label_pred = predict_text(sentence)
    label_preds.append(label_pred)
    
print("label_preds: ", label_preds[0:5])

# print(label_pred)

In [134]:
### check for the length 

prediction_labels = np.array(label_preds) 

prediction_labels = prediction_labels.flatten()

# check and print
print("The length of prediction_labels: ", len(prediction_labels))
print()
print("The length of prediction_sentences: ", len(prediction_sentences))

The length of prediction_labels:  32


The length of prediction_sentences:  32


  prediction_labels = np.array(label_preds)


### ========== name of the prediction output excel ==========

In [137]:
### output to an excel 

# as words, labels and pos

import pandas as pd
import openpyxl

df = pd.DataFrame([prediction_sentences,prediction_labels])

# transpose the columns and rows
d_f = df.T

# ========== name of the prediction output excel ==========
d_f.to_excel('Predictions.xlsx')

### 5, Prediction for a single sentence

In [146]:
test_sentence = """Periparturient diseases such as milk fever, ketosis, and displaced abomasums can all be associated with poor transition"""

length, masks, padded_inputs = predict_single_sentence(test_sentence, word2Idx, max_seq_len)
padded_inputs = tf.expand_dims(padded_inputs, 0)

true_labels = None
pred_labels = []
pred_logits = []

for sentence in padded_inputs:
    logits = testmodel(sentence)
    temp1 = tf.nn.softmax(logits) 
    max_values = tf.reduce_max(temp1,axis=-1)

    masked_max_values = max_values * masks 
    preds = tf.argmax(temp1, axis=2)
    pred_labels.append(np.asarray(preds))
    pred_logits.extend(np.asarray(masked_max_values))
_,label_pred  = idx_to_label(pred_labels, true_labels, idx2Label)

logger.info(f"Results for - \"{test_sentence}\"")

label_pred = label_pred[0][:length] 
pred_logits = pred_logits[0][:length]
logger.info(f"Labels predicted are {label_pred}")
logger.info(f"with a confidence of {pred_logits}")



### the splited sentences: words 



from nltk import word_tokenize 

# remove punct 
words = str(test_sentence)
words = re.sub(r"[^a-zA-Z0-9.?! ]+", "", words)
words = words.lower()

# tokenize
words = word_tokenize(words)

# check for length 
print(len(words) == len(label_pred))
print()
print(words)

10/25/2021  - INFO -   Results for - "Periparturient diseases such as milk fever, ketosis, and displaced abomasums can all be associated with poor transition"
10/25/2021  - INFO -   Labels predicted are ['O', 'B-DIS', 'O', 'O', 'B-DIS', 'I-DIS', 'O', 'O', 'B-DIS', 'I-DIS', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER']
10/25/2021  - INFO -   with a confidence of [0.7078945  0.7114652  0.99761343 0.9904805  0.61185145 0.46429113
 0.8660393  0.9973742  0.9363849  0.6337179  0.9999236  0.99977857
 0.99998033 0.99781954 0.9992217  0.9401367  0.8089987 ]


True

['periparturient', 'diseases', 'such', 'as', 'milk', 'fever', 'ketosis', 'and', 'displaced', 'abomasums', 'can', 'all', 'be', 'associated', 'with', 'poor', 'transition']


In [152]:
### a pretter output 

ner_word = []
ner_label = []
for i, j in zip(words, label_pred): 
    if j != 'O':
        ner_word.append(i)
        ner_label.append(j)
        
# check and see 
print("ner_word: ", ner_word) 
print()
print("ner_label: ", ner_label) 

ner_word:  ['diseases', 'milk', 'fever', 'displaced', 'abomasums', 'transition']

ner_label:  ['B-DIS', 'B-DIS', 'I-DIS', 'B-DIS', 'I-DIS', 'B-PER']
