In [1]:
import pandas as pd
import numpy as np
import math
import random
import nltk
import re
from nltk.corpus import stopwords
import string

import matplotlib.pyplot as plt
from operator import itemgetter
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter

import seaborn as sns

# CRF Tests

In [16]:
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import scorers,CRF
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite import metrics
from nltk.stem import WordNetLemmatizer


In [3]:
# Shared WordNet lemmatizer instance; word2features() calls its lemmatize() once per token.
lemmatizer = WordNetLemmatizer()

In [4]:
# Load the two pre-processed frames used for the CRF experiments from local parquet files.
# Only one of them is copied into the working frame in the next cell.
umn_df_ner = pd.read_parquet('umn_df_for_CRFner_40_0725.parquet')
medal_df_ner = pd.read_parquet('medal_df_for_CRFner_40_0725.parquet')

In [5]:
# Work on a copy so the frame loaded from parquet stays untouched.
df_wNER = umn_df_ner.copy()
# Swap in the line below to run the same experiment on the MeDAL frame instead.
#df_wNER = medal_df_ner.copy()
# Distinct values of the exploded ABV_final column; word2features() tests tokens against it
# for the 'word.isABV()' feature.
# NOTE: despite the name, .unique() returns a NumPy array, so every `in` test scans the whole array.
all_abvs_list = df_wNER.ABV_final.explode().unique()


### Feature creation

In [6]:
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    The dict holds surface features of the token itself (lower-cased form,
    last three / two characters, case and digit flags, membership in
    ``all_abvs_list``, lemma, length), followed by the lower-cased form and
    case flags of the previous and next token. A missing neighbour is
    flagged with ``'BOS'`` (no previous token) or ``'EOS'`` (no next token).

    Parameters
    ----------
    sent : sequence of str
        The tokens of one sentence.
    i : int
        Position of the token to describe.

    Returns
    -------
    dict
        Feature name -> feature value.
    """
    token = sent[i]

    # Features of the token itself (insertion order kept as-is).
    feats = {}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['word.isABV()'] = token in all_abvs_list
    feats['word.lemmatize()'] = lemmatizer.lemmatize(token)
    feats['word.length()'] = len(token)

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats


def sent2features(sent):
    """Return one word2features() dict per position of ``sent``, in order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the labels of ``sent`` as a new list, in their original order."""
    return list(sent)
    


In [7]:
from sklearn.model_selection import train_test_split

# One list of feature dicts per sentence. Sentences differ in length, so the
# lists are stored in an explicit 1-D object array: calling np.array(...) directly
# on ragged nested lists raises ValueError on NumPy >= 1.24 (and was deprecated
# before that). Filling element by element also keeps the array 1-D when every
# sentence happens to have the same length.
feature_seqs = [sent2features(s) for s in df_wNER['TEXT_clean_nostp']]
X = np.empty(len(feature_seqs), dtype=object)
for row, seq in enumerate(feature_seqs):
    X[row] = seq

# Label sequences, one entry per sentence, aligned with X.
y = np.array(df_wNER['NER_labels_words'].values)

# 80 % of the sentences go to training; the held-out 20 % is then split again,
# 70 % of it for testing and 30 % for validation. Fixed seeds keep the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.3, random_state=1)

X_train.shape, X_test.shape

((4732,), (828,))

In [9]:
# CRF tagger trained with L-BFGS; verbose=True prints the CRFsuite training log.
crf = CRF(
    verbose=True,
    algorithm='lbfgs',
    max_iterations=100,
    max_linesearch=100,
    c1=0.1,
    c2=0.1,
    all_possible_transitions=False,
)

In [10]:
# Fit the CRF on the training split only; the test and validation splits are not seen here.
crf.fit(X_train,y_train)

loading training data to CRFsuite: 100%|██████████| 4732/4732 [00:01<00:00, 2818.05it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 56389
Seconds required: 0.427

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 100

Iter 1   time=1.01  loss=106823.56 active=56283 feature_norm=1.00
Iter 2   time=0.52  loss=90692.12 active=55896 feature_norm=1.13
Iter 3   time=0.54  loss=68072.03 active=55450 feature_norm=1.45
Iter 4   time=0.52  loss=59644.62 active=38165 feature_norm=1.70
Iter 5   time=0.52  loss=54399.34 active=31688 feature_norm=1.95
Iter 6   time=0.52  loss=47390.31 active=28310 feature_norm=2.27
Iter 7   time=0.52  loss=29188.75 active=23059 feature_norm=4.20
Iter 8   time=0.52  loss=24353.05 active=22978 feature_norm=4.58
Iter 9   time=0.53  loss=18509.61 active=21691 feature_norm=6.05
Iter 10  tim



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=100, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=True)

In [11]:
# Predict one label sequence per held-out test sentence.
pred = crf.predict(X_test)

In [12]:
# Pair the CRF output with the gold label sequences, the latter converted to plain lists.
true_predictions = pred
true_labels = list(map(list, y_test))

In [None]:
from seqeval.metrics import f1_score as seq_f1
from seqeval.metrics import precision_score, recall_score, classification_report


# Macro-averaged F1, precision and recall as percentages rounded to two decimals.
# The three scorers share one call signature, so they are evaluated in sequence
# by a single generator and unpacked in the same order.
f1_actual, pre_actual, rec_actual = (
    np.round(scorer(true_labels, true_predictions, average='macro', scheme='token') * 100, 2)
    for scorer in (seq_f1, precision_score, recall_score)
)

In [14]:
# Display the macro F1, precision and recall (in percent) as the cell output.
f1_actual, pre_actual, rec_actual 

(75.92, 81.45, 75.44)

In [None]:
class_report = classification_report(true_labels, true_predictions, output_dict=True)

# Report rows left out of the support-weighted average: the aggregate rows and 'A_word'.
skipped_rows = ('micro avg', 'macro avg', 'weighted avg', 'A_word')
kept_rows = [scores for lab, scores in class_report.items() if lab not in skipped_rows]

f1s = [scores['f1-score'] for scores in kept_rows]
precs = [scores['precision'] for scores in kept_rows]
recs = [scores['recall'] for scores in kept_rows]
weights = [scores['support'] for scores in kept_rows]

# Support-weighted F1, precision and recall over the remaining rows.
np.average(f1s, weights=weights), np.average(precs, weights=weights), np.average(recs, weights=weights)

# BiLSTM Tests

In [18]:
# Frame used for the BiLSTM experiments; the cells below read its TEXT_clean_nostp and
# NER_labels_words columns.
df_ner = pd.read_parquet('medal_df_max500v2_for_ner_1005_0726.parquet')

In [26]:
# Optional, MeDAL data only: uncomment the next line to lowercase every token first.
#df_ner['TEXT_clean_nostp'] = df_ner['TEXT_clean_nostp'].apply(lambda row: [word.lower() for word in row])

In [27]:
# Vocabulary: every distinct token, with the padding symbol appended last.
vocab = np.append(df_ner.TEXT_clean_nostp.explode().unique(), 'ENDPAD')
print(vocab)
# Tag set built the same way, so 'ENDPAD' ends up with the highest tag index.
unique_tags = np.append(df_ner.NER_labels_words.explode().unique(), 'ENDPAD')
print(len(unique_tags))
# Lookup tables: position in the arrays above -> integer id.
word2idx = dict(zip(vocab, range(len(vocab))))
tag2idx = dict(zip(unique_tags, range(len(unique_tags))))


['reduced' 'coenzyme' 'qcytochrome' ... 'origamilike' 'normalnormal'
 'ENDPAD']
1007


In [28]:
from keras.preprocessing.sequence import pad_sequences
# Every sentence is padded (at the end, with the 'ENDPAD' id) to this many positions.
max_len = 115

# Token ids per sentence, padded to max_len.
data = pad_sequences(
    [[word2idx[token] for token in sentence] for sentence in df_ner['TEXT_clean_nostp']],
    maxlen=max_len,
    padding='post',
    value=word2idx['ENDPAD'],
)

# Tag ids per sentence, padded the same way so they stay aligned with `data`.
tags = pad_sequences(
    [[tag2idx[tag] for tag in sentence_tags] for sentence_tags in df_ner['NER_labels_words']],
    maxlen=max_len,
    padding='post',
    value=tag2idx['ENDPAD'],
)

# 80 % train; the held-out 20 % is split again, 70 % test and 30 % validation.
X_train, X_test, y_train, y_test = train_test_split(data, tags, random_state=1, test_size=0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, random_state=1, test_size=0.3)

X_train.shape, X_test.shape

((58556, 115), (10248, 115))

In [None]:
# Model architecture adapted from: https://colab.research.google.com/drive/1mnz-P30CLxrxQ0yyqpcLwVJgi7e59shi?usp=sharing

from keras.models import Model, Input, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

# Network dimensions derived from the data prepared above.
in_dim = len(vocab)         # number of distinct token ids (including 'ENDPAD')
in_len = max_len            # padded sequence length
n_tags = len(unique_tags)   # number of output classes (including 'ENDPAD')


model = Sequential()

# Token id -> 256-dimensional embedding vector.
model.add(Embedding(input_dim=in_dim, output_dim=256, input_length=in_len))

model.add(Dropout(0.3))
# Bidirectional LSTM; forward and backward outputs are concatenated per timestep.
model.add(Bidirectional(LSTM(units=256, return_sequences=True), merge_mode='concat'))

model.add(Dropout(0.3))
# Second, unidirectional LSTM that still emits one vector per timestep.
model.add(LSTM(units=128, return_sequences=True))

# Per-timestep softmax over all tags.
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

# Optimiser. `learning_rate` is the supported argument name; the older `lr`
# alias is deprecated in TF2 Keras optimizers and rejected by newer Keras releases.
adam = Adam(learning_rate=0.005)

# Integer tag ids are used as targets, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])


model.summary()

### Set class weights

In [None]:
# Per-tag loss weights: 100 for every tag by default ...
class_weights = dict.fromkeys(range(n_tags), 100)
# ... except tag index 0, which gets weight 1 ...
class_weights[0] = 1
# ... and the last index (the appended 'ENDPAD' tag), which gets weight 0.
class_weights[n_tags - 1] = 0



In [None]:
# Train for 30 epochs with batches of 64, evaluating on the validation split after each epoch.
# NOTE(review): y_train holds one integer tag per timestep (2-D array). Confirm that the installed
# Keras version applies `class_weight` per timestep for such targets rather than treating the
# second axis as one-hot classes - TODO verify.
history = model.fit(X_train, y_train, batch_size=64, epochs=30, validation_data=(X_val, y_val), class_weight=class_weights)

In [35]:
from seqeval.metrics import f1_score as seq_f1
from seqeval.metrics import precision_score, recall_score, classification_report


def get_metrics(preds,labels):
    """Print macro, support-weighted and binary (ABV vs. word) seqeval scores.

    Positions whose gold label is the padding tag (``tag2idx['ENDPAD']``) are
    dropped from both the predictions and the labels before scoring.

    Parameters
    ----------
    preds : iterable of sequences of int
        Predicted tag indices, one sequence per sentence. Values are used as
        positions in ``tag2idx``'s key order.
        NOTE(review): this presumes class ids (e.g. an argmax over the model
        output), not probability vectors - confirm at the call site.
    labels : iterable of sequences of int
        Gold tag indices aligned with ``preds``.

    Returns
    -------
    None
        The three result lines are printed.
    """
    # Look the padding index up instead of hard-coding it, so the function
    # stays correct when the number of tags changes.
    pad_idx = tag2idx['ENDPAD']
    # Index -> tag name, built once rather than once per token.
    idx2tag = list(tag2idx.keys())

    # (prediction, label) pairs per sentence with padded positions removed.
    sentences = [
        [(p, l) for (p, l) in zip(prediction, label) if l != pad_idx]
        for prediction, label in zip(preds, labels)
    ]

    # --- 1. Macro scores over the full tag set -------------------------------
    true_predictions = [[idx2tag[p] for (p, _) in pairs] for pairs in sentences]
    true_labels = [[idx2tag[l] for (_, l) in pairs] for pairs in sentences]

    f1_actual = np.round(seq_f1(true_labels, true_predictions, average='macro', scheme='token' ) * 100, 2 )
    pre_actual = np.round(precision_score(true_labels, true_predictions, average='macro', scheme='token' ) * 100, 2 )
    rec_actual = np.round(recall_score(true_labels, true_predictions, average='macro', scheme='token' ) * 100, 2 )

    print('Macro Performance (F1, Precision, Recall):\t', f1_actual, pre_actual, rec_actual)

    # --- 2. Support-weighted scores, leaving out aggregate rows and 'A_word' --
    class_report = classification_report(true_labels, true_predictions, output_dict=True )

    f1s = []
    precs = []
    recs = []
    weights = []

    for lab in class_report:
        if lab not in ['micro avg', 'macro avg','weighted avg', 'A_word']:
            f1s.append(class_report[lab]['f1-score'])
            precs.append(class_report[lab]['precision'])
            recs.append(class_report[lab]['recall'])
            weights.append(class_report[lab]['support'])

    print('Weighted Performance (F1, Precision, Recall):\t', (np.average(f1s, weights=weights), np.average(precs, weights=weights), np.average(recs, weights=weights)))

    # --- 3. Binary view: tag index 0 counts as 'word', any other as 'ABV' -----
    # Local names differ from the `preds` parameter so it is no longer shadowed.
    binary_predictions = [
        ['ABV' if p > 0 else 'word' for (p, _) in pairs] for pairs in sentences
    ]
    binary_labels = [
        ['ABV' if l > 0 else 'word' for (_, l) in pairs] for pairs in sentences
    ]

    f1_actual = np.round(seq_f1(binary_labels, binary_predictions, average=None, scheme='token' ) * 100, 2 )
    pre_actual = np.round(precision_score(binary_labels, binary_predictions, average=None, scheme='token' ) * 100, 2 )
    rec_actual = np.round(recall_score(binary_labels, binary_predictions, average=None, scheme='token' ) * 100, 2 )

    print('ABV Identification Performance (F1, Precision, Recall):\t', f1_actual, pre_actual, rec_actual)
