In [1]:
import numpy as np
import pandas as pd
import json
#import missingno as msno

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

from collections import Counter

In [2]:
# The trained model was saved to 'final_model.h5' beforehand, so this notebook
# only loads it from that file rather than training it again.
model = keras.models.load_model('final_model.h5')

2022-04-05 16:55:33.867412: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Masked test split. Rows are tab-separated with no header; downstream code
# reads column 0 as the token index, column 1 as the token and column 2 as
# the label.
# NOTE(review): comment='#' is meant to drop the '# ...' sentence header lines,
# but pandas also cuts a data row at any later '#' — confirm no token contains one.
test = pd.read_csv(
    'Data/seq_labeling/opener_en-test-masked.conll',
    sep='\t',
    header=None,
    encoding='utf-8',
    comment='#',
)

In [4]:
# Development (validation) split. Rows are tab-separated with no header;
# downstream code reads column 0 as the token index, column 1 as the token
# and column 2 as the label.
# NOTE(review): comment='#' is meant to drop the '# ...' sentence header lines,
# but pandas also cuts a data row at any later '#' — confirm no token contains one.
val = pd.read_csv(
    'Data/seq_labeling/opener_en-dev.conll',
    sep='\t',
    header=None,
    encoding='utf-8',
    comment='#',
)

In [5]:
# Group 3's hard sentences, stored in the same headerless tab-separated layout
# as the other splits; lines starting with '#' are not parsed as rows.
hard = pd.read_csv(
    'Group3_hardsentences_converted.conll',
    sep='\t',
    header=None,
    encoding='utf-8',
    comment='#',
)

In [6]:
# Training split, read with the same options as the other splits.
train = pd.read_csv(
    'Data/seq_labeling/opener_en-train.conll',
    sep='\t',
    header=None,
    encoding='utf-8',
    comment='#',
)

# The vocabulary is learned from the training tokens (column 1) only; every
# split is later encoded against this one fitted vectorizer.
# NOTE(review): CountVectorizer defaults lowercase the text and ignore
# single-character tokens and punctuation — confirm that is the intended
# token representation.
count_vec = CountVectorizer()
count_vec.fit(train[1])

CountVectorizer()

In [7]:
def make_list_of_lists(data):
    """Group a token-per-row frame into per-sentence token and label lists.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column 0 holds the token's index within its sentence
        (a value of 1 starts a new sentence), column 1 the token and
        column 2 the label.

    Returns
    -------
    X : list of list
        Tokens, one inner list per sentence, in file order.
    Y : list of list
        Labels aligned element-for-element with ``X``.

    Notes
    -----
    An empty frame yields ``([], [])``. If the very first row does not carry
    index 1 it still opens the first sentence instead of raising.
    """
    X, Y = [], []
    tokens, labels = None, None
    # Iterating the three columns in lockstep avoids building a Series per
    # row, which is what DataFrame.iterrows() does.
    for index, token, label in zip(data[0], data[1], data[2]):
        if index == 1 or tokens is None:
            # Register the new sentence right away; the lists are filled in
            # place, so no flush is needed after the loop.
            tokens, labels = [], []
            X.append(tokens)
            Y.append(labels)
        tokens.append(token)
        labels.append(label)
    return X, Y
        

In [8]:
def encode_list_of_lists(X, max_len):
    """Encode sentences as padded boolean bag-of-words rows.

    Each token is passed through the module-level ``count_vec`` and turned
    into a boolean vector over its vocabulary. Every sentence is truncated or
    padded to exactly ``max_len`` rows; padding rows are all ``False``.

    Parameters
    ----------
    X : list of list of str
        Sentences as lists of tokens.
    max_len : int
        Number of token rows produced per sentence.

    Returns
    -------
    numpy.ndarray
        For non-empty ``X``, a boolean array of shape
        ``(len(X), max_len, vocabulary size)``.
    """
    vocab_size = len(count_vec.vocabulary_)
    encoded = []
    for sentence in X:
        # Start from all-False rows so the padding needs no vectorizer call.
        rows = np.zeros((max_len, vocab_size), dtype=bool)
        tokens = list(sentence[:max_len])
        if tokens:
            # One transform per sentence instead of one per token. The `== 1`
            # test is kept as-is: an entry is True only when the vectorizer
            # counts that vocabulary term exactly once in the token.
            rows[:len(tokens)] = count_vec.transform(tokens).toarray() == 1
        encoded.append(rows)
    return np.array(encoded)

In [9]:
# Label string -> integer class id. The id of each label is its position in
# the list below.
lookup_table = {
    label: index
    for index, label in enumerate([
        'O',
        'B-Negative',
        'B-Positive',
        'I-Negative',
        'I-Positive',
        'B-Negative|I-Negative',
        'B-Positive|I-Positive',
        'I-Positive|B-Positive',
        'I-Positive|I-Positive',
    ])
}

def encode_labels(Y, lookup_labels, max_len, num_classes=9):
    """One-hot encode per-sentence label lists, padded to a fixed length.

    Parameters
    ----------
    Y : list of list of str
        Label strings, one inner list per sentence.
    lookup_labels : dict
        Maps each label string to its integer class id.
    max_len : int
        Number of rows produced per sentence; longer sentences are truncated.
    num_classes : int, optional
        Width of each one-hot row. Defaults to 9, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        For non-empty ``Y``, a float array of shape
        ``(len(Y), max_len, num_classes)``.

    Raises
    ------
    KeyError
        If a label is missing from ``lookup_labels``.

    Notes
    -----
    Padding rows are all zeros (not a one-hot 'O' row); ``np.argmax`` still
    maps them to class id 0.
    """
    encoded = []
    for labels in Y:
        one_hot = np.zeros((max_len, num_classes))
        for position, label in enumerate(labels[:max_len]):
            one_hot[position, lookup_labels[label]] = 1
        encoded.append(one_hot)
    return np.array(encoded)

In [10]:
# Integer class id -> label string. Each label's id is its position in the
# list below.
lookup_reverse = dict(enumerate([
    'O',
    'B-Negative',
    'B-Positive',
    'I-Negative',
    'I-Positive',
    'B-Negative|I-Negative',
    'B-Positive|I-Positive',
    'I-Positive|B-Positive',
    'I-Positive|I-Positive',
]))

def dump_pred(pred, data_raw, input_path, output_path, lookup):
    """Write predictions to a CoNLL-style file, reusing the input's '#' lines.

    Every line of ``input_path`` that starts with '#' is collected in order.
    The i-th collected line is written ahead of the i-th sentence, followed by
    one ``index<TAB>token<TAB>label`` row per predicted token and a blank line.

    Parameters
    ----------
    pred : sequence of sequence of int
        Predicted class ids per sentence.
    data_raw : sequence of sequence of str
        Tokens per sentence, aligned with ``pred``.
    input_path : str
        File whose '#' lines are copied into the output.
    output_path : str
        Destination file; overwritten if it exists.
    lookup : dict
        Maps a class id to the label string to write.

    Notes
    -----
    The number of sentences written equals the number of '#' lines found, so
    the input is assumed to carry exactly one '#' line per sentence. Both
    files are handled as UTF-8 so the copied lines round-trip regardless of
    the platform's default encoding.
    """
    with open(input_path, encoding='UTF-8') as source:
        comments = [line for line in source if line.startswith('#')]

    with open(output_path, 'w', encoding='UTF-8') as out:
        for sentence_idx, comment in enumerate(comments):
            out.write(comment)
            for word_idx, label_id in enumerate(pred[sentence_idx]):
                row = (str(word_idx + 1), data_raw[sentence_idx][word_idx], lookup[label_id])
                out.write('\t'.join(row) + '\n')
            out.write('\n')

In [11]:
def make_predictions(model, X, sentence_lengths):
    """Predict one class id per token and strip the padded positions.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns per-token class scores
        (indexed as sentence, token, class).
    X : array-like
        Encoded, padded sentences passed straight to ``model.predict``.
    sentence_lengths : sequence of int
        Number of tokens to keep for each sentence.

    Returns
    -------
    list of list
        Highest-scoring class id per kept token, one inner list per sentence.
    """
    scores = model.predict(X)
    # Take the argmax over the class axis for every token in one call.
    class_ids = np.argmax(scores, axis=-1)
    trimmed = []
    for sentence_ids, length in zip(class_ids, sentence_lengths):
        trimmed.append(list(sentence_ids[:length]))
    return trimmed

In [12]:
# `model` is already loaded from 'final_model.h5' near the top of the notebook,
# so it is not loaded from disk a second time here.

In [13]:
# Preprocessing: regroup the token-per-row frames into per-sentence lists.
X_train_unencoded, Y_train_unencoded = make_list_of_lists(train)
X_test_unencoded, Y_test_unencoded = make_list_of_lists(test)
X_hard_unencoded, Y_hard_unencoded = make_list_of_lists(hard)


# The longest training sentence is treated as the universal sentence length:
# every split is padded to it, and longer sentences are cut by the encoders.
max_len_train = max(len(sentence) for sentence in X_train_unencoded)

X_test = encode_list_of_lists(X_test_unencoded, max_len_train)
Y_test = encode_labels(Y_test_unencoded, lookup_table, max_len_train)

X_hard = encode_list_of_lists(X_hard_unencoded, max_len_train)
Y_hard = encode_labels(Y_hard_unencoded, lookup_table, max_len_train)

# Predictions, trimmed back to each sentence's own length.
test_lengths = [len(sentence) for sentence in Y_test_unencoded]
test_pred = make_predictions(model, X_test, test_lengths)

hard_lengths = [len(sentence) for sentence in Y_hard_unencoded]
hard_pred = make_predictions(model, X_hard, hard_lengths)
# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator on every addition, which is quadratic in the number of tokens.
hard_pred_flattened = [label for sentence in hard_pred for label in sentence]

In [14]:
# Validation split: regroup into sentences, then encode labels and tokens,
# both padded to the training sentence length.
X_val_unencoded, Y_val_unencoded = make_list_of_lists(val)

Y_val = encode_labels(Y_val_unencoded, lookup_table, max_len_train)
X_val = encode_list_of_lists(X_val_unencoded, max_len_train)

In [15]:
# Validation predictions, trimmed to each sentence's own length.

val_lengths = [len(sentence) for sentence in Y_val_unencoded]
val_pred = make_predictions(model, X_val, val_lengths)

# Flatten with a comprehension: sum(list_of_lists, []) copies the growing
# accumulator on every addition, which is quadratic in the number of tokens.
val_pred_flattened = [label for sentence in val_pred for label in sentence]

In [16]:
# Ground-truth class ids for the validation split. Each sentence is cut to its
# own length before the argmax, so padded rows are never decoded.

val_gt = [
    [np.argmax(word) for word in sentence[:length]]
    for sentence, length in zip(Y_val, val_lengths)
]
# Comprehension flatten instead of sum(list_of_lists, []), which is quadratic.
val_gt_flattened = [label for sentence in val_gt for label in sentence]

# Same decoding for the hard sentences.
hard_gt = [
    [np.argmax(word) for word in sentence[:length]]
    for sentence, length in zip(Y_hard, hard_lengths)
]
hard_gt_flattened = [label for sentence in hard_gt for label in sentence]

In [17]:
# NOTE(review): the disabled call below pairs the *test* predictions and tokens
# with the hard-sentences file as the source of '#' lines — confirm the
# intended inputs before re-enabling it.
#dump_pred(test_pred, X_test_unencoded, 'Group3_hardsentences_converted.conll', 'hardsentences_predict.conll', lookup_reverse)

In [18]:
# Score on the hard-sentences set (not the validation split); the class ids in
# the report are the integer ids from lookup_table.

print(classification_report(hard_gt_flattened, hard_pred_flattened))

              precision    recall  f1-score   support

           0       0.61      0.76      0.68       638
           1       0.08      0.07      0.08        56
           2       0.25      0.17      0.20        94
           3       0.25      0.17      0.20       143
           4       0.23      0.14      0.18       202

    accuracy                           0.49      1133
   macro avg       0.28      0.26      0.27      1133
weighted avg       0.44      0.49      0.46      1133



# check the format

In [19]:
inputPath = 'Group3_hardsentences_converted.conll'

numSents = 0

# Validate the file line by line: '#' lines are skipped, an empty line counts
# as a sentence boundary, and every other line must be
# index<TAB>token<TAB>label with a known label. The first violation is printed
# and the check stops.
# The file is opened in a `with` block so the handle is closed even when the
# check aborts, and decoded as UTF-8 like the pandas reads of the same file.
with open(inputPath, encoding='utf-8') as conll_file:
    for lineIdx, line in enumerate(conll_file):
        if line[0] == '#':
            continue
        if len(line) < 2:
            numSents += 1
            continue
        tok = line.strip().split('\t')
        # SystemExit is raised directly: the `exit` helper is meant for the
        # interactive prompt, not for program code.
        if len(tok) < 3:
            print(str(lineIdx) + ': Not all columns defined: ' + line)
            raise SystemExit(1)
        if not tok[0].isdigit():
            print(str(lineIdx) + ': Invalid word index found: ' + line)
            raise SystemExit(1)
        if len(tok[1].strip()) == 0:
            print(str(lineIdx) + ': Empty token: ' + line)
            raise SystemExit(1)
        if tok[2] not in ['B-Positive', 'I-Positive', 'O', 'B-Negative', 'I-Negative']:
            print(str(lineIdx) + ': Label is invalid: ' + line)
            raise SystemExit(1)

# The +1 allows for a final sentence that is not followed by an empty line.
if numSents+1 < 50:
    print('Too little instances(' + str(numSents) + '), please generate more')
if numSents > 1000:
    print('Too many instances(' + str(numSents) + '), please generate fewer')