In [1]:
import numpy as np
import pandas as pd
import json
#import missingno as msno

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from collections import defaultdict
from collections import Counter
import pp_functions as pp
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pp_functions as pp
#nltk.download('wordnet')
#nltk.download('omw-1.4')
#nltk.download('averaged_perceptron_tagger')

In [2]:
# Both CoNLL splits are read with the same parser settings: tab-separated,
# no header row, UTF-8, and everything from a '#' to the end of the line is
# treated as a comment by pandas.
conll_read_opts = dict(sep='\t', header=None, encoding='utf-8', comment='#')
train_raw = pd.read_csv('../Data/seq_labeling/opener_en-train.conll', **conll_read_opts)
val_raw = pd.read_csv('../Data/seq_labeling/opener_en-dev.conll', **conll_read_opts)

In [3]:
# Wrap each raw split in the project's Preprocessor (training split first,
# then validation).
train, val = (pp.Preprocessor(raw_split) for raw_split in (train_raw, val_raw))

## Baseline model

In [4]:
# Pre-trained embedding file (the name suggests 100-d GloVe 6B vectors) used
# to fit the word-vector lookup on the training preprocessor.
glove_path = '../glove.6B.100d.txt'
train.fit_word2vec(glove_path)

In [5]:
# Features and labels for both splits are built against the fitted `train`
# preprocessor, which is passed as the reference object in every call.
X_train_w2v, Y_train = train.make_word2vec(train), train.make_labels(train)
X_val_w2v, Y_val = val.make_word2vec(train), val.make_labels(train)

In [6]:
# Hyperparameters for the baseline experiment.
epochs, no_neurons, lr = (
    10,    # training epochs
    50,    # units per LSTM layer
    0.01,  # Adam learning rate
)

In [7]:
# Baseline experiment: a two-layer bidirectional LSTM tagger fed only with the
# word vectors in X_train_w2v. A fresh model is built, trained and scored on
# the validation split five times; the macro-averaged F1 of every run is
# collected in `base_avgs` so the runs can be averaged afterwards.
macro_avg_f1s = []
for _ in range(5):
    # Variable-length sequences; the feature size is read from the data.
    w2v_input = keras.Input(shape=(None, X_train_w2v.shape[2]))
    # Timesteps whose features are all 0 are masked
    # (presumably padding -- confirm in pp_functions).
    w2v_mask = layers.Masking(mask_value=0)(w2v_input)

    # merge_mode='sum' adds the forward and backward outputs, so each layer
    # emits `no_neurons` features per timestep.
    lstm1 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(w2v_mask)
    lstm2 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(lstm1)

    # Per-timestep distribution over 9 classes. 'softmax' (lowercase) is the
    # documented activation identifier; the capitalised 'Softmax' names the
    # layer class and is not accepted as an activation by every Keras version.
    outputs = layers.Dense(9, activation='softmax')(lstm2)

    model = keras.Model(inputs=w2v_input, outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    model.fit(X_train_w2v, Y_train, epochs=epochs)
    gt, pred = val.test_model(X_val_w2v, Y_val, model)
    macro_avg_f1s.append(f1_score(gt, pred, average='macro'))
base_avgs = macro_avg_f1s

2022-05-26 23:10:03.486007: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [23]:
# Mean macro-averaged F1 over the baseline runs collected in `base_avgs`.
print(np.mean(base_avgs))

0.5407442405194707


## Lemmatisation + Baseline

In [None]:
# Fetch the NLTK 'averaged_perceptron_tagger' resource.
# NOTE(review): presumably needed by the POS-aware helpers in pp_functions -- confirm.
nltk.download('averaged_perceptron_tagger')

In [9]:
# Lemma-group features. The grouping is fitted on the training sentences only;
# both splits are then transformed against that fitted object and padded to
# the training split's max_len.
train_obj = pp.Lemmatize_groups(train.X_unencoded)
train_obj.fit_lemmatize()
X_train_lem = pp.padding(train_obj.transform(train_obj), train.max_len)

# The validation object is never fitted; it reuses the fit held by train_obj.
val_obj = pp.Lemmatize_groups(val.X_unencoded)
X_val_lem = pp.padding(val_obj.transform(train_obj), train.max_len)

lemmatized done
fit complete
lemmatized done


In [10]:
# Hyperparameters for the lemmatisation experiment.
epochs, no_neurons, lr = (
    10,    # training epochs
    50,    # units per LSTM layer
    0.01,  # Adam learning rate
)

In [11]:
# Lemmatisation experiment: the baseline BiLSTM tagger with an additional
# learned embedding of the lemma-group ids, concatenated to the word vectors.
# Five independent runs; each run's macro-F1 on the validation split is kept
# in `lem_avgs`.

# Embedding table size: the largest value in group_dict plus one. Computed
# once with max() because it does not change between runs (the original
# sorted the whole value list on every iteration to take its first element).
lem_vocab_size = max(train_obj.group_dict.values()) + 1

macro_avg_f1s = []
for _ in range(5):
    # Word-vector branch; timesteps whose features are all 0 are masked.
    w2v_input = keras.Input(shape=(None, X_train_w2v.shape[2]))
    w2v_mask = layers.Masking(mask_value=0)(w2v_input)

    # Lemma-group branch: integer ids -> 10-d embeddings; mask_zero=True
    # treats id 0 as padding.
    lem_input = keras.Input(shape=(None,))
    embed_layer = layers.Embedding(
        input_dim=lem_vocab_size, output_dim=10, mask_zero=True
    )(lem_input)

    concat = layers.Concatenate()([w2v_mask, embed_layer])

    # merge_mode='sum' adds the forward and backward outputs.
    lstm1 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(concat)
    lstm2 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(lstm1)

    # 'softmax' (lowercase) is the documented activation identifier; the
    # capitalised 'Softmax' names the layer class and is not accepted as an
    # activation by every Keras version.
    outputs = layers.Dense(9, activation='softmax')(lstm2)

    model = keras.Model(inputs=[w2v_input, lem_input], outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    model.fit([X_train_w2v, X_train_lem], Y_train, epochs=epochs)
    gt, pred = val.test_model([X_val_w2v, X_val_lem], Y_val, model)
    macro_avg_f1s.append(f1_score(gt, pred, average='macro'))
lem_avgs = macro_avg_f1s

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Mean macro-averaged F1 over the lemmatisation runs collected in `lem_avgs`.
print(np.mean(lem_avgs))

0.549001481778295


## Affixes + Baseline

In [None]:
# Fit the integer affix encoding on the training preprocessor using n-grams of
# length 3.
# NOTE(review): min_occurences=40 presumably drops affixes seen fewer than 40
# times -- confirm against pp_functions.
train.fit_affix_int(ngram_lengths=[3], min_occurences=40)

In [None]:
# Affix features for both splits, each built against the fitted `train`
# preprocessor (training split first, then validation).
X_train_af, X_val_af = (split.make_affix_int(train) for split in (train, val))

In [None]:
# Hyperparameters for the affix experiment.
epochs, no_neurons, lr = (
    10,    # training epochs
    50,    # units per LSTM layer
    0.01,  # Adam learning rate
)

In [None]:
# Affix experiment: the baseline BiLSTM tagger with additional learned affix
# embeddings (built by pp.make_embedding_layers) concatenated to the word
# vectors. Each run's macro-F1 on the validation split is kept in `af_avgs`.
# NOTE(review): this section performs a single run, whereas the other
# experiments in this notebook average five -- confirm whether that is intended.
macro_avg_f1s = []
for _ in range(1):
    # Word-vector branch; timesteps whose features are all 0 are masked.
    w2v_input = keras.Input(shape=(None, X_train_w2v.shape[2]))
    w2v_mask = layers.Masking(mask_value=0)(w2v_input)

    # Read below via its 'inputs' and 'embeddings' entries, both of which are
    # combined with lists.
    embed_layers = pp.make_embedding_layers(train, output_dim=10)

    concat = layers.Concatenate()([w2v_mask] + embed_layers['embeddings'])

    # merge_mode='sum' adds the forward and backward outputs.
    lstm1 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(concat)
    lstm2 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(lstm1)

    # 'softmax' (lowercase) is the documented activation identifier; the
    # capitalised 'Softmax' names the layer class and is not accepted as an
    # activation by every Keras version.
    outputs = layers.Dense(9, activation='softmax')(lstm2)

    model = keras.Model(inputs=[w2v_input] + embed_layers['inputs'], outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    # Train for the configured number of epochs; the previous hard-coded
    # epochs=1 ignored the `epochs` setting and made this run incomparable
    # with the other experiments.
    model.fit([X_train_w2v] + list(X_train_af), Y_train, epochs=epochs)
    gt, pred = val.test_model([X_val_w2v] + list(X_val_af), Y_val, model)
    macro_avg_f1s.append(f1_score(gt, pred, average='macro'))
af_avgs = macro_avg_f1s

In [None]:
# Show the per-run macro-averaged F1 scores of the affix experiment
# (the raw list, not its mean).
print(af_avgs)

## POS-tags + Baseline

In [13]:
# POS-tag features for both splits, requested in the 'numbers' representation
# and sized with the training split's max_len (training first, then validation).
X_train_pos, X_val_pos = (
    np.array(pp.pos_tag(split.X_unencoded, train.max_len, 'numbers'))
    for split in (train, val)
)

In [14]:
# Hyperparameters for the POS-tag experiment.
epochs, no_neurons, lr = (
    10,    # training epochs
    50,    # units per LSTM layer
    0.01,  # Adam learning rate
)

In [17]:
# POS-tag experiment: the baseline BiLSTM tagger with an additional learned
# embedding of the integer POS-tag ids, concatenated to the word vectors.
# Five independent runs; each run's macro-F1 on the validation split is kept
# in `pos_avgs`.
macro_avg_f1s = []
for _ in range(5):
    # Word-vector branch; timesteps whose features are all 0 are masked.
    w2v_input = keras.Input(shape=(None, X_train_w2v.shape[2]))
    w2v_mask = layers.Masking(mask_value=0)(w2v_input)

    # POS branch: integer ids -> 10-d embeddings; mask_zero=True treats id 0
    # as padding.
    # NOTE(review): 45 is presumably the number of distinct tag ids produced
    # by pp.pos_tag(..., 'numbers') -- confirm.
    pos_input = keras.Input(shape=(None,))
    embed_layer = layers.Embedding(input_dim=45 + 1, output_dim=10, mask_zero=True)(pos_input)

    concat = layers.Concatenate()([w2v_mask, embed_layer])

    # merge_mode='sum' adds the forward and backward outputs.
    lstm1 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(concat)
    lstm2 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(lstm1)

    # 'softmax' (lowercase) is the documented activation identifier; the
    # capitalised 'Softmax' names the layer class and is not accepted as an
    # activation by every Keras version.
    outputs = layers.Dense(9, activation='softmax')(lstm2)

    model = keras.Model(inputs=[w2v_input, pos_input], outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    model.fit([X_train_w2v, X_train_pos], Y_train, epochs=epochs)
    gt, pred = val.test_model([X_val_w2v, X_val_pos], Y_val, model)
    macro_avg_f1s.append(f1_score(gt, pred, average='macro'))
pos_avgs = macro_avg_f1s

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# Mean macro-averaged F1 over the POS-tag runs collected in `pos_avgs`.
print(np.mean(pos_avgs))

0.5425449612357641


# TODO: if the TF-IDF section below is left out, also remove it from pp_functions

## TF-IDF + Baseline

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One whitespace-joined string per training sentence; every element is
# converted with str() before joining.
corpus_train = [' '.join(map(str, sentence)) for sentence in train.X_unencoded]
corpus_val = []  # not populated in this cell

# Fit the TF-IDF vocabulary and weights on the training corpus only.
tfidf_obj = TfidfVectorizer()
tfidf_dat = tfidf_obj.fit_transform(corpus_train)

In [20]:
# TF-IDF features for both splits. Each call receives the vectorizer and matrix
# fitted on the training corpus and the training split's max_len
# (training first, then validation).
X_train_tf, X_val_tf = (
    pp.tfidf_repr(split.X_unencoded, train.max_len, tfidf_obj, tfidf_dat)
    for split in (train, val)
)

In [21]:
# TF-IDF experiment: the baseline BiLSTM tagger with the TF-IDF features
# concatenated to the word vectors. Five independent runs; each run's macro-F1
# on the validation split is kept in `tfidf_avgs`.
# `epochs`, `no_neurons` and `lr` are taken from an earlier hyperparameter cell.
macro_avg_f1s = []
for _ in range(5):
    # Word-vector branch; timesteps whose features are all 0 are masked.
    w2v_input = keras.Input(shape=(None, X_train_w2v.shape[2]))
    w2v_mask = layers.Masking(mask_value=0)(w2v_input)

    # TF-IDF branch, masked the same way; feature size is read from the data.
    tfidf_input = keras.Input(shape=(None, X_train_tf.shape[2]))
    tfidf_mask = layers.Masking(mask_value=0)(tfidf_input)

    concat = layers.Concatenate()([w2v_mask, tfidf_mask])

    # merge_mode='sum' adds the forward and backward outputs.
    lstm1 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(concat)
    lstm2 = layers.Bidirectional(
        layers.LSTM(no_neurons, return_sequences=True, activation='tanh'),
        merge_mode='sum',
    )(lstm1)

    # 'softmax' (lowercase) is the documented activation identifier; the
    # capitalised 'Softmax' names the layer class and is not accepted as an
    # activation by every Keras version.
    outputs = layers.Dense(9, activation='softmax')(lstm2)

    model = keras.Model(inputs=[w2v_input, tfidf_input], outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=["accuracy"],
    )

    model.fit([X_train_w2v, X_train_tf], Y_train, epochs=epochs)
    gt, pred = val.test_model([X_val_w2v, X_val_tf], Y_val, model)
    macro_avg_f1s.append(f1_score(gt, pred, average='macro'))
tfidf_avgs = macro_avg_f1s

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# Mean macro-averaged F1 over the TF-IDF runs collected in `tfidf_avgs`.
print(np.mean(tfidf_avgs))

0.38712206300070506


## 