In [1]:
#General imports
import copy
import os  
import pandas as pd  #  dataframe management
import matplotlib.pyplot as plt
import numpy as np  #  data manipulation
import re
import sys 
from tqdm import tqdm
from typing import Callable, List, Dict, Tuple, Set

#tensorflow imports
from tensorflow.keras.layers import Bidirectional,  Dense, Dropout, Embedding, GRU, Input, LSTM, TimeDistributed
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.random import set_seed


#fixed seeds to get reproducible results
np.random.seed(42)
set_seed(42)

#Bulding the dataframe

## Dataset download and extraction

In [2]:
import urllib.request  #  download files
import zipfile  #  unzip files

DATASET_NAME = "dataset.zip"
DATASET_FOLDERNAME = "Dataset"
DATASET_SUBFOLDER = "dependency_treebank/"
SPLIT_DISTRIBUTION = [100, 150, 199]

working_folder = os.getcwd()

print("Current working directory: " + str(working_folder))

dataset_folder = os.path.join(os.getcwd(), DATASET_FOLDERNAME)

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

dataset_path = os.path.join(dataset_folder, "dataset.zip")

if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)
    print("Successful download")

with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_folder)
    print("Successful extraction")

#update folder to the extracted one
dataset_folder = os.path.join(dataset_folder, DATASET_SUBFOLDER)

Current working directory: /content
Successful download
Successful extraction


In [3]:
print(dataset_folder)

/content/Dataset/dependency_treebank/


## Dataframe construction

In [4]:
def encode_dataset(dataset_folder: str, 
                   split_dist: list(), ) -> Dict[str,pd.DataFrame]:
    
    df_dict = {"train": pd.DataFrame(columns=['sentence', 'labels']),
                "val": pd.DataFrame(columns=['sentence', 'labels']),
                "test":pd.DataFrame(columns=['sentence', 'labels'])}
    split = ""

    for filename in sorted(os.listdir(dataset_folder)):
        file_path = os.path.join(dataset_folder, filename)
        try:
            if os.path.isfile(file_path):
                with open(file_path, mode='r', encoding='utf-8') as text_file:
                    # read it and extract 
                    document_number = filename.split("_")[1].split(".")[0]
                    if int(document_number) <= split_dist[0]:
                        split = "train"
                    elif split_dist[0] < int(document_number) <= split_dist[1]:
                        split = "val"
                    else:
                        split = "test"

                    df_file = pd.read_table(
                        file_path, 
                        delimiter='\t', 
                        names=['word', 'label'], 
                        usecols=[0,1],
                        skip_blank_lines=False)
                    
                    #splitting file content in sentences
                    idx = list(df_file.loc[df_file.isnull()['word']].index)
                    idx.append(len(df_file))
                    prev = 0
                    for sep in idx:
                        df_sentence = pd.DataFrame({
                            'sentence': [df_file['word'][prev:sep].to_list()], 
                            'labels': [df_file['label'][prev:sep].to_list()]})
                        df_dict[split] = pd.concat([df_dict[split], df_sentence], ignore_index=True)
                        prev = sep + 1
                    
        except Exception as e:
            print('Failed to process %s. Reason: %s' % (file_path, e))
            sys.exit(0)

    return df_dict

In [5]:
df_dict = encode_dataset(dataset_folder, SPLIT_DISTRIBUTION)

In [6]:
df_dict["train"].head()

Unnamed: 0,sentence,labels
0,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ..."
1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ..."
2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP..."
3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS..."
4,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V..."


In [7]:
df_dict["val"].head()

Unnamed: 0,sentence,labels
0,"[A, House-Senate, conference, approved, major,...","[DT, NNP, NN, VBD, JJ, NNS, IN, DT, NN, IN, JJ..."
1,"[For, the, Agency, for, International, Develop...","[IN, DT, NNP, IN, NNP, NNP, ,, NNS, VBD, $, CD..."
2,"[The, conference, approved, at, least, $, 55, ...","[DT, NN, VBD, IN, JJS, $, CD, CD, IN, JJ, NN, ..."
3,"[The, agreement, on, Poland, contrasts, with, ...","[DT, NN, IN, NNP, VBZ, IN, DT, JJ, NNS, VBG, I..."
4,"[These, fiscal, pressures, are, also, a, facto...","[DT, JJ, NNS, VBP, RB, DT, NN, IN, VBG, DT, NN..."


In [8]:
df_dict["test"].head()

Unnamed: 0,sentence,labels
0,"[Intelogic, Trace, Inc., ,, San, Antonio, ,, T...","[NNP, NNP, NNP, ,, NNP, NNP, ,, NNP, ,, VBD, P..."
1,"[The, move, boosts, Intelogic, Chairman, Asher...","[DT, NN, VBZ, NNP, NNP, NNP, NNP, POS, NN, TO,..."
2,"[Mr., Ackerman, already, is, seeking, to, oust...","[NNP, NNP, RB, VBZ, VBG, TO, VB, NNP, NNP, IN,..."
3,"[The, action, followed, by, one, day, an, Inte...","[DT, NN, VBN, IN, CD, NN, DT, NNP, NN, IN, PRP..."
4,"[In, New, York, Stock, Exchange, composite, tr...","[IN, NNP, NNP, NNP, NNP, JJ, NN, NN, ,, NNP, N..."


In [9]:
x = {"train": df_dict['train']['sentence'],
     "val": df_dict['val']['sentence'],
     "test": df_dict['test']['sentence']}

y = {"train": df_dict['train']['labels'],
     "val": df_dict['val']['labels'],
     "test": df_dict['test']['labels']}

# Glove Embedding model, vocabulary and OOV detection

## Load Glove embedding

In [10]:
import gensim
import gensim.downloader as gloader

def load_GloVe_embedding(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [11]:
EMBEDDING_DIMENSION = 50
glove_emb_model = load_GloVe_embedding(EMBEDDING_DIMENSION)



##Creating initial vocabulary

In [12]:
from collections import OrderedDict

def build_vocabulary(sr: pd.Series) -> List[str]:
    """
    Given a dataset, builds the corresponding word vocabulary.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - vocabulary: set of unique terms that build up the vocabulary
    """
    vocabulary = []
    for sentence in tqdm(sr):
        for token in sentence:
            if token not in vocabulary:
                vocabulary.append(token)

    return vocabulary

In [13]:
vocabulary_dict = {}
for split in df_dict.keys():
    vocabulary_dict[split] = build_vocabulary(x[split])
    print()
    print(f'[Debug] {split} vocabulary size: {len(vocabulary_dict[split])}')

100%|██████████| 1963/1963 [00:01<00:00, 1888.71it/s]



[Debug] train vocabulary size: 8009


100%|██████████| 1299/1299 [00:00<00:00, 2837.07it/s]



[Debug] val vocabulary size: 5892


100%|██████████| 652/652 [00:00<00:00, 4436.77it/s]


[Debug] test vocabulary size: 3623





## OOV detection

In [14]:
def check_OOV_terms(vocabulary: List[str],
                    word_listing: List[str]):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    embedding_vocabulary = set(vocabulary)
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [15]:
OOV1 = check_OOV_terms(glove_emb_model.vocab.keys(), vocabulary_dict["train"])
OOV1_percentage = float(len(OOV1)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1)} ({OOV1_percentage:.2f}%)")

Total OOV terms: 2346 (29.29%)


A lot of words are OOV simply because they start with capital letter, so we will lower all the words and check again the OOV. Before this section we can insert a graph showing the OOV words

In [16]:
OOV1_lowercase = check_OOV_terms(glove_emb_model.vocab.keys(), [v.lower() for v in vocabulary_dict["train"]])
OOV1_lowercase_percentage = float(len(OOV1_lowercase)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1_lowercase)} ({OOV1_lowercase_percentage:.2f}%)")

Total OOV terms: 359 (4.48%)


In [17]:
print("\n".join(OOV1_lowercase))

red-flag
2645.90
lower-priority
one-upsmanship
pianist-comedian
amphobiles
436.01
preparatives
cray-3
higher-salaried
prize-fighter
1.457
bumkins
investor-relations
purhasing
we-japanese
samnick
30,841
flightiness
520-lawyer
non-encapsulating
sticker-shock
forest-product
bermuda-based
collective-bargaining
234.4
low-ability
automotive-parts
index-related
cotran
11,762
revenue-desperate
new-home
built-from-kit
236.79
ac-130u
ensrud
mouth-up
securities-based
wtd
solaia
savers\/investors
twindam
nih-appointed
more-efficient
landonne
floating-rate
18,444
one-country
trockenbeerenauslesen
lezovich
industrial-production
chafic
9,118
building-products
boorse
415.8
dust-up
pre-1917
year-ago
1\/4
red-blooded
market-share
macheski
autions
446.62
micronite
subindustry
index-arbitrage
test-coaching
anti-takeover
computer-driven
capital-gains
cash-rich
antitrust-law
year-earlier
127.03
one-yen
16.125
3\/4
money-market
rope-sight
makato
unenticing
vinken
143.93
500,004
ariail
sub-segments
chilver
so

# Preprocessing

Here we replace bracket value with their symbols: 
```
  -lrb- and -lcb-   -->  ( 
  -rrb- and -rcb-   -->  )
```

In addition, all the rational numbers will be replaced with the placeholder #number#, as long as the floating point numbers.
Note that rational numbers, instead of being like 3/4, are written as 3\/4. The cause is that symbol "/" is represented using "\/", as this happens also in other words that are notrational ones






In [32]:
def preprocessing(content_list: List[str]) -> List[str]:
    placeholder = "#number#"
    re_slashes = re.compile('\\\/')  #pattern \/ 
    re_rational = re.compile('\d+\/\d+')  #pattern rational number (e.g. 1/5)
    re_number = re.compile('[+-]?(\d*[.])\d+')  #pattern decimal number (e.g. 3.14)
    re_left_bracket = re.compile('(-lrb-)|(-lcb-)')  #pattern left bracket
    re_right_bracket = re.compile('(-rrb-)|(-rcb-)')  #pattern right bracket
    re_slashed_words = re.compile("(\w*)\/(\w*)")  #a slash separating words will be replaced with a dash, following the trend of the dataset, where composed words are in the form word-word

    content_list_preprocessed = [content.lower() for content in content_list]
    content_list_preprocessed = [re_slashes.sub("/", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_left_bracket.sub("(", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_right_bracket.sub(")", content) for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_rational, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_number, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [content.replace("/", "-") if re.match(re_slashed_words, content) else content for content in content_list_preprocessed]

    return content_list_preprocessed


Preprocessing the training dataset

In [33]:
x_train_preprocessed = x["train"].apply(preprocessing)

Building the new vocabulary after preprocessing

In [35]:
train_vocabulary_preprocessed = build_vocabulary(x_train_preprocessed)
print()
print(f'[Debug] train vocabulary size after preprocessing: {len(train_vocabulary_preprocessed)}')

OOV1_preprocessed = check_OOV_terms(glove_emb_model.vocab.keys(), train_vocabulary_preprocessed)
OOV1_preprocessed_percentage = float(len(OOV1_preprocessed)) * 100 / len(train_vocabulary_preprocessed)
print(f"Total OOV terms: {len(OOV1_preprocessed)} ({OOV1_preprocessed_percentage:.2f}%)")

100%|██████████| 1963/1963 [00:00<00:00, 2377.54it/s]



[Debug] train vocabulary size after preprocessing: 7214
Total OOV terms: 318 (4.41%)


It can be seen that the number of OOV words has plummetted with respect to the the non preprocessed data. Similarly to train data, we apply preprocessing to validation and test splits.

In [36]:
x_pre = {"train": x_train_preprocessed,
        "val": x["val"].apply(preprocessing),
        "test": x["test"].apply(preprocessing)}

# Vocabulary creation and mapping
Since we want to work with numerical data only, we will mapp words and pos (labels) to numbers. 

In [38]:
# Methods to create mapping

#adds oov words at the end of vocabulary
def extend_vocabulary(word_to_idx_original: Dict[str, int],
                      words_to_add) -> Tuple[Dict[str, int],Dict[int, str]]:
  """
    Given mapping between word and indeces, adds new words.

    :param word_to_idx_original: dictionary with key=word and value=index to which the word is mapped
    :return:
      - word_to_idx_extended: word_to_idx with new words
      - idx_to_word_extended: swapped version of word_to_idx_extended (keys and values are swapped)
  """
  word_to_idx_extended = copy.deepcopy(word_to_idx_original)  #deep copy is needed, otherwise python does not create a copy but only a reference to the already existing object, thus reflecting changes on both
  idx = len(word_to_idx_extended.keys())
  if idx == 0: 
    idx = 1  #position 0 is reserved

  for sentence in words_to_add:
      for token in sentence:
          if token not in word_to_idx_extended:
              word_to_idx_extended[token] = idx 
              idx += 1
  idx_to_word_extended = {v: k for k, v in word_to_idx_extended.items()}

  return word_to_idx_extended, idx_to_word_extended

def encode_into_numbers(sentences: List[str],
                        word_to_idx_mapping: Dict[str, int]) -> List[int]:
    """
    Return a list of sequences encoded into integers following the mapping of the vocabulary
    """
    encoded_data = [[word_to_idx_mapping[token] for token in sentence] for sentence in sentences]
 
    return encoded_data

def decode_into_words(encoded_sentences: List[str],
                        idx_to_word_mapping: Dict[int,str]) -> List[str]:
    """
    Return a list of sequences decoded back to words following the (reverse) mapping of the vocabulary
    """
    decoded_data = [[idx_to_word_mapping[index] for index in sentence] for sentence in encoded_sentences]
 
    return decoded_data


Encoding the words

In [39]:
#creating vocabulary mapping for the words in the data set
#Note that they are incremental, this means that the val vocabulary includes the rain one, and the test one inlcudes train and val ones
#This has been made according to the guidelines on the construction of V1, V2, V3, V4. 
#All in all, the complete vocabulary is the one with _test suffix
#In the embedding section the intermediate vocabularies will be used according to what they contain. For example, to compute the embedding matrix on the train set, we will use word_to_idx_train, while for validation word_to_idx_val
word_to_idx_train, idx_to_word_train = extend_vocabulary({}, [glove_emb_model.vocab.keys()] + x_pre["train"].tolist())
print("Train vocabulary size: ", len(word_to_idx_train))
word_to_idx_val, idx_to_word_val = extend_vocabulary(word_to_idx_train, x_pre["val"].tolist())
print("Val vocabulary size: ", len(word_to_idx_val))
word_to_idx_test, idx_to_word_test = extend_vocabulary(word_to_idx_val, x_pre["test"].tolist())
print("Test vocabulary size: ", len(word_to_idx_test))

#encoding the data set

x_enc = {"train": encode_into_numbers(x_pre["train"].tolist(), word_to_idx_train),
        "val": encode_into_numbers(x_pre["val"].tolist(), word_to_idx_val),
        "test": encode_into_numbers(x_pre["test"].tolist(), word_to_idx_test)}


Train vocabulary size:  400318
Val vocabulary size:  400475
Test vocabulary size:  400571


Encoding the labels (pos)

In [40]:
#creating vocabulary mapping for the labels in the whole dataset
label_to_idx, idx_to_label = extend_vocabulary({},  y["train"].tolist() + y["val"].tolist() + y["test"].tolist())

y_enc = {"train": encode_into_numbers(y["train"].tolist(), label_to_idx),
        "val": encode_into_numbers(y["val"].tolist(), label_to_idx),
        "test": encode_into_numbers(y["test"].tolist(), label_to_idx)}

number_pos = len(label_to_idx)
print(f"In the dataset there are {number_pos} distinct POS")

In the dataset there are 45 distinct POS


In [41]:
print(x_enc["train"][0])
print(decode_into_words([x_enc["train"][0]], idx_to_word_test))
print(df_dict["train"]["sentence"].iloc[0])

[5030, 400001, 2, 4979, 83, 168, 2, 44, 1430, 1, 535, 20, 8, 128565, 370, 2344, 1264, 3]
[['pierre', 'vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.', '29', '.']]
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']


In [51]:
print(decode_into_words([y_enc["train"][0]], idx_to_label))
print(df_dict["train"]["labels"].iloc[0])
print(y_enc["train"][0])

[['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']]
['NNP', 'NNP', ',', 'CD', 'NNS', 'JJ', ',', 'MD', 'VB', 'DT', 'NN', 'IN', 'DT', 'JJ', 'NN', 'NNP', 'CD', '.']
[1, 1, 2, 3, 4, 5, 2, 6, 7, 8, 9, 10, 8, 5, 9, 1, 3, 11]


# Embedding matrix

In [43]:
def get_dashed_embeddings(embedding_model, word):
  if "-" in word:
    words_split = word.split("-")
    words_split.sort(key=len)  #getting the encoding of compound words starting from the longest one
    for word_piece in words_split:
      try:
        return embedding_model[word]  #if a word is found, assign its embedding to the matrix element
      except:
        pass  #if a word is not found, do nothing
  return None

def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int]) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: 

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((len(word_to_idx)+1, embedding_dimension), dtype=np.float32)
    
    #adding all GloVe vocabularies embeddings
    for word, idx in tqdm(word_to_idx.items()):
      if word in embedding_model:
          embedding_matrix[idx] = embedding_model[word]
      else: 
          dashed_embedding = get_dashed_embeddings(embedding_model, word)
          if dashed_embedding is None: #it means that word has no dash or all its subwords are oov
              dashed_embedding = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
          embedding_matrix[idx] = dashed_embedding
 
    return embedding_matrix


#This functions adds the embedding of OOV words to the embedding matrix. Note tht it directly tries to find an embedding for dashed words and if none is retrieved it uses a uniform random distribution
def extend_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                            embedding_matrix: np.ndarray,
                            word_to_idx: Dict[str, int]) -> np.ndarray:

    oov_terms = [key for key, idx in word_to_idx.items() if idx >= embedding_matrix.shape[0]] #all the terms with mapped to an index gretaer than the vocabulary size (number of rows) are not in the embedding matrix 
    oov_embedding_matrix = np.zeros((len(oov_terms), embedding_matrix.shape[1]), dtype=np.float32)
    
    for idx, oov in enumerate(oov_terms):
        dashed_embedding = get_dashed_embeddings(embedding_model, oov)
        if dashed_embedding is None: #it means that word has no dash or all its subwords are oov
            dashed_embedding = np.random.uniform(low=-0.05, high=0.05, size=embedding_matrix.shape[1])

        oov_embedding_matrix[idx] = dashed_embedding

    return np.concatenate([embedding_matrix, oov_embedding_matrix], axis=0)

In [44]:
embedding_matrix = build_embedding_matrix(glove_emb_model, 
                                          EMBEDDING_DIMENSION,
                                          word_to_idx_train)
print(embedding_matrix.shape)

embedding_matrix = extend_embedding_matrix(glove_emb_model, 
                                          embedding_matrix,
                                          word_to_idx_val)
print(embedding_matrix.shape)

embedding_matrix = extend_embedding_matrix(glove_emb_model, 
                                          embedding_matrix,
                                          word_to_idx_test)
print(embedding_matrix.shape)

100%|██████████| 400318/400318 [00:00<00:00, 448112.81it/s]


(400319, 50)
(400475, 50)
(400571, 50)


In [60]:
print(idx_to_word_train[400001])

vinken


In [61]:
print(embedding_matrix[400001])

[-0.03338091  0.04008736  0.01017872 -0.04682402  0.0133119   0.0451397
 -0.0420833   0.00824561  0.04154786 -0.04903048  0.01309129 -0.00250977
 -0.01530107  0.01275365 -0.04472979  0.0253063  -0.04955205 -0.01930643
  0.00039586 -0.01229777 -0.01925241  0.03308351  0.04348326  0.03737127
  0.00224313  0.03841642  0.04208616 -0.00907616 -0.02165541 -0.04017348
 -0.00209081  0.04462969  0.00609075  0.01675621 -0.03871162 -0.03564803
 -0.00355267  0.04512648 -0.01392154  0.0377484  -0.03797114  0.01170172
  0.0100613   0.03598747 -0.04966833 -0.03100683 -0.02803124  0.04499661
 -0.03463063 -0.04630376]


# Sequence length standardization 
Every sentence must have the same length, otherwise we would habe different input sizes

In [45]:
max_length_dict = {"train": len(max(x_enc["train"], key=len)),
                   "val": len(max(x_enc["val"], key=len)),
                   "test": len(max(x_enc["test"], key=len))}

number_pos = len(label_to_idx) + 1 #to add the padding
x_st, y_st, y_cat = {}, {}, {}

for key in max_length_dict.keys():
    x_st[key] = pad_sequences(x_enc[key], maxlen=max_length_dict[key], padding='post')
    y_st[key] = pad_sequences(y_enc[key], maxlen=max_length_dict[key], padding='post')
    y_cat[key] = to_categorical(y_st[key], num_classes=number_pos)

In [62]:
print(y_cat["train"][0])

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


# Models

In [46]:
def build_model(layer_list, model_name):
    input = Input(shape=(None,), dtype="int32")
    t = layer_list[0](input)
    for layer in layer_list[1:]:
        t = layer(t)
    return Model(inputs=input, outputs=t, name=model_name)

In [47]:
models = {}
history = {}
batch_size = 8
epochs = 100

## Baseline model

In [48]:
baseline_layers = [
    Embedding(input_dim=embedding_matrix.shape[0], 
              output_dim=embedding_matrix.shape[1],
              mask_zero=True, 
              weights=[embedding_matrix], 
              trainable=False),
    Bidirectional(LSTM(units=100, return_sequences=True)),
    Dense(number_pos, activation='softmax')
]

models["baseline"] = build_model(baseline_layers, "baseline")

In [49]:
models["baseline"].summary()

Model: "baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 50)          20028550  
                                                                 
 bidirectional (Bidirectiona  (None, None, 200)        120800    
 l)                                                              
                                                                 
 dense (Dense)               (None, None, 46)          9246      
                                                                 
Total params: 20,158,596
Trainable params: 130,046
Non-trainable params: 20,028,550
_________________________________________________________________


In [50]:
models['baseline'].compile(optimizer = Adam(), 
                           loss = CategoricalCrossentropy(), 
                           metrics = [Accuracy()])

history['baseline'] = models['baseline'].fit(x=x_st["train"], 
                   y=y_cat["train"], 
                   batch_size=batch_size, 
                   epochs=epochs, 
                   validation_data=(x_st["val"], y_cat["val"]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
  1/246 [..............................] - ETA: 7s - loss: 0.0870 - accuracy: 0.0000e+00

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-50-18dbecacc770>", line 9, in <module>
    validation_data=(x_st["val"], y_cat["val"]))
  File "/usr/local/lib/python3.7/dist-packages/keras/utils/traceback_utils.py", line 64, in error_handler
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/keras/engine/training.py", line 1414, in fit
    callbacks.on_train_batch_end(end_step, logs)
  File "/usr/local/lib/python3.7/dist-packages/keras/callbacks.py", line 438, in on_train_batch_end
    self._call_batch_hook(ModeKeys.TRAIN, 'end', batch, logs=logs)
  File "/usr/local/lib/python3.7/dist-packages/keras/callbacks.py", line 297, in _call_batch_hook
    self._call_batch_end_hook(mode, batch, logs)
  File "/usr/local/lib/python3.7/dist-packages/keras/callbacks.py", line 318, in _call_batch_end_hook
 

KeyboardInterrupt: ignored

In [62]:
print("X: ", x_enc["train"][0])
print("X Encoded: ", x_st["train"][0], '\n\n', 100*'=', '\n')
print("Y: ", y_enc["train"][0])
print("Y Encoded:", y_st["train"][0])

X:  [4966, 97765, 2, 400001, 83, 168, 6, 158, 91, 6, 277, 1478, 942, 4, 6545, 29, 1912, 28, 96, 732, 2, 6, 5842, 33267, 2231, 2, 400001, 2, 664, 4, 44321, 2129, 1019, 2, 36, 1021, 3199, 4, 38, 1494, 3720, 6, 11262, 17564, 2854, 3]
X Encoded:  [  4966  97765      2 400001     83    168      6    158     91      6
    277   1478    942      4   6545     29   1912     28     96    732
      2      6   5842  33267   2231      2 400001      2    664      4
  44321   2129   1019      2     36   1021   3199      4     38   1494
   3720      6  11262  17564   2854      3      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0     

# Test

In [63]:
def get_model(input_dim, 
                 num_classes, 
                 embeddings, 
                 name = 'baseline',
                 use_GRU = False, 
                 additional_LSTM = False, 
                 additional_dense = False,
                 units=100,
                 dropout=0.0):
  """
  Create a Keras model for the POS-tagging task

  Parameters
  ----------
  input_dim: int
      The vocabulary size
  num_classes: int
      The number of different POS-tags (+ a class for the padding value) to be predicted 
  embeddings: numpy.ndarray
      Matrix containing the pre-trained embeddings 
  name: str
      The name of the model
  use_GRU: bool
      If True, use a BiGRU layer instead of the BiLSTM
  additional_LSTM: bool
      If True, use an additional BiLSTM layer right after the default one
  additional_dense: bool
      If True, adds a FC layer before the classifier
  units: int
      The hidden state's dimension of the RNNs
  dropout: float
      Define the drop rate of the Dropout layers. By default the dropout is disabled

  Returns
  -------
  tensorflow.keras.models.Model
      The POS-tagging model
      
  """
  inputs = Input(shape=(None,), dtype="int32")
  x = Embedding(
      input_dim=input_dim, 
      output_dim=embeddings.shape[1],
      mask_zero=True, 
      weights=[embeddings], 
      trainable=False)(inputs)
  if dropout:
    x = Dropout(dropout)(x)
  if use_GRU:
    rnn = GRU(units=units, return_sequences=True)
  else:
    rnn = LSTM(units=units, return_sequences=True, dropout=dropout)
  x = Bidirectional(rnn)(x)
  if additional_LSTM:
    x = LSTM(units=units, return_sequences=True)(x)
  if additional_dense:
    x = Dense(100, activation='relu')(x)
  x = Dense(num_classes, activation='softmax')(x)
  return Model(inputs, x, name=name)

In [64]:
batch_size = 8
epochs = 100
models = {}
histories = {}

In [65]:
models['baseline'] = get_model(embedding_matrix.shape[0], number_pos, embedding_matrix)
models['baseline'].summary()

Model: "baseline"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 50)          20028550  
                                                                 
 bidirectional_1 (Bidirectio  (None, None, 200)        120800    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, None, 46)          9246      
                                                                 
Total params: 20,158,596
Trainable params: 130,046
Non-trainable params: 20,028,550
_________________________________________________________________


In [None]:
models['baseline'].compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
histories['baseline'] = models['baseline'].fit(x=x_st["train"], 
                   y=y_cat["train"], 
                   batch_size=batch_size, epochs=epochs, 
                   validation_data=(x_st["val"], y_cat["val"]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100

 # The folllowing section is only for reference, as it has been completely rewritten

##OOV handling

In [None]:
#this function extends the current embedding matrix with the embeddings of the oov terms
def extend_embedding_matrix(embedding_matrix: np.ndarray,
                            oov_terms: List[str]) -> np.ndarray:

    oov_embedding_matrix = np.zeros((len(oov_terms), embedding_matrix.shape[1]), dtype=np.float32)
    for idx, oov in enumerate(oov_terms):
        embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_matrix.shape[1])
        oov_embedding_matrix[idx] = embedding_vector

    new_embedding_matrix = np.concatenate([embedding_matrix, oov_embedding_matrix])

    return new_embedding_matrix

### Check OOV1 (oov in the train set) and add OOV1 embeddings to the matrix

In [None]:
OOV1 = check_OOV_terms(set(glove_emb_model.vocab.keys()), word_listing_train)

NameError: ignored

In [None]:
oov_percentage = float(len(OOV1)) * 100 / len(word_listing_train)
print(f"Total OOV terms: {len(OOV1)} ({oov_percentage:.2f}%)")

#### Adding OOV1 embeddings to the matrix

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV1)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

### Check OOV2 (oov in the validation set) and add OOV2 embeddings to the matrix

In [None]:
OOV2 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)), word_listing_val)

In [None]:
oov2_percentage = float(len(OOV2)) * 100 / len(word_listing_val)
print(f"Total OOV terms: {len(OOV2)} ({oov2_percentage:.2f}%)")

####Adding OOV2 embeddings to the matrix

This cell is useless because OOV2 is empty

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV2)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

### Check OOV3 (oov in the test set) and add OOV3 embeddings to the matrix

In [None]:
OOV3 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)).union(set(OOV2)), word_listing_test)

In [None]:
oov3_percentage = float(len(OOV3)) * 100 / len(word_listing_test)
print(f"Total OOV terms: {len(OOV3)} ({oov3_percentage:.2f}%)")

####Adding OOV3 embeddings to the matrix
This cell is useless too, as no oov term is present in the test set

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV3)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

## Build embedding matrix

Starting from GloVe matrix

In [None]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: 

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((len(embedding_model.index2word), embedding_dimension), dtype=np.float32)
    
    #adding all GloVe vocabularies embeddings
    for idx, word in enumerate(embedding_model.index2word):
         embedding_vector = embedding_model[word]
         embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
#vocab_size = len(glove_emb_model.index2word) + len(OOV1)
embedding_matrix = build_embedding_matrix(glove_emb_model, EMBEDDING_DIMENSION)
print()
print(f"Embedding matrix shape: {embedding_matrix.shape}")