### Libraries

In [None]:
# Installing LaTeX. This is optional: it is only needed to render plot text with LaTeX.
# !sudo apt-get install dvipng texlive-latex-extra texlive-fonts-recommended cm-super

In [25]:
# Importing os.
import os

# Importing urllib.request.
import urllib.request

# Importing zipfile.
import zipfile

# Importing pandas.
import pandas as pd

# Importing numpy.
import numpy as np

# Importing random.
import random

# Importing tensorflow.
import tensorflow as tf

# Importing pad_sequences.
from keras_preprocessing.sequence import pad_sequences

# Importing Sequential.
from keras.models import Sequential

# Importing Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation and GRU.
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation, GRU

# Importing L2.
from keras.regularizers import l2

# Importing Adam.
from keras.optimizers import Adam

# Importing EarlyStopping and ReduceLROnPlateau.
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Importing classification_report.
from sklearn.metrics import classification_report

# Importing pyplot.
import matplotlib.pyplot as plt

# Seeds every random number generator used by the notebook.
def set_reproducibility(seed):
  """Seed Python, NumPy and TensorFlow, then set TF_DETERMINISTIC_OPS.

  Args:
    seed: value handed to each of the three seeding functions.
  """

  # Seeding, in this order: Python's random module, NumPy, TensorFlow.
  for seed_function in (random.seed, np.random.seed, tf.random.set_seed):
    seed_function(seed)

  # Setting the TF_DETERMINISTIC_OPS environment variable to "1".
  os.environ["TF_DETERMINISTIC_OPS"] = "1"

# Applying the seed used throughout the notebook.
set_reproducibility(seed = 42)

# Using TeX. This is optional: it is only needed to render plot text with LaTeX.
# plt.rc("text", usetex = True)

# Setting the font family. This is optional: it is only needed to render plot text with LaTeX.
# plt.rc("font", family = "serif")

# Setting the font size. This is optional: it is only needed to render plot text with LaTeX.
# plt.rcParams.update({"font.size": 15})

# Using package amsmath. This is optional: it is only needed to render plot text with LaTeX.
# plt.rcParams["text.latex.preamble"] = [r"\usepackage{amsmath}"]

### Data Preprocessing

In [4]:
# Function used to download and extract .zip archives.
def downloader(url, folder_name, filename):
  """Download a .zip archive (unless it is already present) and extract it.

  The archive is stored as <cwd>/<folder_name>/<filename> and its content is
  extracted into <cwd>/<folder_name>. The download is skipped when the archive
  already exists, while the extraction is performed at every call.

  Args:
    url: address of the .zip archive.
    folder_name: folder, joined onto the current working directory, in which
      the archive is stored and extracted.
    filename: name given to the downloaded archive.

  Returns:
    The tuple (data_path, zip_path): extraction folder and archive path.
  """

  # Defining data folder path.
  data_path = os.path.join(os.getcwd(), folder_name)

  # Creating data folder (nothing happens if it already exists).
  os.makedirs(data_path, exist_ok = True)

  # Defining .zip file path.
  zip_path = os.path.join(data_path, filename)

  # Requesting .zip file.
  if not os.path.exists(zip_path):

      # The archive is first written to a temporary file and renamed only once the transfer is complete.
      # In this way an interrupted download is never mistaken for a valid archive by a later run.
      partial_path = zip_path + ".part"

      try:
          urllib.request.urlretrieve(url, partial_path)
          os.replace(partial_path, zip_path)

      finally:

          # Removing what is left of a failed download (after a successful rename there is nothing to remove).
          if os.path.exists(partial_path):
              os.remove(partial_path)

  # Extracting data from .zip.
  with zipfile.ZipFile(zip_path, "r") as zip_ref:
      zip_ref.extractall(path = data_path)

  # Returning data_path and zip_path.
  return data_path, zip_path

In [5]:
# Fetching the dependency treebank corpus; only the extraction folder is kept.
data_path, _ = downloader(
  url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip",
  folder_name = "data",
  filename = "dependency_treebank.zip",
)

# Fetching the GloVe embeddings; only the extraction folder is kept.
glove_path, _ = downloader(
  url = "https://nlp.stanford.edu/data/glove.6B.zip",
  folder_name = "glove",
  filename = "glove.6B.zip",
)

In [6]:
# Name of the folder extracted from the dataset archive.
dataset_name = "dependency_treebank"

# Path to the first training document.
file_path = os.path.join(data_path, dataset_name, "wsj_0001.dp")

# Showing the raw content of the document, provided that it exists.
if os.path.isfile(file_path):

  # Reading the whole document.
  with open(file_path, mode = "r") as text_file:
    first_document = text_file.read()

  # Printing it.
  print(first_document)

Pierre	NNP	2
Vinken	NNP	8
,	,	2
61	CD	5
years	NNS	6
old	JJ	2
,	,	2
will	MD	0
join	VB	8
the	DT	11
board	NN	9
as	IN	9
a	DT	15
nonexecutive	JJ	15
director	NN	12
Nov.	NNP	9
29	CD	16
.	.	8

Mr.	NNP	2
Vinken	NNP	3
is	VBZ	0
chairman	NN	3
of	IN	4
Elsevier	NNP	7
N.V.	NNP	12
,	,	12
the	DT	12
Dutch	NNP	12
publishing	VBG	12
group	NN	5
.	.	3



In [7]:
# Defining embedding size.
EMBEDDING_SIZE = 50

# Defining specific glove's file path.
glove_file = os.path.join(os.getcwd(), glove_path, f"glove.6B.{EMBEDDING_SIZE}d.txt")

# Defining initial vocabulary, mapping each token onto its embedding vector.
embedding_vocabulary = {}

# Reading the file one line at a time, so that its whole content is never kept in memory.
with open(glove_file, encoding = "utf8") as text_file:

  # Reading single lines.
  for line in text_file:

    # Splitting line: the first field is the token, the remaining ones are the vector components.
    splits = line.split()

    # Skipping blank lines, which hold no token.
    if not splits: continue

    # Storing line into vocabulary.
    embedding_vocabulary[splits[0]] = np.array([float(val) for val in splits[1:]])

# Printing one entry of the vocabulary.
print("The embedding for 'the' is:\n{}.".format(embedding_vocabulary["the"]))

The embedding for 'the' is:
[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01].


In [8]:
# Function used to get a list of embeddings.
def get_embeddings(sentence, vocabulary, embedding_size):
  """Map each word of a sentence onto its embedding vector.

  Words are lower-cased before being looked up. Words missing from the
  vocabulary (OOV) are mapped onto a vector of zeros.

  Args:
    sentence: iterable of words (strings).
    vocabulary: dictionary mapping lower-cased words onto embedding vectors.
    embedding_size: length of the zero vector used for OOV words.

  Returns:
    A list with one vector per word, in the order of the sentence. OOV
    entries are NumPy arrays too, so that all entries share the same type.
  """

  # List of embeddings for the input sentence.
  embeddings = []

  # Retrieving embedding vector for each word.
  for word in sentence:

    # Looking up the embedding (None for OOV terms).
    embedding = vocabulary.get(word.lower())

    # Using a fresh vector of zeros for OOV terms.
    if embedding is None:
      embedding = np.zeros(embedding_size)

    # Populating the list of embeddings.
    embeddings.append(embedding)

  # Returning list of embeddings.
  return embeddings

# List containing dataframe rows (one row per sentence).
dataframe_rows = []

# List containing words of the sentence being read.
row_words = []

# List containing tags of the sentence being read.
row_tags = []

# Defining data folder path.
folder = os.path.join(data_path, dataset_name)

# Storing rows. Files are visited in sorted order, so that rows are always produced in the same order.
for filename in sorted(os.listdir(folder)):

  # Computing path to file.
  file_path = os.path.join(folder, filename)

  # Skipping anything that is not a regular file.
  if not os.path.isfile(file_path): continue

  # Opening the file.
  with open(file_path, mode = "r") as text_file:

    # Reading lines. The trailing empty string marks the end of the file,
    # so that the last sentence is stored even if no blank line follows it.
    for line in text_file.readlines() + [""]:

      # Splitting the line only once.
      fields = line.split()

      # Token line: the first field is the word, the second one is the POS tag.
      if fields:

        # Storing the word.
        row_words.append(fields[0])

        # Storing the POS tag.
        row_tags.append(fields[1])

      # Blank line or end of file: the sentence is over. Nothing is stored when no word has been read
      # (consecutive blank lines, blank line right before the end of the file, empty file).
      elif row_words:

        # Creating a row. The file id is the number in the file name (e.g. "wsj_0001.dp" gives 1).
        dataframe_row = {"file_id": int(filename.split(".")[0].split("_")[1]),
                         "sentence": row_words,
                         "tags": row_tags,
                         "features": get_embeddings(row_words, embedding_vocabulary, EMBEDDING_SIZE)}

        # Appending row.
        dataframe_rows.append(dataframe_row)

        # Resetting row_words list so to store a new sentence.
        row_words = []

        # Resetting row_tags list so to store a new sentence.
        row_tags = []

# Creating pandas dataframe.
dataframe = pd.DataFrame(dataframe_rows)

# Printing dataframe head.
dataframe.head()

Unnamed: 0,file_id,sentence,tags,features
0,1,"[Pierre, Vinken, ,, 61, years, old, ,, will, j...","[NNP, NNP, ,, CD, NNS, JJ, ,, MD, VB, DT, NN, ...","[[0.23568, 0.39638, -0.60135, -0.52681, 0.1587..."
1,1,"[Mr., Vinken, is, chairman, of, Elsevier, N.V....","[NNP, NNP, VBZ, NN, IN, NNP, NNP, ,, DT, NNP, ...","[[0.006008, 0.57028, -0.064426, -0.044687, 0.8..."
2,2,"[Rudolph, Agnew, ,, 55, years, old, and, forme...","[NNP, NNP, ,, CD, NNS, JJ, CC, JJ, NN, IN, NNP...","[[0.86274, 0.056588, -0.081828, -0.35318, -0.0..."
3,3,"[A, form, of, asbestos, once, used, to, make, ...","[DT, NN, IN, NN, RB, VBN, TO, VB, NNP, NN, NNS...","[[0.21705, 0.46515, -0.46757, 0.10082, 1.0135,..."
4,3,"[The, asbestos, fiber, ,, crocidolite, ,, is, ...","[DT, NN, NN, ,, NN, ,, VBZ, RB, JJ, IN, PRP, V...","[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."


In [9]:
# File ids of the training documents.
TRAINING_SPLIT = range(1, 101)

# File ids of the validation documents.
VALIDATION_SPLIT = range(101, 151)

# File ids of the test documents.
TEST_SPLIT = range(151, 200)

# Selecting, for each split, the rows whose file id belongs to it.
train, validation, test = (
  dataframe.loc[dataframe["file_id"].isin(split_ids)]
  for split_ids in (TRAINING_SPLIT, VALIDATION_SPLIT, TEST_SPLIT)
)

In [10]:
# Number of words of the longest training sentence.
MAX_LENGTH = max(map(len, train["sentence"].tolist()))

# Pads (after the last word) the features of a split up to MAX_LENGTH.
def _pad_features(split):
  return pad_sequences(split["features"].tolist(), maxlen = MAX_LENGTH, padding = "post", dtype = "float32")

# Padded train features.
train_sentences = _pad_features(train)

# Padded validation features.
validation_sentences = _pad_features(validation)

# Padded test features.
test_sentences = _pad_features(test)

In [11]:
# Collecting the tags of the training set, each one kept once and in order of first appearance.
# A dict is used instead of a set because it preserves insertion order, which makes results reproducible.
# The training set is assumed to contain every tag.
tags = list(dict.fromkeys(tag for sentence_tags in train["tags"].tolist() for tag in sentence_tags))

# Vocabulary for tags: PAD is mapped onto 0.
tag_to_index = {"PAD": 0}

# All other tags are mapped onto the indexes from 1 up to |tags|.
tag_to_index.update((tag, index) for index, tag in enumerate(tags, start = 1))

# Function turning lists of tags into lists of integers through a tag-to-index vocabulary.
def convert_tags(input_tags, vocabulary):
  """Replace every tag with its index.

  Args:
    input_tags: list of lists of tags (one inner list per sentence).
    vocabulary: dictionary mapping each tag onto its index.

  Returns:
    A list of lists of indexes with the same structure as input_tags.

  Raises:
    KeyError: if a tag is missing from the vocabulary.
  """

  # One list of indexes per sentence.
  return [[vocabulary[tag] for tag in sentence_tags] for sentence_tags in input_tags]

# Encoding the tags of each split as integers, then padding them (after the last tag) up to MAX_LENGTH.
train_tags, validation_tags, test_tags = (
  pad_sequences(convert_tags(input_tags = split["tags"].tolist(), vocabulary = tag_to_index),
                maxlen = MAX_LENGTH,
                padding = "post")
  for split in (train, validation, test)
)

### Model Definition and Training

In [12]:
# Function that creates the model.
def get_model(name, layers, input_shape, n_classes = None):
  """Build a sequential tagger: input, given layers, dense output and softmax.

  Args:
    name: name of the model.
    layers: list of layers placed between the input and the output layers.
    input_shape: shape of a single input sample, passed to the input layer.
    n_classes: number of units of the output dense layer. When None, the
      size of the module-level tag_to_index vocabulary is used.

  Returns:
    The (not compiled) model.
  """

  # Falling back to the number of entries of the tag vocabulary.
  if n_classes is None:
    n_classes = len(tag_to_index)

  # Sequential model. The name is given through the constructor rather than by writing the private _name attribute.
  model = Sequential(name = name)

  # Adding input layer.
  model.add(InputLayer(input_shape = input_shape))

  # Adding layers.
  for layer in layers:
    model.add(layer)

  # Output dense layer, applied to every time step.
  model.add(TimeDistributed(Dense(n_classes)))

  # Softmax.
  model.add(Activation("softmax"))

  # Returning the model.
  return model

In [29]:
# Names of the models.
models_name = ["m_0", "m_1", "m_2", "m_3"]

# Descriptions of the models, indexed by model name.
descriptions_dict = {}

# Baseline model.
descriptions_dict[models_name[0]] = (f"Baseline model ({models_name[0]}): \n"
                                     "- Bi-directional LSTM layer. \n"
                                     "- Time-distributed dense layer. \n"
                                     "- Softmax activation function.")

# GRU model.
descriptions_dict[models_name[1]] = (f"GRU model ({models_name[1]}): \n"
                                     "- GRU layer. \n"
                                     "- Time-distributed dense layer. \n"
                                     "- Softmax activation function.")

# Model with two bi-directional LSTM layers.
descriptions_dict[models_name[2]] = (f"Double bi-directional LSTM model ({models_name[2]}): \n"
                                     "- Bi-directional LSTM layer. \n"
                                     "- Bi-directional LSTM layer. \n"
                                     "- Time-distributed dense layer. \n"
                                     "- Softmax activation function.")

# Model with two dense layers.
descriptions_dict[models_name[3]] = (f"Double dense layer model ({models_name[3]}): \n"
                                     "- Bi-directional LSTM layer. \n"
                                     "- Time-distributed dense layer. \n"
                                     "- ReLU activation function. \n"
                                     "- Time-distributed dense layer. \n"
                                     "- Softmax activation function.")

# Layers of the models, indexed by model name (still to be filled).
layers_dict = {}

# Number of samples per batch.
BATCH_SIZE = 128

# Maximum number of training epochs.
EPOCHS = 100

# Weight decay parameter.
reg_parameter = 0.01

# Callback that monitors the validation loss with a patience of 5 epochs and restores the best weights.
early_stopping = EarlyStopping(monitor = "val_loss",
                               patience = 5,
                               restore_best_weights = True)

# Callback that monitors the validation loss with a patience of 3 epochs and a learning rate factor of 0.1.
reduce_lr = ReduceLROnPlateau(monitor = "val_loss",
                              patience = 3,
                              factor = 0.1)

In [30]:
# Candidate numbers of LSTM units.
LSTM_units = [64, 128, 256, 512]

# Baseline models built during the grid-search, one per candidate.
baseline_models = []

# Training histories, in the same order as the models.
histories = []

# Announcing which model is being searched.
print(f"Grid-search, {models_name[0]} model.")

# Grid-search over the number of units, so to find the best baseline model.
for n_units in LSTM_units:

  # Recurrent layer with the current number of units and L2 regularization on the recurrent weights.
  recurrent_layer = LSTM(n_units, return_sequences = True, recurrent_regularizer = l2(reg_parameter))

  # List of layers: the recurrent layer wrapped so to read the sentence in both directions.
  layers = [Bidirectional(recurrent_layer)]

  # Creating the baseline model.
  model = get_model(name = models_name[0], layers = layers, input_shape = (MAX_LENGTH, EMBEDDING_SIZE))

  # Compiling.
  model.compile(loss = "sparse_categorical_crossentropy",
                optimizer = Adam(0.01),
                metrics = ["accuracy"])

  # Storing model.
  baseline_models.append(model)

  # Separating the output of consecutive models.
  print()

  # Printing summary.
  model.summary()

  # Fitting the model.
  history = model.fit(train_sentences,
                      train_tags,
                      batch_size = BATCH_SIZE,
                      epochs = EPOCHS,
                      validation_data = (validation_sentences, validation_tags),
                      callbacks = [early_stopping, reduce_lr])

  # Storing history.
  histories.append(history)

Grid-search m_0 model.

Model: "m_0"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_20 (Bidirecti  (None, 249, 128)         58880     
 onal)                                                           
                                                                 
 time_distributed_20 (TimeDi  (None, 249, 46)          5934      
 stributed)                                                      
                                                                 
 activation_20 (Activation)  (None, 249, 46)           0         
                                                                 
Total params: 64,814
Trainable params: 64,814
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 1

In [33]:
# Punctuation tag list: classes left out of the macro F1-score (PAD included).
punctuation_tag_list = ["PAD", ",", ".", "``", "''", ":", "$", "#"]

# Function used to compute macro F1-score.
def compute_F1_score(model, X, y, tag_to_index_vocabulary):
  """Compute the macro F1-score over the classes not in punctuation_tag_list.

  Args:
    model: object whose predict(X) returns one score per class for each
      position; the class is taken with an argmax over axis 2.
    X: input passed to model.predict.
    y: array of true class indexes; it is flattened before the comparison.
    tag_to_index_vocabulary: dictionary mapping tags onto class indexes.
      NOTE(review): report rows are matched with tags by position, which
      assumes that the i-th key is mapped onto index i - confirm.

  Returns:
    The mean F1-score of the considered classes, or 0.0 when no class is
    left after the exclusion.
  """

  # Computing predictions.
  pred = model.predict(X)

  # Names of the classes, in vocabulary order.
  class_names = list(tag_to_index_vocabulary.keys())

  # Computing classification report.
  report = classification_report(y.flatten(),
                                 np.argmax(pred, axis = 2).flatten(),
                                 labels = np.arange(0, len(class_names), 1),
                                 target_names = class_names,
                                 zero_division = 0,
                                 output_dict = True)

  # F1-scores of the classes that are not excluded.
  f1_scores = [report[tag]["f1-score"] for tag in class_names if tag not in punctuation_tag_list]

  # Nothing to average.
  if not f1_scores: return 0.0

  # Averaging over the classes actually summed. Excluded tags that are missing from the vocabulary
  # must not shrink the denominator, which is what subtracting len(punctuation_tag_list) would do.
  macro_f1 = sum(f1_scores) / len(f1_scores)

  # Returning macro F1-score.
  return macro_f1

In [34]:
# Macro F1-scores of the baseline models on the validation set, in grid-search order.
baseline_f1_scores = []

# Scoring one model at a time.
for baseline_model in baseline_models:
  baseline_f1_scores.append(compute_F1_score(baseline_model, validation_sentences, validation_tags, tag_to_index))

# Printing macro F1-scores.
print(f"The computed macro F1-scores are: {baseline_f1_scores}.")

The computed macro F1-scores are: [0.6700926647383326, 0.6929254977496595, 0.729765334981206, 0.7225049332391134].


In [35]:
# Position of the highest macro F1-score, which is also the position of the best number of units.
best_units_index = np.argmax(baseline_f1_scores)

# Printing best number of LSTM units for the baseline model.
print(f"The best number of LSTM units for the baseline model is: {LSTM_units[best_units_index]}.")

The best number of LSTM units for the baseline model is: 256.


In [None]:
"""
# Dictionary of possible models.
layers_dict = {models_name[0]: [Bidirectional(LSTM(256, return_sequences = True))],
               models_name[1]: [GRU(256, return_sequences = True)],
               models_name[2]: [Bidirectional(LSTM(256, return_sequences = True)),
                                Bidirectional(LSTM(256, return_sequences = True))],
               models_name[3]: [Bidirectional(LSTM(256, return_sequences = True)),
                                TimeDistributed(Dense(128)),
                                Activation("relu")]}
"""

In [None]:
# TODO: save and plot fitting history, grid-search over hyper-parameters.