In [1]:
import os, shutil  #  file management
import sys 
import pandas as pd  #  dataframe management
import numpy as np  #  data manipulation
# useful during debugging (progress bars)
from tqdm import tqdm
import re

from typing import Callable, List, Dict, Tuple, Set

#from tensorflow.random import set_seed


#fixed seeds to get reproducible results
np.random.seed(42)
#set_seed(42)

#Bulding the dataframe

## Dataset download and extraction

In [2]:
import urllib.request  #  download files
import zipfile  #  unzip files

DATASET_NAME = "dataset.zip"
DATASET_FOLDERNAME = "Dataset"
DATASET_SUBFOLDER = "dependency_treebank/"
SPLIT_DISTRIBUTION = [100, 150, 199]

working_folder = os.getcwd()

print("Current working directory: " + str(working_folder))

dataset_folder = os.path.join(os.getcwd(), DATASET_FOLDERNAME)

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

dataset_path = os.path.join(dataset_folder, "dataset.zip")

if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)
    print("Successful download")

with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_folder)
    print("Successful extraction")

#update folder to the extracted one
dataset_folder = os.path.join(dataset_folder, DATASET_SUBFOLDER)

Current working directory: /content
Successful download
Successful extraction


In [3]:
print(dataset_folder)

/content/Dataset/dependency_treebank/


## Dataframe construction

In [4]:
def encode_dataset(dataset_folder: str, 
                   split_dist: list(), ) -> Dict[str,pd.DataFrame]:
    
    df_dict = {"train": pd.DataFrame(columns=['sentence', 'labels']),
                "val": pd.DataFrame(columns=['sentence', 'labels']),
                "test":pd.DataFrame(columns=['sentence', 'labels'])}
    split = ""

    for filename in os.listdir(dataset_folder):
        file_path = os.path.join(dataset_folder, filename)
        try:
            if os.path.isfile(file_path):
                with open(file_path, mode='r', encoding='utf-8') as text_file:
                    # read it and extract 
                    document_number = filename.split("_")[1].split(".")[0]
                    if int(document_number) <= split_dist[0]:
                        split = "train"
                    elif split_dist[0] < int(document_number) <= split_dist[1]:
                        split = "val"
                    else:
                        split = "test"

                    df_file = pd.read_table(
                        file_path, 
                        delimiter='\t', 
                        names=['word', 'label'], 
                        usecols=[0,1],
                        skip_blank_lines=False)
                    
                    #splitting file content in sentences
                    idx = list(df_file.loc[df_file.isnull()['word']].index)
                    idx.append(len(df_file))
                    prev = 0
                    for sep in idx:
                        df_sentence = pd.DataFrame({
                            'sentence': [df_file['word'][prev:sep].to_list()], 
                            'labels': [df_file['label'][prev:sep].to_list()]})
                        df_dict[split] = pd.concat([df_dict[split], df_sentence], ignore_index=True)
                        prev = sep + 1
                    
        except Exception as e:
            print('Failed to process %s. Reason: %s' % (file_path, e))
            sys.exit(0)

    return df_dict

In [5]:
df_dict = encode_dataset(dataset_folder, SPLIT_DISTRIBUTION)

In [6]:
df_dict["train"].head()

Unnamed: 0,sentence,labels
0,"[Sir, Peter, Walters, ,, 58-year-old, chairman...","[NNP, NNP, NNP, ,, JJ, NN, IN, NNP, NNP, NNP, ..."
1,"[Sir, Peter, will, succeed, Sir, John, Milne, ...","[NNP, NNP, MD, VB, NNP, NNP, NNP, ,, CD, ,, WP..."
2,"[The, Life, Insurance, Co., of, Georgia, has, ...","[DT, NNP, NNP, NNP, IN, NNP, VBZ, RB, VBN, DT,..."
3,"[David, Wu, ,, the, company, 's, representativ...","[NNP, NNP, ,, DT, NN, POS, NN, IN, NNP, ,, VBD..."
4,"[Life, of, Georgia, is, part, of, the, Nationa...","[NNP, IN, NNP, VBZ, NN, IN, DT, NNP, NNP, NNP,..."


In [7]:
df_dict["val"].head()

Unnamed: 0,sentence,labels
0,"[Beauty, Takes, Backseat, To, Safety, on, Brid...","[NN, VBZ, NN, TO, NNP, IN, NNPS]"
1,"[EVERYONE, AGREES, that, most, of, the, nation...","[NN, VBZ, IN, JJS, IN, DT, NN, POS, JJ, NNS, V..."
2,"[But, there, 's, disagreement, over, how, to, ...","[CC, EX, VBZ, NN, IN, WRB, TO, VB, PRP, .]"
3,"[Highway, officials, insist, the, ornamental, ...","[NN, NNS, VBP, DT, JJ, NNS, IN, JJR, NNS, VBP,..."
4,"[But, other, people, do, n't, want, to, lose, ...","[CC, JJ, NNS, VBP, RB, VB, TO, VB, DT, NNS, PO..."


In [8]:
df_dict["test"].head()

Unnamed: 0,sentence,labels
0,"[John, F., Barrett, ,, 40, ,, formerly, execut...","[NNP, NNP, NNP, ,, CD, ,, RB, JJ, NN, NN, CC, ..."
1,"[Two, leading, constitutional-law, experts, sa...","[CD, VBG, NN, NNS, VBD, NNP, NNP, VBZ, RB, VB,..."
2,"[Professors, Philip, Kurland, of, the, Univers...","[NNP, NNP, NNP, IN, DT, NNP, IN, NNP, CC, NNP,..."
3,"[A, line-item, veto, is, a, procedure, that, w...","[DT, JJ, NN, VBZ, DT, NN, WDT, MD, VB, DT, NN,..."
4,"[Mr., Bush, has, said, he, would, like, to, be...","[NNP, NNP, VBZ, VBN, PRP, MD, VB, TO, VB, JJ, ..."


In [9]:
#Training data
x_train = df_dict['train']['sentence']
y_train = df_dict['train']['labels']

#Validation data
x_val = df_dict['val']['sentence']
y_val = df_dict['val']['labels']

#Test data
x_test = df_dict['test']['sentence']
y_test = df_dict['test']['labels']

# Glove Embedding model, vocabulary and OOV detection

## Load Glove embedding

In [10]:
import gensim
import gensim.downloader as gloader

def load_GloVe_embedding(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [11]:
EMBEDDING_DIMENSION = 50
glove_emb_model = load_GloVe_embedding(EMBEDDING_DIMENSION)



##Creating Vocabulary

In [12]:
from collections import OrderedDict

def build_vocabulary(sr: pd.Series) -> List[str]:
    """
    Given a dataset, builds the corresponding word vocabulary.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - vocabulary: set of unique terms that build up the vocabulary
    """
    vocabulary = []
    for sentence in tqdm(sr):
        for token in sentence:
            if token not in vocabulary:
                vocabulary.append(token)

    return vocabulary

In [13]:
vocabulary_dict = {}
for split in df_dict.keys():
    vocabulary_dict[split] = build_vocabulary(x_train)
    print()
    print(f'[Debug] {split} vocabulary size: {len(vocabulary_dict[split])}')

100%|██████████| 1963/1963 [00:01<00:00, 1556.84it/s]



[Debug] train vocabulary size: 8009


100%|██████████| 1963/1963 [00:01<00:00, 1556.31it/s]



[Debug] val vocabulary size: 8009


100%|██████████| 1963/1963 [00:01<00:00, 1567.05it/s]


[Debug] test vocabulary size: 8009





## OOV detection

In [14]:
def check_OOV_terms(vocabulary: List[str],
                    word_listing: List[str]):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    embedding_vocabulary = set(vocabulary)
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [15]:
OOV1 = check_OOV_terms(glove_emb_model.vocab.keys(), vocabulary_dict["train"])
OOV1_percentage = float(len(OOV1)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1)} ({OOV1_percentage:.2f}%)")

Total OOV terms: 2346 (29.29%)


A lot of words are OOV simply because they start with capital letter, so we will lower all the words and check again the OOV. Before this section we can insert a graph showing the OOV words

In [16]:
OOV1_lowercase = check_OOV_terms(glove_emb_model.vocab.keys(), [v.lower() for v in vocabulary_dict["train"]])
OOV1_lowercase_percentage = float(len(OOV1_lowercase)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1_lowercase)} ({OOV1_lowercase_percentage:.2f}%)")

Total OOV terms: 359 (4.48%)


In [17]:
print("\n".join(OOV1_lowercase))

5\/8
12,252
62%-owned
retin-a
3057
side-crash
crocidolite
makato
program-trading
collective-bargaining
1\/4
northy
synergistics
year-ago
top-yielding
teacher-cadet
sub-segments
c.j.b.
nipponese
forest-product
security-type
coche-dury
234.4
high-balance
wheeland
automotive-parts
search-and-seizure
pattenden
sticker-shock
ratners
six-bottle
besuboru
rexinger
built-from-kit
war-rationed
4,393,237
church-goers
sacramento-based
9,118
foreign-stock
achievement-test
415.8
lap-shoulder
3,288,453
meinders
index-related
computer-driven
machine-gun-toting
college-bowl
product-design
382-37
bumkins
energy-services
1.5755
high-rate
one-yen
90-cent-an-hour
shirt-sleeved
374.20
pre-1917
gingl
government-certified
jalaalwalikraam
autions
stock-index
life-insurance
we-japanese
rate-sensitive
yen-support
three-lawyer
two-sevenths
446.62
direct-investment
test-prep
twindam
corton-charlemagne
revenue-desperate
video-viewing
front-seat
-lrb-
anti-china
durable-goods
eight-count
odd-sounding
romanee-conti
b

# Preprocessing

Here we replace bracket value with their symbols: 
```
  -lrb- and -lcb-   -->  ( 
  -rrb- and -rcb-   -->  )
```

In addition, all the rational numbers will be replaced with the placeholder #number#, as long as the floating point numbers.
Note that rational numbers, instead of being like 3/4, are written as 3\/4. The cause is that symbol "/" is represented using "\/", as this happens also in other words that are notrational ones






In [18]:
def preprocessing(content_list: List[str]) -> List[str]:
    placeholder = "#number#"
    re_slashes = re.compile('\\\/')  #pattern \/ 
    re_rational = re.compile('\d+\/\d+')  #pattern rational number (e.g. 1/5)
    re_number = re.compile('[+-]?(\d*[.])?\d+')  #pattern decimal number (e.g. 3.14)
    re_left_bracket = re.compile('(-lrb-)|(-lcb-)')  #pattern left bracket
    re_right_bracket = re.compile('(-rrb-)|(-rcb-)')  #pattern right bracket
    re_slashed_words = re.compile("(\w*)\/(\w*)")  #a slash separating words will be replaced with a dash, following the trend of the dataset, where composed words are in the form word-word

    content_list_preprocessed = [content.lower() for content in content_list]
    content_list_preprocessed = [re_slashes.sub("/", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_left_bracket.sub("(", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_right_bracket.sub(")", content) for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_rational, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_number, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [content.replace("/", "-") if re.match(re_slashed_words, content) else content for content in content_list_preprocessed]

    return content_list_preprocessed


Preprocessing the dataset

In [19]:
x_train_preprocessed = x_train.apply(preprocessing)

Building the new vocabulary after preprocessing

In [20]:
train_vocabulary_preprocessed = build_vocabulary(x_train_preprocessed)
print()
print(f'[Debug] train vocabulary size after preprocessing: {len(train_vocabulary_preprocessed)}')

OOV1_preprocessed = check_OOV_terms(glove_emb_model.vocab.keys(), train_vocabulary_preprocessed)
OOV1_preprocessed_percentage = float(len(OOV1_preprocessed)) * 100 / len(train_vocabulary_preprocessed)
print(f"Total OOV terms: {len(OOV1_preprocessed)} ({OOV1_preprocessed_percentage:.2f}%)")

100%|██████████| 1963/1963 [00:01<00:00, 1796.22it/s]


[Debug] train vocabulary size after preprocessing: 6919
Total OOV terms: 290 (4.19%)





It can be seen that the number of OOV words has plummetted with respect to the the non preprocessed data. Similarly to train data, we apply preprocessing to validation and test splits.

In [23]:
x_val_preprocessed = x_val.apply(preprocessing)
x_test_preprocessed = x_test.apply(preprocessing)

# Mapping
Since we want to work with numerical data only, we will mapp words and pos (labels) to numbers. 

In [46]:
# Methods to create mapping

#adds oov words at the end of vocabulary
def extend_vocabulary(word_to_idx_original: Dict[str, int],
                      words_to_add) -> Tuple[Dict[str, int],Dict[int, str]]:
  """
    Given mapping between word and indeces, adds new words.

    :param word_to_idx_original: dictionary with key=word and value=index to which the word is mapped
    :return:
      - word_to_idx_extended: word_to_idx with new words
      - idx_to_word_extended: swapped version of word_to_idx_extended (keys and values are swapped)
  """
  word_to_idx_extended = word_to_idx_original
  idx = len(word_to_idx_extended.keys())
  if idx == 0: 
    idx = 1  #position 0 is reserved

  for sentence in words_to_add:
      for token in sentence:
          if token not in word_to_idx_extended:
              word_to_idx_extended[token] = idx 
              idx += 1
  idx_to_word_extended = {v: k for k, v in word_to_idx_extended.items()}

  return word_to_idx_extended, idx_to_word_extended

def encode_into_numbers(sentences: List[str],
                        word_to_idx_mapping: Dict[str, int]) -> List[int]:
    """
    Return a list of sequences encoded into integers following the mapping of the vocabulary
    """
    encoded_data = [[word_to_idx_mapping[token] for token in sentence] for sentence in sentences]
 
    return encoded_data

def decode_into_words(encoded_sentences: List[str],
                        idx_to_word_mapping: Dict[int,str]) -> List[str]:
    """
    Return a list of sequences decoded back to words following the (reverse) mapping of the vocabulary
    """
    decoded_data = [[idx_to_word_mapping[index] for index in sentence] for sentence in encoded_sentences]
 
    return decoded_data


In [43]:
#creating vocabulary mapping for the words in the data set
#Note that they are incremental, this means that the val vocabulary includes the rain one, and the test one inlcudes train and val ones
#This has been made according to the guidelines on the construction of V1, V2, V3, V4. 
#All in all, the complete vocabulary is the one with _test suffix
#In the embedding section the intermediate vocabularies will be used according to what they contain. For example, to compute the embedding matrix on the train set, we will use word_to_idx_train, while for validation word_to_idx_val
word_to_idx_train, idx_to_word_train = extend_vocabulary({}, [glove_emb_model.vocab.keys()] + x_train_preprocessed.tolist())
print("Train vocabulary size: ", len(word_to_idx_train))
word_to_idx_val, idx_to_word_val = extend_vocabulary(word_to_idx_train, x_val_preprocessed.tolist())
print("Val vocabulary size: ", len(word_to_idx_train))
word_to_idx_test, idx_to_word_test = extend_vocabulary(word_to_idx_val, x_test_preprocessed.tolist())
print("Test vocabulary size: ", len(word_to_idx_test))

#encoding the data set
x_train_enc = encode_into_numbers(x_train_preprocessed.tolist(), word_to_idx_train)

Train vocabulary size:  400290
Val vocabulary size:  400431
Test vocabulary size:  400503


In [41]:
#creating vocabulary mapping for the labels in the whole dataset
label_to_idx, idx_to_label = extend_vocabulary({},  y_train.tolist() + y_val.tolist() + y_test.tolist())

y_train_enc = encode_into_numbers(y_train.tolist(), label_to_idx)
y_val_enc = encode_into_numbers(y_val.tolist(), label_to_idx)
y_test_enc = encode_into_numbers(y_test.tolist(), label_to_idx)

number_pos = len(label_to_idx)
print(f"In the dataset there are {number_pos} distinct POS")

In the dataset there are 45 distinct POS


# Embedding matrix

In [None]:
def get_dashed_embeddings(embedding_model, word):
  if word.contains("-"):
    words_split = word.split("-").sort(key=len)  #getting the encoding of compound words starting from the longest one
    for word_piece in words_split:
      try:
        return embedding_model[word]  #if a word is found, assign its embedding to the matrix element
      except:
        pass  #if a word is not found, do nothing
  return None

def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int,
                           word_to_idx: Dict[str, int]) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: 

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((len(word_to_idx), embedding_dimension), dtype=np.float32)
    
    #adding all GloVe vocabularies embeddings
    for word, idx in tqdm(word_to_idx.items()):
      if word in embedding_model.keys():
          embedding_matrix[idx] = embedding_model[word]
      elif word.contains("-"): 
          dashed_embedding = get_dashed_embeddings(embedding_model, word)
          if dashed_embedding is None: #it means that word has no dash or all its subwords are oov
              embedding_matrix[idx] = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
 
    return embedding_matrix


#FUNCTION TO BE FINISHED
def extend_embedding_matrix(embedding_matrix: np.ndarray,
                            word_to_idx: Dict[str, int]) -> np.ndarray:

    oov_embedding_matrix = np.zeros((len(oov_terms), embedding_matrix.shape[1]), dtype=np.float32)
    for idx, oov in enumerate(oov_terms):
        embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_matrix.shape[1])
        oov_embedding_matrix[idx] = embedding_vector

    new_embedding_matrix = np.concatenate([embedding_matrix, oov_embedding_matrix])

    return new_embedding_matrix

 # The folllowing section is only for reference, as it has been completely rewritten

##OOV handling

In [21]:
#this function extends the current embedding matrix with the embeddings of the oov terms
def extend_embedding_matrix(embedding_matrix: np.ndarray,
                            oov_terms: List[str]) -> np.ndarray:

    oov_embedding_matrix = np.zeros((len(oov_terms), embedding_matrix.shape[1]), dtype=np.float32)
    for idx, oov in enumerate(oov_terms):
        embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_matrix.shape[1])
        oov_embedding_matrix[idx] = embedding_vector

    new_embedding_matrix = np.concatenate([embedding_matrix, oov_embedding_matrix])

    return new_embedding_matrix

### Check OOV1 (oov in the train set) and add OOV1 embeddings to the matrix

In [22]:
OOV1 = check_OOV_terms(set(glove_emb_model.vocab.keys()), word_listing_train)

NameError: ignored

In [None]:
oov_percentage = float(len(OOV1)) * 100 / len(word_listing_train)
print(f"Total OOV terms: {len(OOV1)} ({oov_percentage:.2f}%)")

#### Adding OOV1 embeddings to the matrix

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV1)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

### Check OOV2 (oov in the validation set) and add OOV2 embeddings to the matrix

In [None]:
OOV2 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)), word_listing_val)

In [None]:
oov2_percentage = float(len(OOV2)) * 100 / len(word_listing_val)
print(f"Total OOV terms: {len(OOV2)} ({oov2_percentage:.2f}%)")

####Adding OOV2 embeddings to the matrix

This cell is useless because OOV2 is empty

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV2)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

### Check OOV3 (oov in the test set) and add OOV3 embeddings to the matrix

In [None]:
OOV3 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)).union(set(OOV2)), word_listing_test)

In [None]:
oov3_percentage = float(len(OOV3)) * 100 / len(word_listing_test)
print(f"Total OOV terms: {len(OOV3)} ({oov3_percentage:.2f}%)")

####Adding OOV3 embeddings to the matrix
This cell is useless too, as no oov term is present in the test set

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV3)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

## Build embedding matrix

Starting from GloVe matrix

In [None]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: 

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((len(embedding_model.index2word), embedding_dimension), dtype=np.float32)
    
    #adding all GloVe vocabularies embeddings
    for idx, word in enumerate(embedding_model.index2word):
         embedding_vector = embedding_model[word]
         embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
#vocab_size = len(glove_emb_model.index2word) + len(OOV1)
embedding_matrix = build_embedding_matrix(glove_emb_model, EMBEDDING_DIMENSION)
print()
print(f"Embedding matrix shape: {embedding_matrix.shape}")