In [41]:
import os, shutil  #  file management
import sys 
import pandas as pd  #  dataframe management
import numpy as np  #  data manipulation
# useful during debugging (progress bars)
from tqdm import tqdm
import re

from typing import Callable, List, Dict, Tuple, Set

#from tensorflow.random import set_seed


#fixed seeds to get reproducible results
np.random.seed(42)
#set_seed(42)

#Bulding the dataframe

## Dataset download and extraction

In [12]:
import urllib.request  #  download files
import zipfile  #  unzip files

DATASET_NAME = "dataset.zip"
DATASET_FOLDERNAME = "Dataset"
DATASET_SUBFOLDER = "dependency_treebank/"
SPLIT_DISTRIBUTION = [100, 150, 199]

working_folder = os.getcwd()

print("Current working directory: " + str(working_folder))

dataset_folder = os.path.join(os.getcwd(), DATASET_FOLDERNAME)

if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder)

url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

dataset_path = os.path.join(dataset_folder, "dataset.zip")

if not os.path.exists(dataset_path):
    urllib.request.urlretrieve(url, dataset_path)
    print("Successful download")

with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_folder)
    print("Successful extraction")

#update folder to the extracted one
dataset_folder = os.path.join(dataset_folder, DATASET_SUBFOLDER)

Current working directory: /content
Successful extraction


In [10]:
print(dataset_folder)

/content/Dataset/dependency_treebank


## Dataframe construction

In [23]:
def encode_dataset(dataset_folder: str, 
                   split_dist: list(), ) -> Dict[str,pd.DataFrame]:
    
    df_dict = {"train": pd.DataFrame(columns=['sentence', 'labels']),
                "val": pd.DataFrame(columns=['sentence', 'labels']),
                "test":pd.DataFrame(columns=['sentence', 'labels'])}
    split = ""

    for filename in os.listdir(dataset_folder):
        file_path = os.path.join(dataset_folder, filename)
        try:
            if os.path.isfile(file_path):
                with open(file_path, mode='r', encoding='utf-8') as text_file:
                    # read it and extract 
                    document_number = filename.split("_")[1].split(".")[0]
                    if int(document_number) <= split_dist[0]:
                        split = "train"
                    elif split_dist[0] < int(document_number) <= split_dist[1]:
                        split = "val"
                    else:
                        split = "test"

                    df_file = pd.read_table(
                        file_path, 
                        delimiter='\t', 
                        names=['word', 'label'], 
                        usecols=[0,1],
                        skip_blank_lines=False)
                    
                    #splitting file content in sentences
                    idx = list(df_file.loc[df_file.isnull()['word']].index)
                    idx.append(len(df_file))
                    prev = 0
                    for sep in idx:
                        df_sentence = pd.DataFrame({
                            'sentence': [df_file['word'][prev:sep].to_list()], 
                            'labels': [df_file['label'][prev:sep].to_list()]})
                        df_dict[split] = pd.concat([df_dict[split], df_sentence], ignore_index=True)
                        prev = sep + 1
                    
        except Exception as e:
            print('Failed to process %s. Reason: %s' % (file_path, e))
            sys.exit(0)

    return df_dict

In [24]:
df_dict = encode_dataset(dataset_folder, SPLIT_DISTRIBUTION)

In [25]:
df_dict["train"].head()

Unnamed: 0,sentence,labels
0,"[James, L., Pate, ,, 54-year-old, executive, v...","[NNP, NNP, NNP, ,, JJ, NN, NN, NN, ,, VBD, VBN..."
1,"[The, Transportation, Department, ,, respondin...","[DT, NNP, NNP, ,, VBG, TO, NN, IN, NN, NNS, ,,..."
2,"[The, department, proposed, requiring, stronge...","[DT, NN, VBD, VBG, JJR, NNS, IN, JJ, NNS, CC, ..."
3,"[It, also, issued, a, final, rule, requiring, ...","[PRP, RB, VBD, DT, JJ, NN, VBG, NN, NNS, TO, V..."
4,"[Such, belts, already, are, required, for, the...","[JJ, NNS, RB, VBP, VBN, IN, DT, NNS, POS, JJ, ..."


In [26]:
df_dict["val"].head()

Unnamed: 0,sentence,labels
0,"[Wall, Street, 's, big, securities, firms, fac...","[NNP, NNP, POS, JJ, NNS, NNS, VBP, DT, NN, IN,..."
1,"[The, reason, :, Risks, from, the, firms, ', n...","[DT, NN, :, NNS, IN, DT, NNS, POS, JJ, ``, NN,..."
2,"[The, downgrading, of, debt, issued, by, CS, F...","[DT, NN, IN, NN, VBN, IN, NNP, NNP, NNP, NNP, ..."
3,"[With, the, shudders, came, the, realization, ...","[IN, DT, NNS, VBD, DT, NN, IN, DT, IN, NNP, NN..."
4,"[Securities, firms, are, among, the, biggest, ...","[NNS, NNS, VBP, IN, DT, JJS, NNS, IN, JJ, NN, ..."


In [27]:
df_dict["test"].head()

Unnamed: 0,sentence,labels
0,"[Dow, Jones, &, Co., said, it, extended, its, ...","[NNP, NNP, CC, NNP, VBD, PRP, VBD, PRP$, $, JJ..."
1,"[The, offer, ,, valued, at, about, $, 576, mil...","[DT, NN, ,, VBN, IN, IN, $, CD, CD, IN, DT, CD..."
2,"[Dow, Jones, ,, which, owns, about, 64, millio...","[NNP, NNP, ,, WDT, VBZ, IN, CD, CD, IN, NNP, P..."
3,"[Telerate, 's, two, independent, directors, ha...","[NNP, POS, CD, JJ, NNS, VBP, VBN, DT, NN, IN, ..."
4,"[In, composite, trading, on, the, New, York, S...","[IN, JJ, NN, IN, DT, NNP, NNP, NNP, NNP, ,, NN..."


In [92]:
XY_df_dict = {"train": {"sentence": df_dict['train']['sentence'],
                        "labels": df_dict['train']['labels']},
              "val": {"sentence": df_dict['val']['sentence'],
                        "labels": df_dict['val']['labels']},
              "test": {"sentence": df_dict['test']['sentence'],
                        "labels": df_dict['test']['labels']}}

# Embedding and vocabulary

## Load Glove embedding

In [28]:
import gensim
import gensim.downloader as gloader

def load_GloVe_embedding(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Loads a pre-trained word embedding model via gensim library.

    :param embedding_dimension: size of the embedding space to consider

    :return
        - pre-trained word embedding model (gensim KeyedVectors object)
    """
    download_path = "glove-wiki-gigaword-{}".format(embedding_dimension)
    try:
        emb_model = gloader.load(download_path)
    except ValueError as e:
        print("Invalid embedding model name! Check the embedding dimension:")
        print("Glove: 50, 100, 200, 300")
        raise e

    return emb_model

In [29]:
EMBEDDING_DIMENSION = 50
glove_emb_model = load_GloVe_embedding(EMBEDDING_DIMENSION)



##Creating Vocabulary

In [96]:
from collections import OrderedDict

def build_vocabulary(sr: pd.Series) -> List[str]:
    """
    Given a dataset, builds the corresponding word vocabulary.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - vocabulary: set of unique terms that build up the vocabulary
    """
    vocabulary = []
    for sentence in tqdm(sr):
        for token in sentence:
            if token not in vocabulary:
                vocabulary.append(token)

    return vocabulary

In [97]:
vocabulary_dict = {}
for split in df_dict.keys():
    vocabulary_dict[split] = build_vocabulary(df_dict[split]["sentence"])
    print()
    print(f'[Debug] {split} vocabulary size: {len(vocabulary_dict[split])}')

100%|██████████| 1963/1963 [00:01<00:00, 1536.45it/s]



[Debug] train vocabulary size: 8009


100%|██████████| 1299/1299 [00:00<00:00, 2037.07it/s]



[Debug] val vocabulary size: 5892


100%|██████████| 652/652 [00:00<00:00, 2866.74it/s]


[Debug] test vocabulary size: 3623





## OOV detection

In [35]:
def check_OOV_terms(vocabulary: List[str],
                    word_listing: List[str]):
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of OOV terms
    """
    embedding_vocabulary = set(vocabulary)
    oov = set(word_listing).difference(embedding_vocabulary)
    return list(oov)

In [37]:
OOV1 = check_OOV_terms(glove_emb_model.vocab.keys(), vocabulary_dict["train"])
OOV1_percentage = float(len(OOV1)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1)} ({OOV1_percentage:.2f}%)")

Total OOV terms: 2346 (29.29%)


A lot of words are OOV simply because they start with capital letter, so we will lower all the words and check again the OOV. Before this section we can insert a graph showing the OOV words

In [100]:
OOV1_lowercase = check_OOV_terms(glove_emb_model.vocab.keys(), [v.lower() for v in vocabulary_dict["train"]])
OOV1_lowercase_percentage = float(len(OOV1_lowercase)) * 100 / len(vocabulary_dict["train"])
print(f"Total OOV terms: {len(OOV1_lowercase)} ({OOV1_lowercase_percentage:.2f}%)")

Total OOV terms: 359 (4.48%)


In [50]:
print("\n".join(OOV1_lowercase))

deposits-a
-lrb-
subindustry
co-developers
5.276
superpremiums
equal-opportunity
foreign-led
mehrens
student-test
marketing-communications
143.08
asset-sale
trading-company
1\/2
ariail
secilia
training-wage
143.80
wheeland
subminimum
test-prep
wine-buying
rubinfien
c.j.b.
low-ball
nih-appointed
subskill
ft-se
twindam
durable-goods
sacramento-based
collective-bargaining
teacher-cadet
16.125
school-improvement
prize-fighter
230-215
weisfield
anti-takeover
hallwood
69-point
big-ticket
vitulli
drag-down
securities-based
satrum
test-practice
520-lawyer
127.03
1\/10th
1\/4
2645.90
search-and-seizure
corton-charlemagne
chafic
school-research
sanderoff
centerbank
sometimes-tawdry
rope-sight
449.04
capital-gains
dollar-yen
retin-a
one-yen
62%-owned
macmillan\/mcgraw-hill
-rcb-
high-rate
car-safety
3.253
sell-offs
college-bowl
more-efficient
nissho-iwai
446.62
cray-3
auto-safety
84-month
forest-products
achievement-test
37-a-share
highest-pitched
replacement-car
4.898
dust-up
norwick
we-japanese

## Preprocessing

Here we replace bracket value with their symbols: 
```
  -lrb- and -lcb-   -->  ( 
  -rrb- and -rcb-   -->  )
```

In addition, all the rational numbers will be replaced with the placeholder #number#, as long as the floating point numbers.
Note that rational numbers, instead of being like 3/4, are written as 3\/4. The cause is that symbol "/" is represented using "\/", as this happens also in other words that are notrational ones






In [90]:
def preprocessing(content_list: List[str]) -> List[str]:
    placeholder = "#number#"
    re_slashes = re.compile('\\\/')  #pattern \/ 
    re_rational = re.compile('\d+\/\d+')  #pattern rational number (e.g. 1/5)
    re_number = re.compile('[+-]?(\d*[.])?\d+')  #pattern decimal number (e.g. 3.14)
    re_left_bracket = re.compile('(-lrb-)|(-lcb-)')  #pattern left bracket
    re_right_bracket = re.compile('(-rrb-)|(-rcb-)')  #pattern right bracket
    re_slashed_words = re.compile("(\w*)\/(\w*)")  #a slash separating words will be replaced with a dash, following the trend of the dataset, where composed words are in the form word-word

    content_list_preprocessed = [content.lower() for content in content_list]
    content_list_preprocessed = [re_slashes.sub("/", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_left_bracket.sub("(", content) for content in content_list_preprocessed]
    content_list_preprocessed = [re_right_bracket.sub(")", content) for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_rational, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [placeholder if re.match(re_number, content) else content for content in content_list_preprocessed]
    content_list_preprocessed = [content.replace("/", "-") if re.match(re_slashed_words, content) else content for content in content_list_preprocessed]

    return content_list_preprocessed


## Warning: the following cells seem to not work
Preprocessing the dataset

In [93]:
XY_df_preprocessed_dict = {}
for key in XY_df_dict.keys():  #preprocessing on training, valiation and test sets
   XY_df_preprocessed_dict[key] = {"sentence": XY_df_dict[key]["sentence"].apply(preprocessing),  #applies preprocessing to the sentences
                                    "labels": XY_df_dict[key]["labels"]}  #labels are untouched

Building the new vocabulary after preprocessing

In [99]:
train_vocabulary_preprocessed = build_vocabulary(XY_df_preprocessed_dict["train"]["sentence"])
print()
print(f'[Debug] train vocabulary size after preprocessing: {len(train_vocabulary_preprocessed)}')

OOV1_preprocessed = check_OOV_terms(glove_emb_model.vocab.keys(), train_vocabulary_preprocessed)
OOV1_preprocessed_percentage = float(len(OOV1_preprocessed)) * 100 / len(train_vocabulary_preprocessed)
print(f"Total OOV terms: {len(OOV1_preprocessed)} ({OOV1_preprocessed_percentage:.2f}%)")

100%|██████████| 1963/1963 [00:01<00:00, 1648.36it/s]



[Debug] train vocabulary size after preprocessing: 7526
Total OOV terms: 2282 (30.32%)


 # The folllowing section has to be checked

##OOV handling

In [None]:
#this function extends the current embedding matrix with the embeddings of the oov terms
def extend_embedding_matrix(embedding_matrix: np.ndarray,
                            oov_terms: List[str]) -> np.ndarray:

    oov_embedding_matrix = np.zeros((len(oov_terms), embedding_matrix.shape[1]), dtype=np.float32)
    for idx, oov in enumerate(oov_terms):
        embedding_vector = np.random.uniform(low=-0.05, high=0.05, size=embedding_matrix.shape[1])
        oov_embedding_matrix[idx] = embedding_vector

    new_embedding_matrix = np.concatenate([embedding_matrix, oov_embedding_matrix])

    return new_embedding_matrix

### Check OOV1 (oov in the train set) and add OOV1 embeddings to the matrix

In [None]:
OOV1 = check_OOV_terms(set(glove_emb_model.vocab.keys()), word_listing_train)

In [None]:
oov_percentage = float(len(OOV1)) * 100 / len(word_listing_train)
print(f"Total OOV terms: {len(OOV1)} ({oov_percentage:.2f}%)")

Total OOV terms: 2346 (29.29%)


#### Adding OOV1 embeddings to the matrix

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV1)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (402346, 50)


### Check OOV2 (oov in the validation set) and add OOV2 embeddings to the matrix

In [None]:
OOV2 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)), word_listing_val)

In [None]:
oov2_percentage = float(len(OOV2)) * 100 / len(word_listing_val)
print(f"Total OOV terms: {len(OOV2)} ({oov2_percentage:.2f}%)")

Total OOV terms: 0 (0.00%)


####Adding OOV2 embeddings to the matrix

This cell is useless because OOV2 is empty

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV2)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (402346, 50)


### Check OOV3 (oov in the test set) and add OOV3 embeddings to the matrix

In [None]:
OOV3 = check_OOV_terms(set(glove_emb_model.vocab.keys()).union(set(OOV1)).union(set(OOV2)), word_listing_test)

In [None]:
oov3_percentage = float(len(OOV3)) * 100 / len(word_listing_test)
print(f"Total OOV terms: {len(OOV3)} ({oov3_percentage:.2f}%)")

Total OOV terms: 0 (0.00%)


####Adding OOV3 embeddings to the matrix
This cell is useless too, as no oov term is present in the test set

In [None]:
embedding_matrix = extend_embedding_matrix(embedding_matrix, OOV3)
print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (402346, 50)


## Build embedding matrix

Starting from GloVe matrix

In [None]:
def build_embedding_matrix(embedding_model: gensim.models.keyedvectors.KeyedVectors,
                           embedding_dimension: int) -> np.ndarray:
    """
    Builds the embedding matrix of a specific dataset given a pre-trained word embedding model

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: 

    :return
        - embedding matrix that assigns a high dimensional vector to each word in the dataset specific vocabulary (shape |V| x d)
    """
    embedding_matrix = np.zeros((len(embedding_model.index2word), embedding_dimension), dtype=np.float32)
    
    #adding all GloVe vocabularies embeddings
    for idx, word in enumerate(embedding_model.index2word):
         embedding_vector = embedding_model[word]
         embedding_matrix[idx] = embedding_vector

    return embedding_matrix

In [None]:
#vocab_size = len(glove_emb_model.index2word) + len(OOV1)
embedding_matrix = build_embedding_matrix(glove_emb_model, EMBEDDING_DIMENSION)
print()
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (400000, 50)
