In [None]:
import os, shutil  #  file management
import sys 
import pandas as pd  #  dataframe management
import numpy as np  #  data manipulation
# useful during debugging (progress bars)
from tqdm import tqdm

from typing import Callable, List, Dict, Tuple

In [None]:
# Resolve the working directory once so that the printed path and the dataset
# folder are guaranteed to be derived from the same location.
folder = os.getcwd()

print("Current working directory: " + str(folder))

dataset_folder = os.path.join(folder, "Datasets")

# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of `if not exists: makedirs`.
os.makedirs(dataset_folder, exist_ok=True)


Current working directory: /content


Dataset files extraction

In [None]:
import urllib.request  #  download files
import zipfile  #  unzip files

# File name used for the downloaded archive inside the dataset folder.
DATASET_NAME = "dataset.zip"
# Folder (relative to the working directory) holding the extracted corpus files.
DATASET_REL_PATH = "Datasets/Original/dependency_treebank"
# Document-id thresholds consumed by encode_dataset: ids <= 100 go to train,
# ids in (100, 150] to validation, everything else to test.
SPLIT_DISTRIBUTION = [100, 150, 199]

dataset_folder = os.path.join(os.getcwd(), "Datasets", "Original")

# exist_ok=True keeps the cell idempotent on re-runs.
os.makedirs(dataset_folder, exist_ok=True)

url = 'https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/dependency_treebank.zip'

dataset_path = os.path.join(dataset_folder, DATASET_NAME)

if not os.path.exists(dataset_path):
    # Download to a temporary name and rename on success: an interrupted
    # download must not leave a truncated archive that later runs would
    # treat as already downloaded.
    partial_path = dataset_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, dataset_path)
    print("Successful download")

with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(dataset_folder)
    print("Successful extraction")

Successful extraction


Dataframe construction

In [None]:
def _read_token_pos_pairs(file_path: str) -> List[Tuple[str, str]]:
    """Return the (token, pos) pair found on every non-blank line of ``file_path``."""
    pairs = []
    with open(file_path, mode='r', encoding='utf-8') as text_file:
        for line in text_file:
            # whitespace-only lines carry no token
            if not line.strip():
                continue
            # drop the line terminator first so that it can never end up
            # inside the last field
            fields = line.rstrip("\n").split("\t")
            pairs.append((fields[0], fields[1]))
    return pairs


def encode_dataset(dataset_relative_path: str, dist: List[int]) -> List[pd.DataFrame]:
    """
    Reads every file of a dataset folder and encodes it as train / validation / test dataframes.

    Each non-blank line is expected to be tab-separated, with the token in the
    first field and the POS tag in the second one. The document identifier is the
    part of the file name between the first "_" and the following ".".

    :param dataset_relative_path: dataset folder, relative to the current working directory
    :param dist: split thresholds on the numeric document identifier:
        id <= dist[0] goes to train, dist[0] < id <= dist[1] goes to validation,
        anything else goes to test (further entries of ``dist`` are not read)

    :return
        - [df_train, df_val, df_test], each with the columns "document", "token" and "pos"
          (present even when a split is empty)

    :raises SystemExit: (status 1) after printing the reason, when a file cannot be processed
    """
    columns = ["document", "token", "pos"]
    rows_train, rows_val, rows_test = [], [], []

    folder = os.path.join(os.getcwd(), dataset_relative_path)
    # os.listdir returns entries in arbitrary order: sort them so that the row
    # order (and everything derived from it) is reproducible across runs
    for filename in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, filename)
        try:
            if not os.path.isfile(file_path):
                continue

            pairs = _read_token_pos_pairs(file_path)
            if not pairs:
                continue

            # the document id depends on the file only: compute it once per file
            document = filename.split("_")[1].split(".")[0]
            document_id = int(document)

            if document_id <= dist[0]:
                target_rows = rows_train
            elif document_id <= dist[1]:
                target_rows = rows_val
            else:
                target_rows = rows_test

            target_rows.extend(
                {"document": document, "token": token, "pos": pos}
                for token, pos in pairs
            )
        except Exception as e:
            print('Failed to process %s. Reason: %s' % (file_path, e))
            # a failure must not be reported as a successful exit
            sys.exit(1)

    # transform the lists of rows in proper dataframes; explicit columns keep the
    # schema stable for empty splits
    return [pd.DataFrame(rows, columns=columns)
            for rows in (rows_train, rows_val, rows_test)]

In [None]:
# Encode the corpus into train / validation / test frames, split by document id.
df_train, df_val, df_test = encode_dataset(DATASET_REL_PATH, SPLIT_DISTRIBUTION)

In [None]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,document,token,pos
0,55,Structural,NNP
1,55,Dynamics,NNP
2,55,Research,NNP
3,55,Corp.,NNP
4,55,",",","


In [None]:
# Preview the first rows of the validation split.
df_val.head()

Unnamed: 0,document,token,pos
0,107,The,DT
1,107,House,NNP
2,107,passed,VBD
3,107,legislation,NN
4,107,designed,VBN


In [None]:
# Preview the first rows of the test split.
# NOTE(review): the saved output below lists a `count` column, but encode_dataset
# only emits `document`, `token` and `pos` — the output looks stale; re-run the
# notebook from a fresh kernel to confirm.
df_test.head()

Unnamed: 0,document,token,pos,count
0,153,Rockwell,NNP,3
1,153,International,NNP,3
2,153,Corp.,NNP,4
3,153,reported,VBD,0
4,153,flat,JJ,7


Loading embedding

In [None]:
import gensim
import gensim.downloader as gloader

def load_GloVe_embedding(embedding_dimension: int = 50) -> gensim.models.keyedvectors.KeyedVectors:
    """
    Fetches a pre-trained GloVe word embedding model through the gensim downloader.

    :param embedding_dimension: dimensionality of the embedding space, used to
        select the "glove-wiki-gigaword-<dimension>" model

    :return
        - the model returned by the gensim downloader for that name
    """
    model_name = f"glove-wiki-gigaword-{embedding_dimension}"
    return gloader.load(model_name)

In [None]:
# Load the 50-dimensional GloVe model through the gensim downloader.
glove_emb_model = load_GloVe_embedding(50)



Creating Vocabulary

In [None]:
from collections import OrderedDict

def build_vocabulary(df: pd.DataFrame) -> Tuple[Dict[int, str],
                                           Dict[str, int],
                                           List[str]]:
    """
    Builds the word vocabulary of a dataset from its ``token`` column.

    Every value of the column is split on whitespace; each distinct piece gets
    the next free index, in order of first appearance.

    :param df: dataset from which we want to build the word vocabulary (pandas.DataFrame)
    :return:
      - word vocabulary: vocabulary index to word
      - inverse word vocabulary: word to vocabulary index
      - word listing: list of the unique terms, ordered by index
    """
    index_of = OrderedDict()
    word_at = OrderedDict()

    for cell in tqdm(df.token.values):
        for piece in cell.split():
            if piece in index_of:
                continue
            # the next free index is always the current vocabulary size
            position = len(index_of)
            index_of[piece] = position
            word_at[position] = piece

    return word_at, index_of, list(word_at.values())

In [None]:
# Build the vocabulary of the training split and print a few sanity checks.
idx_to_word_train, word_to_idx_train, word_listing_train = build_vocabulary(df_train)
print()
print(f'[Debug] Index -> Word vocabulary size: {len(idx_to_word_train)}')
print(f'[Debug] Word -> Index vocabulary size: {len(word_to_idx_train)}')
print(f'[Debug] Word_listing size: {len(word_listing_train)}')
# Plain Python ints keep the printed tuples as ('word', 1) on every numpy version
# (numpy >= 2 renders np.int64 scalars as "np.int64(1)"), and capping the range
# avoids a KeyError when the vocabulary holds fewer than 11 words.
# The sample still covers indices 1..10, i.e. index 0 is skipped as before.
print(f'[Debug] Some words: {[(idx_to_word_train[idx], idx) for idx in range(1, min(11, len(idx_to_word_train)))]}')

100%|██████████| 47356/47356 [00:00<00:00, 1011315.82it/s]


[Debug] Index -> Word vocabulary size: 8009
[Debug] Word -> Index vocabulary size: 8009
[Debug] Word_listing size: 8009
[Debug] Some words: [('Dynamics', 1), ('Research', 2), ('Corp.', 3), (',', 4), ('which', 5), ('makes', 6), ('computer-aided', 7), ('engineering', 8), ('software', 9), ('said', 10)]





Check OOV for train set

In [None]:
def check_OOV_terms(embedding_model: "gensim.models.keyedvectors.KeyedVectors",
                    word_listing: List[str]) -> List[str]:
    """
    Checks differences between pre-trained embedding model vocabulary
    and dataset specific vocabulary in order to highlight out-of-vocabulary terms.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param word_listing: dataset specific vocabulary (list)

    :return
        - list of the distinct OOV terms, in the order in which they first
          appear in ``word_listing``
    """
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases as `vocab`
    embedding_vocabulary = getattr(embedding_model, "key_to_index", None)
    if embedding_vocabulary is None:
        embedding_vocabulary = embedding_model.vocab

    # Use the `word_listing` argument (not a notebook global) so the function
    # works for any split. dict.fromkeys de-duplicates while keeping first-seen
    # order, which makes the result deterministic across runs.
    return [word for word in dict.fromkeys(word_listing)
            if word not in embedding_vocabulary]

In [None]:
# Out-of-vocabulary terms of the training vocabulary with respect to the GloVe model.
OOV1 = check_OOV_terms(glove_emb_model, word_listing_train)

In [None]:
# Share (in percent) of the training vocabulary that is out-of-vocabulary.
oov_percentage = float(len(OOV1)) * 100 / len(word_listing_train)
print(len(OOV1))
print(f"Total OOV terms: {len(OOV1)} ({oov_percentage:.2f}%)")

2346
Total OOV terms: 2346 (29.29%)


Random Embedding to handle OOV

In [None]:
def build_embedding_matrix(embedding_model: "gensim.models.keyedvectors.KeyedVectors",
                           embedding_dimension: int,
                           vocab_size: int,
                           oov_terms: List[str]) -> np.ndarray:
    """
    Builds an embedding matrix holding every pre-trained vector followed by one
    random vector per OOV term.

    Row layout: row i (i < number of pre-trained words) is the vector of the i-th
    word of the embedding model; the j-th OOV term is stored at row
    (number of pre-trained words + j); any remaining row is left at zero.
    Note that these row indices follow the embedding model order, not the
    indices produced by build_vocabulary.

    OOV vectors are drawn uniformly from [-0.05, 0.05) using numpy's global random
    state: call np.random.seed beforehand to get reproducible values.

    :param embedding_model: pre-trained word embedding model (gensim wrapper)
    :param embedding_dimension: size of each embedding vector
    :param vocab_size: number of rows of the matrix (pre-trained words + OOV terms)
    :param oov_terms: list of OOV terms (list)

    :return
        - embedding matrix of shape (vocab_size, embedding_dimension), dtype float32

    :raises ValueError: if ``vocab_size`` cannot hold all pre-trained and OOV rows
    """
    # gensim >= 4 renamed `index2word` to `index_to_key`; support both
    pretrained_words = getattr(embedding_model, "index_to_key", None)
    if pretrained_words is None:
        pretrained_words = embedding_model.index2word

    n_pretrained = len(pretrained_words)
    n_oov = len(oov_terms)
    # fail before allocating and filling a large matrix
    if vocab_size < n_pretrained + n_oov:
        raise ValueError(
            f"vocab_size={vocab_size} is too small for {n_pretrained} "
            f"pre-trained words and {n_oov} OOV terms"
        )

    embedding_matrix = np.zeros((vocab_size, embedding_dimension), dtype=np.float32)

    # adding all the pre-trained embeddings
    for idx, word in enumerate(pretrained_words):
        embedding_matrix[idx] = embedding_model[word]

    # one random vector per OOV term; a single (n_oov, d) draw consumes the random
    # stream in the same order as n_oov successive draws of size d
    if n_oov:
        embedding_matrix[n_pretrained:n_pretrained + n_oov] = np.random.uniform(
            low=-0.05, high=0.05, size=(n_oov, embedding_dimension))

    return embedding_matrix

    

In [None]:
# gensim >= 4 renamed `index2word` to `index_to_key`; support both.
pretrained_words = getattr(glove_emb_model, "index_to_key", None)
if pretrained_words is None:
    pretrained_words = glove_emb_model.index2word

# One row per pre-trained word plus one row per OOV term of the training set.
vocab_size = len(pretrained_words) + len(OOV1)
# Read the dimension from the loaded model instead of repeating the literal
# passed to load_GloVe_embedding, so the two can never disagree.
embedding_dimension = glove_emb_model.vector_size
embedding_matrix = build_embedding_matrix(glove_emb_model, embedding_dimension, vocab_size, OOV1)
print()
print(f"Embedding matrix shape: {embedding_matrix.shape}")


Embedding matrix shape: (402346, 50)
