## Imports

In [None]:
import ast
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf

from gensim.models import KeyedVectors

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

# Locate the project root: keep every component of the current working
# directory up to and including 'MyHaSpeeDe-1' (ValueError if it is absent),
# then append <root>/code/ to the module search path.
dir_parts = os.getcwd().split(os.sep)
root_index = dir_parts.index('MyHaSpeeDe-1')
root_path = os.sep.join(dir_parts[:root_index + 1])
sys.path.append(f'{root_path}/code/')
from sentence_statistics import average_sentence_length, max_sentence_length, median_sentence_length, mode_sentence_length, plot_sentence_lengths_distribution
from word_embedding import get_key_index_mappings, get_key_index_mappings_ft, get_embedding_matrix, get_embedding_matrix_ft, sentence_to_embedding, data_to_embedding_ft, data_to_embedding

%load_ext autoreload
%autoreload 2

## Path
Loading the pre-processed dataset.

In [None]:
# Dataset root directories
fb_dir = f'{root_path}/data/facebook/'
tw_dir = f'{root_path}/data/twitter/'

# Sub-folder name inserted between the split folder (dev/, test/) and the CSV file
preprocessed_dir = 'preprocessed/'
# Folder holding the pre-trained embedding binaries
w2v_dir = f'{root_path}/data/word2vec/'

# Results folder for this experiment
results_dir = f'{root_path}/results/ConvolutionGRU/'

# Pre-processed CSVs (Facebook dataset)
fb_dev_preprocessed_path = f'{fb_dir}dev/{preprocessed_dir}fb_dev_preprocessed.csv'
fb_test_preprocessed_path = f'{fb_dir}test/{preprocessed_dir}fb_test_preprocessed.csv'

# Pre-processed CSVs (Twitter dataset)
tw_dev_preprocessed_path = f'{tw_dir}dev/{preprocessed_dir}tw_dev_preprocessed.csv'
tw_test_preprocessed_path = f'{tw_dir}test/{preprocessed_dir}tw_test_preprocessed.csv'

# Pre-trained embeddings + word lists
w2v_pretrained_path = f'{w2v_dir}twitter128.bin'  # Word2Vec
ft_pretrained_path = f'{w2v_dir}cc.it.300.bin'  # fastText
dictionary_path = f'{root_path}/data/italian_words.txt'  # vocabulary
bad_words_path = f'{root_path}/data/italian_bad_words.txt'  # bad words

## Data

In [None]:
# Remove the column-width limit so long text cells are displayed in full
pd.set_option("display.max_colwidth", None)

In [None]:
# Facebook dataset
# The 'tokens' and 'lemmas' columns are parsed from their string form by the
# converters below. The `with` blocks close each file handle as soon as its
# CSV has been read (previously the handles were left open).
# NOTE(review): `pd.eval` evaluates the raw cell text as an expression; only
# load trusted CSVs this way. `ast.literal_eval` may be a safer alternative —
# confirm it parses these columns identically before switching.
with open(fb_dev_preprocessed_path, encoding="utf-8") as fb_dev_inf:
    fb_dev = pd.read_csv(fb_dev_inf, sep=',', converters={'tokens': pd.eval, 'lemmas': pd.eval})

with open(fb_test_preprocessed_path, encoding="utf-8") as fb_test_inf:
    fb_test = pd.read_csv(fb_test_inf, sep=',', converters={'tokens': pd.eval, 'lemmas': pd.eval})

In [None]:
# Twitter dataset
# The 'tokens' and 'lemmas' columns are parsed from their string form by the
# converters below. The `with` blocks close each file handle as soon as its
# CSV has been read (previously the handles were left open).
# NOTE(review): `pd.eval` evaluates the raw cell text as an expression; only
# load trusted CSVs this way. `ast.literal_eval` may be a safer alternative —
# confirm it parses these columns identically before switching.
with open(tw_dev_preprocessed_path, encoding="utf-8") as tw_dev_inf:
    tw_dev = pd.read_csv(tw_dev_inf, sep=',', converters={'tokens': pd.eval, 'lemmas': pd.eval})

with open(tw_test_preprocessed_path, encoding="utf-8") as tw_test_inf:
    tw_test = pd.read_csv(tw_test_inf, sep=',', converters={'tokens': pd.eval, 'lemmas': pd.eval})

# Word Embeddings
We leverage a pre-trained W2V model of Italian Twitter embeddings from the Italian NLP Lab [[1]](http://www.italianlp.it/resources/italian-word-embeddings/).
Here we provide a tutorial on how to create an embedding matrix for our Twitter dataset, leveraging the w2v model.

*Note: the same w2v model will be used also for the Facebook dataset (i.e., transfer learning) with some fine-tuning, as a corresponding w2v for Facebook posts was not found.*

## Useful constants

In [None]:
# Token used to represent unknown (out-of-vocabulary) words
OOV_TOKEN = '<OOV>'

# Tokenized sentences of the Facebook dev split
samples = fb_dev['tokens']
n_samples = len(samples)

# Distinct tokens exactly as they appear in the data (case-sensitive)
unique_words = {token for sentence in samples for token in sentence}
n_unique_words = len(unique_words)

# Lower-cased version of every distinct token.
# NOTE(review): tokens that differ only by case produce the same entry more
# than once here — confirm whether duplicates are intended.
words = [token.lower() for token in unique_words]

In [None]:
# Report the number of samples and the number of unique words
print(f'#samples: {n_samples} - #unique words: {n_unique_words}')

## Sentence length analysis

In [None]:
# Sentence-length statistics over the tokenized Facebook dev sentences
fb_dev_tokens = fb_dev['tokens']

avg_len = average_sentence_length(fb_dev_tokens)
median_len = median_sentence_length(fb_dev_tokens)
mode_len = mode_sentence_length(fb_dev_tokens)
max_len = max_sentence_length(fb_dev_tokens)

In [None]:
# Print the average, maximum, median and mode sentence lengths
print(f'avg_len: {avg_len} - max_len: {max_len} - median_len: {median_len} - mode_len: {mode_len}')

In [None]:
# Plot the sentence-length distribution for the Facebook dev and test splits
# (plotting details live in sentence_statistics, not visible here)
plot_sentence_lengths_distribution(fb_dev, fb_test, dataset='Facebook')

## Word2Vec
Leveraging a pre-trained W2V model (http://www.italianlp.it/resources/italian-word-embeddings/).

In [None]:
# Load the pre-trained vectors from a file in binary word2vec format
w2v = KeyedVectors.load_word2vec_format(w2v_pretrained_path, binary=True)

In [None]:
# Shape of the loaded vector array — presumably (vocabulary size, embedding dimension)
w2v.vectors.shape

In [None]:
# Build the W2V vocabulary as a set once, so each membership test is O(1)
keys_set = set(w2v.index_to_key)

# Partition the lower-cased dataset words by presence in the W2V vocabulary
known_words_w2v = [token for token in words if token in keys_set]
unknown_words_w2v = [token for token in words if token not in keys_set]

In [None]:
# Report how many dataset words are covered / not covered by the W2V vocabulary
print(f'(W2V) #known words: {len(known_words_w2v)} - #unknown words: {len(unknown_words_w2v)}')

### Embedding matrix
We need to add a specific token for unknown words (i.e. *\<OOV\>*) in the pre-trained W2V model. Thus, we create our own index-to-key and key-to-index mappings.

In [None]:
# Build custom key<->index mappings for the W2V model together with OOV_TOKEN,
# then the embedding matrix and vocabulary size derived from them.
# NOTE(review): where the OOV entry is placed and how its vector is initialised
# is decided inside word_embedding (not visible here) — verify there.
key_to_idx, idx_to_key = get_key_index_mappings(w2v, OOV_TOKEN)
embedding_matrix, vocab_size = get_embedding_matrix(w2v, idx_to_key, OOV_TOKEN)

### Dataset to embedding
We use truncating and padding to normalize sentences of different lengths to a uniform max sentence length.

Common approaches include:
- Pre-fixed value
- Average sentence length
- Maximum sentence length

As there is a huge disparity between the average and maximum length in our case, a good approach to avoid too much padding, and consequently additional computational effort, could be to take the average.
However, since our dataset is not excessively large, we decided to consider the maximum length, so that no information is lost.

In [None]:
# Convert the tokenized Facebook dev sentences with the W2V matrix and mappings,
# using max_len as the truncation length and padding enabled.
# NOTE(review): exact output layout is defined in word_embedding.data_to_embedding — confirm there.
X_fb_dev = data_to_embedding(fb_dev['tokens'], embedding_matrix, key_to_idx, truncation=max_len, padding=True)

In [None]:
# Inspect the shape of the converted dev set
X_fb_dev.shape

In [None]:
# Display the converted dev set
X_fb_dev

## FastText

Leveraging a pre-trained FastText model (https://fasttext.cc/docs/en/crawl-vectors.html).

In [None]:
# `fasttext` is not imported in the notebook's import cell, so without this
# import the call below raises NameError.
import fasttext

# Load the pre-trained fastText binary model
ft = fasttext.load_model(ft_pretrained_path)

In [None]:
# Print (number of words in the fastText model, vector dimension)
print(f'({len(ft.get_words())}, {ft.get_dimension()})')

In [None]:
# Build the fastText vocabulary as a set once, so each membership test is O(1)
keys_set = set(ft.get_words())

# Partition the lower-cased dataset words by presence in the fastText vocabulary
known_words_ft = [token for token in words if token in keys_set]
unknown_words_ft = [token for token in words if token not in keys_set]

In [None]:
# Report how many dataset words are covered / not covered by the fastText vocabulary
print(f'(FastText) #known words: {len(known_words_ft)} - #unknown words: {len(unknown_words_ft)}')

### Embedding Matrix
FastText can handle out-of-vocabulary words by representing them as the sum of the vectors of their character n-grams (sub-units). Thus, there is no need to manually insert a dedicated OOV token.

In [None]:
# Build key<->index mappings for the fastText model (no OOV token is passed),
# then the embedding matrix and vocabulary size derived from them.
# NOTE(review): matrix layout is decided inside word_embedding (not visible here) — verify there.
key_to_idx_ft, idx_to_key_ft = get_key_index_mappings_ft(ft)
embedding_matrix_ft, vocab_size_ft = get_embedding_matrix_ft(ft, idx_to_key_ft)

### Dataset to embedding
As before, truncating and padding are applied to normalize sentences of different lengths to a uniform max sentence length.

In [None]:
# Convert the tokenized Facebook dev sentences with the fastText matrix and mappings,
# using max_len as the truncation length and padding enabled.
# NOTE(review): exact output layout is defined in word_embedding.data_to_embedding_ft — confirm there.
X_fb_dev_ft = data_to_embedding_ft(fb_dev['tokens'], embedding_matrix_ft, key_to_idx_ft, truncation=max_len, padding=True)

In [None]:
# Inspect the shape of the fastText-converted dev set
X_fb_dev_ft.shape

In [None]:
# Display the fastText-converted dev set
X_fb_dev_ft