## Imports

In [None]:
import ast
import json
import math
import os
import sys
from enum import Enum

import numpy as np
import pandas as pd
import tensorflow as tf

from gensim.models import KeyedVectors

from keras.layers import LeakyReLU, Reshape, GlobalMaxPooling1D, Input, concatenate, Embedding, Flatten, Dropout, Dense, Conv1D, Activation, BatchNormalization
from keras.losses import BinaryCrossentropy
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1_l2

from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

dir_parts = os.getcwd().split(os.path.sep)
root_index = dir_parts.index('MyHaSpeeDe-1')
root_path = os.path.sep.join(dir_parts[:root_index + 1])
sys.path.append(root_path + '/code/')
from dcnn.layers import SemiDynamicKMaxPooling, Folding
from hyperparameter_tuning import bayesian_optimization, random_search
from training.metrics import avg_f1
from training.solver import Solver
from sentence_statistics import max_sentence_length, average_sentence_length
from word_embedding import get_key_index_mappings, get_embedding_matrix, get_key_index_pos_mappings, get_pos_matrix, sentence_to_embedding, data_to_embedding, pos_to_embedding

%load_ext autoreload
%autoreload 2

## Path

In [None]:
# Base directories, all anchored at the repository root (each ends with a slash)
fb_dir = f'{root_path}/data/facebook/'
tw_dir = f'{root_path}/data/twitter/'
preprocessed_dir = 'preprocessed/'
w2v_dir = f'{root_path}/data/word2vec/'
results_dir = f'{root_path}/results/DCNN/'

# Preprocessed CSVs of the Facebook dataset
fb_dev_preprocessed_path = f'{fb_dir}dev/{preprocessed_dir}fb_dev_preprocessed.csv'
fb_test_preprocessed_path = f'{fb_dir}test/{preprocessed_dir}fb_test_preprocessed.csv'

# Preprocessed CSVs of the Twitter dataset
tw_dev_preprocessed_path = f'{tw_dir}dev/{preprocessed_dir}tw_dev_preprocessed.csv'
tw_test_preprocessed_path = f'{tw_dir}test/{preprocessed_dir}tw_test_preprocessed.csv'

# Pre-trained word vectors and word lists
w2v_pretrained_path = f'{w2v_dir}twitter128.bin'  # w2v
dictionary_path = f'{root_path}/data/italian_words.txt'  # vocabulary
bad_words_path = f'{root_path}/data/italian_bad_words.txt'  # bad words

## Task selection
The model will be evaluated and fine-tuned w.r.t. the three HaSpeeDe-1 tasks:
- **Task 1 (HaSpeeDe-FB)**: only the FB dataset can be used to classify the FB test set;
- **Task 2 (HaSpeeDe-TW)**: only the TW dataset can be used to classify the TW test set;
- **Task 3 (Cross-HaSpeeDe)**: only the FB dataset can be used to classify the TW test set and vice versa (i.e. Cross-HaSpeeDe-FB and Cross-HaSpeeDe-TW respectively).

In [None]:
class Task(Enum):
    """Evaluation settings of HaSpeeDe-1.

    Every member's value is a ``(task_name, dev_path, test_path)`` tuple; ``Enum``
    unpacks that tuple into ``__init__``, which exposes the three fields as
    attributes of the member.
    """

    HASPEEDE_FB = (
        'haspeede-fb', fb_dev_preprocessed_path, fb_test_preprocessed_path,
    )
    HASPEEDE_TW = (
        'haspeede-tw', tw_dev_preprocessed_path, tw_test_preprocessed_path,
    )
    # Cross tasks: train on one platform's dev set, test on the other platform's test set
    CROSS_HASPEEDE_FB = (
        'cross-haspeede-fb', fb_dev_preprocessed_path, tw_test_preprocessed_path,
    )
    CROSS_HASPEEDE_TW = (
        'cross-haspeede-tw', tw_dev_preprocessed_path, fb_test_preprocessed_path,
    )

    def __init__(self, task_name, dev_path, test_path):
        """Store the task identifier and the dev/test CSV paths on the member."""
        self.task_name, self.dev_path, self.test_path = task_name, dev_path, test_path

In [None]:
# Choose task: keep exactly one assignment active (the alternatives stay commented out)
#TASK = Task.HASPEEDE_FB
#TASK = Task.HASPEEDE_TW
#TASK = Task.CROSS_HASPEEDE_FB
TASK = Task.CROSS_HASPEEDE_TW

# Copy the selected member's fields into module-level names used by the cells below
task_name = TASK.task_name
dev_path = TASK.dev_path
test_path = TASK.test_path

## Data

In [None]:
# Remove the column-width limit so DataFrame cells are displayed without truncation
pd.set_option("display.max_colwidth", None)

In [None]:
# Load the dev/test datasets of the selected task (dev_path / test_path come from TASK).
# The 'tokens' and 'lemmas' columns are stored as text and parsed back with pd.eval.
# NOTE(review): pd.eval evaluates the cell text as an expression, so only use it on
# trusted CSVs; ast.literal_eval may be a safer converter — confirm it parses these
# columns identically before switching.
list_converters = {'tokens': pd.eval, 'lemmas': pd.eval}

# Context managers close the file handles as soon as each CSV has been read
with open(dev_path, encoding='utf-8') as dev_inf:
    dev_data = pd.read_csv(dev_inf, sep=',', converters=list_converters)

with open(test_path, encoding='utf-8') as test_inf:
    test_data = pd.read_csv(test_inf, sep=',', converters=list_converters)

# Separate extra features: everything that is not an identifier, raw/derived text or the label.
# errors='ignore' tolerates datasets that lack some of these columns.
non_feature_columns = ['id', 'text', 'label', 'hashtags', 'tokens', 'lemmas', 'PoS', 'text_en']
dev_data_extra = dev_data.drop(non_feature_columns, axis=1, errors='ignore')
test_data_extra = test_data.drop(non_feature_columns, axis=1, errors='ignore')

## W2V Embedding
Load pre-trained W2V model of Italian Twitter embeddings from the Italian NLP Lab [[1]](http://www.italianlp.it/resources/italian-word-embeddings/).

In [None]:
# Token string handed to the embedding helpers below (presumably the out-of-vocabulary placeholder)
OOV_TOKEN = '<OOV>'

In [None]:
# W2V embedding: load the pre-trained vectors stored in word2vec binary format
w2v = KeyedVectors.load_word2vec_format(w2v_pretrained_path, binary=True)

# Build the key<->index lookups and the embedding matrix with the project helpers.
# NOTE(review): how OOV_TOKEN is placed in the mappings/matrix is defined in word_embedding — confirm there.
key_to_idx, idx_to_key = get_key_index_mappings(w2v, OOV_TOKEN)
embedding_matrix, vocab_size = get_embedding_matrix(w2v, idx_to_key, OOV_TOKEN)

In [None]:
# Sizes derived from the embedding matrix built above
VOCAB_SIZE = vocab_size
EMB_DIMS = embedding_matrix.shape[1]  # used as the per-token feature width of the 'text' model input

## PoS Embedding

In [None]:
# Re-assignment of the same literal used in the W2V section (redundant when cells run top to bottom)
OOV_TOKEN = '<OOV>'

In [None]:
# The PoS vocabulary is built from the dev set only.
# NOTE(review): test-set tags missing from the dev set presumably fall back to OOV_TOKEN — confirm in word_embedding.
key_to_idx_pos, idx_to_key_pos = get_key_index_pos_mappings(dev_data["PoS"], OOV_TOKEN)
# NOTE(review): idx_to_onehot_pos is not referenced again in the cells shown here
idx_to_onehot_pos = get_pos_matrix(idx_to_key_pos)

## Settings

In [None]:
# The longest dev sentence length is rounded up to the nearest even number and then reduced by 1,
# so MAX_LEN is always odd (one below the true maximum whenever that maximum is even).
MAX_LEN = (math.ceil(max_sentence_length(dev_data['tokens']) / 2.) * 2) - 1 # max sentence length
AVG_LEN = average_sentence_length(dev_data['tokens']) # average sentence length
VAL_SPLIT = 0.2 # val set percentage

## Data to embedding

In [None]:
# Dev set: token sequences -> embedded text input (truncation at MAX_LEN, padding enabled),
# plus the matching PoS input built with the dev-set PoS vocabulary
X_dev = data_to_embedding(dev_data['tokens'], embedding_matrix, key_to_idx, truncation=MAX_LEN, padding=True)
X_dev_pos = pos_to_embedding(dev_data['PoS'], key_to_idx_pos, max_text_len=MAX_LEN)

# Test set: same transformation, reusing the dev-derived MAX_LEN and PoS mapping
X_test = data_to_embedding(test_data['tokens'], embedding_matrix, key_to_idx, truncation=MAX_LEN, padding=True)
X_test_pos = pos_to_embedding(test_data['PoS'], key_to_idx_pos, max_text_len=MAX_LEN)

## Split Train-Val

In [None]:
# Stratified train/validation split applied consistently to every dev-set view
# (embedded text, PoS input, extra features, labels); fixed seed for reproducibility
(
    x_train, x_val,
    x_train_pos, x_val_pos,
    x_train_extra, x_val_extra,
    y_train, y_val,
) = train_test_split(
    X_dev,
    X_dev_pos,
    dev_data_extra.values,
    dev_data['label'],
    test_size=VAL_SPLIT,
    random_state=128,
    stratify=dev_data['label'],
)

# DCNN

In [None]:
TARGET = 'val_avg_f1' # optimization target

# Train data (keys match the names of the model's Input layers)
input_train = {'text': x_train, 'PoS': x_train_pos, 'extra': x_train_extra}

# Val data
input_val = {'text': x_val, 'PoS': x_val_pos, 'extra': x_val_extra}

# Dataset-specific dimensions (get_dcnn_model reads only index 1 of each shape)
POS_SHAPE = x_train_pos.shape
EXTRA_SHAPE = dev_data_extra.shape

# To-tune hyperparameters
K_TOP = 10 # fixed pooling parameter for topmost conv layer
hparams = {}  # placeholder; replaced by the tuning result / the stored JSON further down

## Model

In [None]:
def _dcnn_branch(inputs, n_filters, kernels):
    """Build one multi-kernel convolutional branch and return its flattened output.

    For each kernel size, in order, a Conv1D -> Folding -> SemiDynamicKMaxPooling
    -> LeakyReLU stack is applied to `inputs`. The per-kernel outputs are then
    concatenated and flattened.
    """
    n_layers = len(kernels)
    branch_outputs = []
    for layer_idx, kernel_size in enumerate(kernels, start=1):
        l_conv = Conv1D(filters=n_filters, kernel_size=kernel_size)(inputs)
        l_fold = Folding()(l_conv)  # Apply folding after convolution
        l_pool = SemiDynamicKMaxPooling(k_top=K_TOP, L=n_layers, l=layer_idx, avg_s=AVG_LEN)(l_fold)
        branch_outputs.append(LeakyReLU(alpha=0.1)(l_pool))

    return Flatten()(concatenate(branch_outputs))


def get_dcnn_model(hparams, kernels=(2, 4, 6)):
    """Build and compile the three-input DCNN classifier.

    The model takes the inputs 'text', 'PoS' and 'extra'. Text and (embedded) PoS
    each go through a multi-kernel convolutional branch; both branch outputs are
    concatenated with the extra features and fed to a dropout/dense head ending in
    a single sigmoid unit.

    Args:
        hparams: mapping providing 'n_filters', 'dropout', 'h_dim', 'reg' and
            'learning_rate'. Other keys (e.g. 'batch_size') are ignored here.
        kernels: Conv1D kernel sizes, one parallel convolution per entry. The
            default is an immutable tuple so it cannot be shared and mutated
            across calls; any sequence (including a list) is accepted.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, binary cross-entropy loss,
        ``avg_f1`` metric).

    Reads the module-level MAX_LEN, EMB_DIMS, POS_SHAPE, EXTRA_SHAPE, K_TOP and AVG_LEN.
    """
    # Input
    in_text = Input(name='text', shape=(MAX_LEN, EMB_DIMS,))
    in_pos = Input(name='PoS', shape=(POS_SHAPE[1],))
    in_extra = Input(name='extra', shape=(EXTRA_SHAPE[1],))

    # Embedding PoS layer
    # NOTE(review): vocabulary size (24) and embedding width (23) are hard-coded;
    # presumably they mirror the dev-set PoS tag inventory — confirm against key_to_idx_pos.
    pos_emb = Embedding(24, 23, input_length=MAX_LEN)(in_pos)

    # Conv1D + Folding + DynamicKMaxPooling + Activation, once for text and once for PoS
    l_flat_text = _dcnn_branch(in_text, hparams['n_filters'], kernels)
    l_flat_pos = _dcnn_branch(pos_emb, hparams['n_filters'], kernels)

    # Concat text - PoS - extra features
    input_merge = concatenate([l_flat_text, l_flat_pos, in_extra])

    # Dense head with dropout before and after the hidden layer
    l_dense = Dropout(hparams['dropout'])(input_merge)
    l_dense = Dense(units=hparams['h_dim'])(l_dense)
    l_dense = LeakyReLU(alpha=0.1)(l_dense)
    l_dense = Dropout(hparams['dropout'])(l_dense)

    # Output unit for binary classification, regularized with combined L1 + L2
    # penalties (both use the same 'reg' coefficient)
    output_layer = Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=l1_l2(l1=hparams['reg'], l2=hparams['reg']),
    )(l_dense)

    model = Model(inputs=[in_text, in_pos, in_extra], outputs=output_layer)
    optimizer = Adam(learning_rate=hparams['learning_rate'])
    # Compile the model
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=[avg_f1])

    return model

## Hyper-parameters Tuning
A common approach is to start with a coarse search across a wide range of values to find promising sub-ranges of the parameter space. Then, we can zoom into these ranges and perform another random search (or a grid search) to fine-tune the configurations.

In [None]:
# Coarse search space: hyperparameter name -> (values, kind).
# NOTE(review): the meaning of each kind ('log', 'float', 'item') is defined in hyperparameter_tuning;
# presumably 'log'/'float' take [low, high] bounds and 'item' picks from the listed values — confirm.
hparams_spaces = {
    'learning_rate': ([1e-5, 1e-1], 'log'),
    'n_filters': ([25, 50, 100], 'item'),
    'h_dim': ([16, 32, 64, 128], 'item'),
    'dropout': ([0.0, 0.5], 'float'),
    'reg': ([1e-5, 1e-1], 'log'),
    'batch_size': ([16, 32, 64, 128], 'item')
}

### Bayesian Optimization

In [None]:
# Bayesian Optimization: coarse search over hparams_spaces, scored on TARGET.
# The two results are used below as the best score so far and the starting configuration for fine-tuning.
best_target, best_config = bayesian_optimization(
    get_dcnn_model, input_train, y_train, input_val, y_val, 
    bayesian_optimization_spaces=hparams_spaces, TARGET=TARGET, N_TRIALS=50, EPOCHS=25, PATIENCE=5
)

In [None]:
# Adopt the configuration found by the Bayesian optimization
hparams = best_config

# set new intervals for fine-tune random search: the values below become the
# centers (continuous parameters) or fixed choices (discrete ones) of the fine-tune space
lr = hparams['learning_rate']
n_filters = hparams['n_filters']
h_dim = hparams['h_dim']
dropout = hparams['dropout']
reg = hparams['reg']
batch_size = hparams['batch_size']

### Random search (fine-tune)

In [None]:
epsilon = 0.2  # half-width of each fine-tune window, in log10 units (decades)


def _log_window(center, half_width):
    """Return ``[center / 10**half_width, center * 10**half_width]``.

    Mathematically equal to ``10 ** (log10(center) -/+ half_width)``, but it never
    takes the logarithm of ``center``. A center of 0 (allowed for dropout by the
    coarse space) therefore yields ``[0.0, 0.0]`` without a divide-by-zero warning.
    """
    scale = 10 ** half_width
    return [center / scale, center * scale]


# Continuous hyperparameters are searched in a narrow window around the best coarse
# value; discrete ones are frozen to that value (single-item lists).
random_search_spaces_finetune = {
    'learning_rate': (_log_window(lr, epsilon), 'float'),
    'n_filters': ([n_filters], 'item'),
    'h_dim': ([h_dim], 'item'),
    'dropout': (_log_window(dropout, epsilon), 'float'),
    'reg': (_log_window(reg, epsilon), 'float'),
    'batch_size': ([batch_size], 'item'),
}

In [None]:
# Random search (fine-tune) inside the narrowed space.
# NOTE: this rebinds best_config; hparams still refers to the Bayesian-optimization result assigned earlier.
# Each entry of results is indexed below as entry[0] (configuration) and entry[1][TARGET] (score).
best_config, best_model, results = random_search(
    get_dcnn_model, input_train, y_train, input_val, y_val,
    random_search_spaces=random_search_spaces_finetune, TARGET=TARGET, NUM_SEARCH=30, EPOCHS=25, PATIENCE=5
)

### Save best configuration

In [None]:
# Keep the fine-tuned configuration only when it beats the score reached so far
if TARGET == 'val_avg_f1':
    top_entry = max(results, key=lambda entry: entry[1][TARGET])
    new_best_target = top_entry[1][TARGET]
    if new_best_target > best_target:
        best_target = new_best_target
        hparams = top_entry[0]

# Snapshot to persist: the chosen hyperparameters plus the score they achieved
tuning_result = hparams.copy()
tuning_result[TARGET] = best_target
# Layer sizes and batch size are cast to built-in ints
for int_key in ('n_filters', 'h_dim', 'batch_size'):
    tuning_result[int_key] = int(tuning_result[int_key])

In [None]:
# Store it
output_path = results_dir + task_name + '/best_hparams.json'
with open(output_path, 'w') as outf:
    json.dump(tuning_result, outf, indent=4)

## Training

In [None]:
# Reload the stored configuration and drop the score entry saved next to the hyperparameters
with open(results_dir + task_name + '/best_hparams.json', 'r') as inf:
    hparams = json.load(inf)
del hparams[TARGET]  # raises KeyError if the stored file has no TARGET entry


# Train a single model with the tuned configuration on the train/val split
print(f'Config: {hparams}')
model = get_dcnn_model(hparams)
solver = Solver(model, input_train, y_train, input_val, y_val, TARGET)
solver.train(epochs=50, patience=5, batch_size=hparams['batch_size'])

In [None]:
# Save the training-history plot next to the other results of this task
out_path = results_dir + task_name + '/history.png'
solver.plot_history(out_path=out_path)

## Testing

In [None]:
# Test inputs use the same keys as the model's Input layers
y_test = test_data['label']
input_test = {'text': X_test, 'PoS': X_test_pos, 'extra': test_data_extra.values}

# evaluate() yields the loss followed by the single compiled metric
loss, metric = model.evaluate(input_test, y_test)
string = f'Test loss: {loss} - Test {TARGET}: {metric}'
print(string)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
test_scores = model.predict(input_test)
y_pred = np.where(test_scores > 0.5, 1, 0)
report = classification_report(y_test, y_pred, digits=4)
print(report)

In [None]:
# Persist the single-model test results; loss, metric and report come from the evaluation cell above
with open(results_dir + task_name + '/test_eval.txt', 'w') as outf:
    string = f"Test Loss - Average F1 Score: {loss:.5f} - {metric:.5f}\n {report}"
    outf.write(string)

## KFold + Ensemble

In [None]:
# Reload the stored configuration and drop the score entry saved next to the hyperparameters
with open(results_dir + task_name + '/best_hparams.json', 'r') as inf:
    hparams = json.load(inf)
del hparams[TARGET]

# Whole dev set (inputs and labels in one dict) handed to the k-fold routine
input_dev = {'text': X_dev, 'PoS': X_dev_pos, 'extra': dev_data_extra.values, 'label': dev_data['label']}

print(f'Config: {hparams}')
# No pre-built model is passed (None); get_dcnn_model is given to train_with_kfold instead
solver = Solver(None, input_train, y_train, input_val, y_val, TARGET)
kfold_models = solver.train_with_kfold(
    get_dcnn_model, hparams, input_dev, n_splits=5, 
    batch_size=hparams['batch_size'], epochs=30, patience=5
)

In [None]:
input_test = {'text': X_test, 'PoS': X_test_pos, 'extra': test_data_extra.values}
y_test = test_data['label']

predictions = solver.ensemble_predict(input_test)
report = classification_report(y_test, predictions, digits=4)
avg_f1 = f1_score(y_test, predictions)

print(f'Average F1 Score for Ensemble: {avg_f1:.5f}')
print(f'\n{report}')

In [None]:
with open(results_dir + task_name + '/test_kfold_eval.txt', 'w') as outf:
    string = f"Average F1 Score for Ensemble: {avg_f1:.5f}\n {report}"
    outf.write(string)

## Meta-learner

In [None]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Candidate meta-learners, keyed by the label printed in the report header
meta_learners = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGB': xgb.XGBClassifier(n_estimators=100, random_state=42),
    'SVC': SVC(probability=True, kernel='linear', C=1),
}

# Evaluate each meta-learner on the test set and print its classification report
for learner_name, learner in meta_learners.items():
    predictions = solver.meta_learner_predict(input_test, meta_learner=learner)
    report = classification_report(y_test, predictions, digits=4)
    print(f'---{learner_name}---\n{report}')

# Baseline SVC

In [None]:
from sklearn.svm import SVC
# Collapse every dev sample to a single feature vector (one row per sample)
X_dev_reshaped = X_dev.reshape(X_dev.shape[0], -1)  # This flattens the data


# Linear SVM baseline fitted on the whole dev set, using the embedded text only
svm = SVC(kernel='linear')
svm.fit(X_dev_reshaped, dev_data['label'])

In [None]:
#X_combined_test = np.concatenate((X_test, X_test_pos, test_data_extra), axis=1)
# Make predictions on the test data (flattened the same way as the dev data used for fitting)
X_test_reshaped = X_test.reshape(X_test.shape[0], -1)  # This flattens the data
y_pred = svm.predict(X_test_reshaped)

# Per-class precision/recall/F1 of the baseline on the test labels
report = classification_report(test_data['label'], y_pred, digits=4)
print(f"Classification Report:\n{report}")