In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install Hugging Face Transformers into the runtime.
# NOTE(review): the version is not pinned, so results may vary across runs — consider pinning.
!pip install transformers

In [None]:
import ast
import json
import math
import os
import sys
from enum import Enum

from tqdm.auto import tqdm

import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from transformers import AutoTokenizer, AutoModel


# Project root on the mounted Drive; its code/ sub-folder is appended to sys.path
# so that the project-local modules imported below can be resolved.
root_path = '/content/drive/MyDrive/colab_haspeede'
sys.path.append(root_path + '/code/')
from training.metrics import avg_f1
from sentence_statistics import max_sentence_length, average_sentence_length

# Enable the autoreload extension (mode 2) so edited modules are re-imported automatically.
%load_ext autoreload
%autoreload 2

**PATH**

In [None]:

# Directories (all strings end with '/', so plain concatenation yields valid paths)
preprocessed_dir = 'preprocessed/'
results_dir = root_path + '/results/'
fb_dir, tw_dir = (root_path + data_subdir for data_subdir in ('/data/facebook/', '/data/twitter/'))


def _preprocessed_csv(dataset_dir, split_dir, filename):
    """Return the path of a preprocessed CSV inside `dataset_dir`/`split_dir`."""
    return dataset_dir + split_dir + preprocessed_dir + filename


# Filepaths (Facebook dataset)
fb_dev_path = _preprocessed_csv(fb_dir, 'dev/', 'fb_dev_preprocessed.csv')
fb_test_path = _preprocessed_csv(fb_dir, 'test/', 'fb_test_preprocessed.csv')

# Filepaths (Twitter dataset)
tw_dev_path = _preprocessed_csv(tw_dir, 'dev/', 'tw_dev_preprocessed.csv')
tw_test_path = _preprocessed_csv(tw_dir, 'test/', 'tw_test_preprocessed.csv')

**Task selection**

In [None]:
class Task(Enum):
    """Available experiments; each value is a (task name, dev CSV path, test CSV path) tuple.

    The CROSS_* members pair the dev set of one platform with the test set of the other.
    """

    HASPEEDE_FB = ('haspeede-fb', fb_dev_path, fb_test_path)
    HASPEEDE_TW = ('haspeede-tw', tw_dev_path, tw_test_path)
    CROSS_HASPEEDE_FB = ('cross-haspeede-fb', fb_dev_path, tw_test_path)
    CROSS_HASPEEDE_TW = ('cross-haspeede-tw', tw_dev_path, fb_test_path)

    @property
    def task_name(self):
        """Short identifier of the task (first element of the member value)."""
        return self.value[0]

    @property
    def dev_path(self):
        """Path of the CSV used for training/validation (second element)."""
        return self.value[1]

    @property
    def test_path(self):
        """Path of the CSV used for testing (third element)."""
        return self.value[2]

In [None]:
# Choose task (uncomment exactly one line)
#TASK = Task.HASPEEDE_FB
#TASK = Task.HASPEEDE_TW
#TASK = Task.CROSS_HASPEEDE_FB
TASK = Task.CROSS_HASPEEDE_TW

# Unpack the selected task into the names used by the rest of the notebook.
task_name, dev_path, test_path = TASK.task_name, TASK.dev_path, TASK.test_path

**Data**

In [None]:
# Load the dev/test CSVs of the selected task (dev_path/test_path come from TASK).
# The files are opened in `with` blocks so the handles are closed once parsed.
with open(dev_path, encoding='utf-8') as dev_inf:
    dev_data = pd.read_csv(dev_inf, sep=',')
# Only the raw text and its gold label are used downstream.
dev_data = dev_data[['text', 'label']]

with open(test_path, encoding='utf-8') as test_inf:
    test_data = pd.read_csv(test_inf, sep=',')
test_data = test_data[['text', 'label']]

**Split Train-Val**

In [None]:
VAL_SPLIT = 0.2 # val set percentage

# Stratified hold-out split of the dev set; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    dev_data['text'],
    dev_data['label'],
    test_size=VAL_SPLIT,
    stratify=dev_data['label'],
    random_state=128,
)

**DBMDZ-italian-uncased**

In [None]:
#model_id = 'DBMDZ'
#model_name = 'dbmdz/bert-base-italian-uncased'
model_id = 'DBMDZxxl'
model_name = "dbmdz/bert-base-italian-xxl-uncased"

# Pretrained encoder with a sequence-classification head; this is the model that gets fine-tuned.
model = BertForSequenceClassification.from_pretrained(model_name)

# Configuration for the smaller ("ablated") architecture.
# It is loaded as a separate object: `model.config` must not be edited in place,
# otherwise the fine-tuned model would report sizes that do not match its weights.
config = BertConfig.from_pretrained(model_name)

config.hidden_size = 384
config.intermediate_size = 1536

# Randomly initialised model built from the reduced configuration (no pretrained weights).
ablated_model = BertForSequenceClassification(config)


In [None]:
# Display the configuration attached to `model`.
model.config

**BERT Tokenizer**

In [None]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Cast every text column to str before tokenization.
x_train, x_val = x_train.astype(str), x_val.astype(str)
test_data['text'] = test_data['text'].astype(str)

In [None]:
MAX_LEN = 64


def _encode(texts):
    """Tokenize a pandas Series of strings into padded/truncated PyTorch tensors."""
    return tokenizer(texts.tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')


x_train_e = _encode(x_train)
x_val_e = _encode(x_val)
x_test_e = _encode(test_data['text'])

**To PyTorch dataset**

In [None]:
class HateSpeechDataset(Dataset):
    """Map-style dataset over tokenized inputs with optional labels.

    Args:
        tokenized_inputs: mapping with 'input_ids' and 'attention_mask' entries,
            each indexable by sample position.
        labels: optional sequence of labels aligned with the inputs; when omitted,
            the returned items carry no 'labels' entry.
    """

    def __init__(self, tokenized_inputs, labels=None):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the 'input_ids' entry."""
        return len(self.tokenized_inputs['input_ids'])

    def __getitem__(self, idx):
        """Return the model inputs (and the label as a long tensor, if available) for `idx`."""
        sample = {
            field: self.tokenized_inputs[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Create PyTorch datasets: one per split, pairing the encodings with their labels as plain lists
train_dataset, val_dataset, test_dataset = (
    HateSpeechDataset(encodings, split_labels.tolist())
    for encodings, split_labels in (
        (x_train_e, y_train),
        (x_val_e, y_val),
        (x_test_e, test_data['label']),
    )
)

**Dataloaders**

In [None]:
BATCH_SIZE = 16 # FACEBOOK
#BATCH_SIZE=64 # TWITTER

# Only the training loader shuffles; validation and test keep the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_dataloader, test_dataloader = (
    DataLoader(eval_split, batch_size=BATCH_SIZE)
    for eval_split in (val_dataset, test_dataset)
)

**Training**

In [None]:
EPOCHS = 3

# Hyper-parameters
lr = 2e-5
weight_decay = 0.01

# Optimizer
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# To device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Best checkpoint so far, selected by validation macro-F1.
best_val_f1 = 0.0
best_model_state_dict = None

# Per-epoch history, consumed by the plotting cell.
train_losses = []
val_losses = []
train_f1_scores = []
val_f1_scores = []

for epoch in range(EPOCHS):
    # ---- Training ----
    model.train()
    train_loss = 0.0
    train_preds = []
    train_labels = []

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1} Training'):
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        labels = batch['labels'].to(device)
        optimizer.zero_grad()

        # Passing the labels makes the model return the loss alongside the logits.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        train_preds.extend(predictions)
        train_labels.extend(labels.cpu().numpy())

    # Mean of the per-batch losses.
    avg_train_loss = train_loss / len(train_dataloader)
    train_f1 = f1_score(train_labels, train_preds, average='macro')

    train_losses.append(avg_train_loss)
    train_f1_scores.append(train_f1)

    # ---- Validation ----
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Epoch {epoch + 1} Validation'):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)

            outputs = model(**inputs, labels=labels)
            val_loss += outputs.loss.item()

            predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            val_preds.extend(predictions)
            val_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_dataloader)
    val_f1 = f1_score(val_labels, val_preds, average='macro')

    val_losses.append(avg_val_loss)
    val_f1_scores.append(val_f1)

    # Keep the weights of the epoch with the best validation F1-score.
    # state_dict() returns references to the live parameters, which later epochs
    # keep updating, so the tensors are cloned (to CPU) to freeze this epoch's weights.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_model_state_dict = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }

    # Print epoch summary
    print(f'Epoch {epoch + 1}, '
          f'Training Loss: {avg_train_loss:.4f}, Training Macro F1: {train_f1:.4f}, '
          f'Validation Loss: {avg_val_loss:.4f}, Validation Macro F1: {val_f1:.4f}')

# Restore the best epoch's weights (None means no epoch scored above 0.0).
if best_model_state_dict is not None:
    model.load_state_dict(best_model_state_dict)

In [None]:
# Save best model
trained_weights_path = f'{results_dir}{model_id}/{task_name}/model_{lr}_{weight_decay}.pth'
# torch.save does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(trained_weights_path), exist_ok=True)
torch.save(model.state_dict(), trained_weights_path)

In [None]:
def _plot_history(ax, epochs, train_values, val_values, title, ylabel):
    """Draw the training and validation curves of one metric on `ax`."""
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # epochs are integers
    ax.plot(epochs, train_values, label='Training', marker='o')
    ax.plot(epochs, val_values, label='Validation', marker='o')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)


epochs = list(range(1, len(train_losses) + 1))

# 1 row, 2 columns: losses on the left, F1-scores on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
_plot_history(ax1, epochs, train_losses, val_losses, 'BCE losses', 'Loss')
_plot_history(ax2, epochs, train_f1_scores, val_f1_scores, 'Average F1-scores', 'average F1')

plt.tight_layout()
history_path = f'{results_dir}{model_id}/{task_name}/history_{lr}_{weight_decay}.png'
# savefig does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(history_path), exist_ok=True)
plt.savefig(history_path, dpi=300, bbox_inches='tight')
plt.show()

**Testing**

In [None]:
# Load model: rebuild the architecture from the checkpoint name, then restore the saved weights.
loaded_model = BertForSequenceClassification.from_pretrained(model_name)
# map_location lets weights saved from a GPU run be restored on whatever device is available now.
loaded_model.load_state_dict(torch.load(trained_weights_path, map_location=device))

In [None]:
# Run the reloaded model on the test set; the model that predicts is the one
# switched to evaluation mode.
loaded_model.to(device)
loaded_model.eval()
test_preds = []

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc='Testing', disable=True):
        # Labels are not passed: only the logits are needed here.
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        outputs = loaded_model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        test_preds.extend(predictions)

In [None]:
# Per-class metrics on the test set, printed with 4 decimals.
y_test = test_data['label']
report = classification_report(y_true=y_test, y_pred=test_preds, digits=4)
print(report)

In [None]:
# Persist the best validation macro-F1 together with the test report.
test_eval_path = f'{results_dir}{model_id}/{task_name}/test_eval_{lr}_{weight_decay}.txt'

best_val_f1_result = max(val_f1_scores)
print(best_val_f1_result)

final_result = f'{best_val_f1_result}\n {report}'
with open(test_eval_path, 'w') as outf:
    outf.write(final_result)

**KFold + Ensemble**

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold

EPOCHS = 3

# Hyper-parameters
lr = 2e-5
weight_decay = 0.01
n_splits = 5

kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=128)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

models = []  # path of the saved weights of each fold's best model
all_train_losses = []
all_val_losses = []
all_train_f1_scores = []
all_val_f1_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(dev_data['text'], dev_data['label'])):
    print(f'Training on fold {fold + 1}/{n_splits}')

    # Split data into training and validation.
    # split() yields positional indices, hence .iloc rather than label-based [] indexing.
    input_train_fold = dev_data['text'].iloc[train_idx].astype(str)
    y_train_fold = dev_data['label'].iloc[train_idx]

    input_val_fold = dev_data['text'].iloc[val_idx].astype(str)
    y_val_fold = dev_data['label'].iloc[val_idx]

    x_train_fold_e = tokenizer(input_train_fold.tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')
    x_val_fold_e = tokenizer(input_val_fold.tolist(), truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')

    # Create PyTorch datasets
    train_fold_dataset = HateSpeechDataset(x_train_fold_e, y_train_fold.tolist())
    val_fold_dataset = HateSpeechDataset(x_val_fold_e, y_val_fold.tolist())

    train_fold_dataloader = DataLoader(train_fold_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_fold_dataloader = DataLoader(val_fold_dataset, batch_size=BATCH_SIZE)

    # Every fold starts from the pretrained checkpoint with its own optimizer.
    model = BertForSequenceClassification.from_pretrained(model_name)
    model.to(device)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Best checkpoint of this fold, selected by validation macro-F1.
    best_val_f1 = 0.0
    best_model_state_dict = None

    train_losses = []
    val_losses = []
    train_f1_scores = []
    val_f1_scores = []

    for epoch in range(EPOCHS):
        # ---- Training ----
        model.train()
        train_loss = 0.0
        train_preds = []
        train_labels = []

        for batch in tqdm(train_fold_dataloader, desc=f'Epoch {epoch + 1} Training'):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = batch['labels'].to(device)
            optimizer.zero_grad()

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

            predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            train_preds.extend(predictions)
            train_labels.extend(labels.cpu().numpy())

        avg_train_loss = train_loss / len(train_fold_dataloader)
        train_f1 = f1_score(train_labels, train_preds, average='macro')

        train_losses.append(avg_train_loss)
        train_f1_scores.append(train_f1)

        # ---- Validation ----
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_labels = []

        with torch.no_grad():
            for batch in tqdm(val_fold_dataloader, desc=f'Epoch {epoch + 1} Validation'):
                inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
                labels = batch['labels'].to(device)

                outputs = model(**inputs, labels=labels)
                val_loss += outputs.loss.item()

                predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
                val_preds.extend(predictions)
                val_labels.extend(labels.cpu().numpy())

        avg_val_loss = val_loss / len(val_fold_dataloader)
        val_f1 = f1_score(val_labels, val_preds, average='macro')

        val_losses.append(avg_val_loss)
        val_f1_scores.append(val_f1)

        # Keep the weights of the epoch with the best validation F1-score.
        # state_dict() returns references to the live parameters, which later epochs
        # keep updating, so the tensors are cloned (to CPU) to freeze this epoch's weights.
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state_dict = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }

        # Print epoch summary
        print(f'Epoch {epoch + 1}, '
              f'Training Loss: {avg_train_loss:.4f}, Training Macro F1: {train_f1:.4f}, '
              f'Validation Loss: {avg_val_loss:.4f}, Validation Macro F1: {val_f1:.4f}')

    # Restore the best epoch's weights (None means no epoch scored above 0.0).
    if best_model_state_dict is not None:
        model.load_state_dict(best_model_state_dict)

    # Save the model for this fold (in the current working directory)
    model_path = f'model_fold_{fold + 1}.pth'
    torch.save(model.state_dict(), model_path)
    models.append(model_path)

    # Save training and validation results for this fold
    all_train_losses.append(train_losses)
    all_val_losses.append(val_losses)
    all_train_f1_scores.append(train_f1_scores)
    all_val_f1_scores.append(val_f1_scores)

In [None]:
def _load_fold_model(weights_path):
    """Rebuild the classifier and restore the weights stored at `weights_path`, in eval mode."""
    # Use the same checkpoint name the folds were trained from, so the architecture
    # and vocabulary size always match the saved weights.
    fold_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # map_location lets weights saved from a GPU run be restored on the current device.
    fold_model.load_state_dict(torch.load(weights_path, map_location=device))
    fold_model.to(device)
    fold_model.eval()
    return fold_model


# One model per fold, in the order the folds were trained.
ensemble_models = [_load_fold_model(model_path) for model_path in models]

In [None]:
# Test data
from collections import Counter

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


def get_model_predictions(model, dataloader=None):
    """Return the predicted class index of every sample served by a dataloader.

    Args:
        model: classifier whose output exposes `.logits`.
        dataloader: batches to predict on; defaults to the global `test_dataloader`.

    Returns:
        List with one predicted label (argmax over the logits) per sample, in loader order.
    """
    if dataloader is None:
        dataloader = test_dataloader
    model.to(device)
    model.eval()  # make sure dropout is disabled, whatever state the caller left the model in
    test_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Testing', disable=True):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            outputs = model(**inputs)
            predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            test_preds.extend(predictions)
    return test_preds


def _majority_vote(votes, tie_label=0):
    """Return the most frequent label in `votes`, or `tie_label` when the top two counts are equal."""
    ranked = Counter(votes).most_common(2)
    if len(ranked) == 1 or ranked[0][1] > ranked[1][1]:
        return ranked[0][0]
    return tie_label


# Hard predictions (class labels, not probabilities) of every fold model.
# Iterating over `ensemble_models` keeps this in sync with the number of folds
# actually trained, instead of indexing a fixed number of models.
all_model_preds = [get_model_predictions(fold_model) for fold_model in ensemble_models]

# Assign the majority-voted label or default to 0 in case of ties
final_kfold_predictions = [
    _majority_vote(sample_votes, tie_label=0)
    for sample_votes in zip(*all_model_preds)
]


In [None]:
# Per-class metrics of the majority-vote ensemble on the test set, with 4 decimals.
y_test = test_data['label']
report = classification_report(y_true=y_test, y_pred=final_kfold_predictions, digits=4)
print(report)

In [None]:
# Store the ensemble report in the current working directory.
kfold_report_path = f'report_kfold_{task_name}.txt'
with open(kfold_report_path, 'w') as outf:
    outf.write(report)