In [None]:
%%capture
!pip install datasets
!pip install optuna

In [34]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from scipy.special import expit
import torch
from torch.utils.data import Dataset as TorchDataset
from transformers import DataCollatorWithPadding
import optuna
import concurrent.futures
import time
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.optimization")

class CustomDataset(TorchDataset):
    """Map-style dataset pairing tokenizer encodings with per-sample labels.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to an
            indexable collection holding one entry per sample.
        labels: Indexable collection holding one label entry per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return *value* as a new tensor that does not alias the stored data."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on
            # every access; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the features of sample *idx* plus its label under ``"label"``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item["label"] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples (one per label entry)."""
        return len(self.labels)

class MultiLabelABSA:
    """Optuna hyperparameter search for a multi-label sequence classifier.

    ``hyperparameterSearch`` is the entry point. It populates the instance
    state (``data``, ``model_name``, ``tokenizer``, ``data_collator``,
    ``results_df`` and ``result_filename``) that the other methods read.
    """

    # Column layout of the per-trial results table; `objective` builds its
    # result row in exactly this order.
    RESULT_COLUMNS = (
        "trial",
        "learning_rate",
        "num_train_epochs",
        "per_device_train_batch_size",
        "runtime",
        "loss",
        "hamming_loss",
        "accuracy",
        "f1_micro",
        "f1_macro",
        "f1_weighted",
        "class_f1_scores",
    )

    # Keys of `Trainer.evaluate()` output that are collected for every fold.
    _FOLD_METRIC_KEYS = (
        "eval_loss",
        "eval_hamming_loss",
        "eval_accuracy",
        "eval_f1_micro",
        "eval_f1_macro",
        "eval_f1_weighted",
        "eval_class_f1_scores",
    )

    def preprocess_data(self, data, tokenizer):
        """Tokenize the ``text`` column and wrap it with the label columns.

        Args:
            data: DataFrame whose ``"text"`` column holds the inputs and whose
                columns from position 1 onward hold the label indicators.
            tokenizer: Callable tokenizer; invoked with padding, truncation
                to 256 tokens and PyTorch tensor output.

        Returns:
            A ``CustomDataset`` over the encodings and a float32 label tensor.
        """
        texts = data["text"].tolist()
        labels = torch.tensor(
            data.iloc[:, 1:].astype(float).values.tolist(), dtype=torch.float32
        )
        encodings = tokenizer(
            texts, padding=True, truncation=True, max_length=256, return_tensors="pt"
        )
        return CustomDataset(encodings, labels)

    def create_model(self):
        """Load ``self.model_name`` with one output per label column of ``self.data``."""
        return AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            # Every column except the leading text column is a label.
            num_labels=self.data.shape[1] - 1,
            problem_type="multi_label_classification",
        )

    def compute_metrics(self, eval_pred):
        """Compute multi-label metrics from an ``(outputs, labels)`` pair.

        The raw outputs are passed through a sigmoid and thresholded at 0.5;
        labels count as positive where they equal 1.

        Returns:
            Dict with ``hamming_loss``, ``accuracy``, ``f1_macro``,
            ``f1_micro``, ``f1_weighted`` and ``class_f1_scores`` (a list with
            one F1 value per label).
        """
        raw_outputs, lab = eval_pred

        predictions = expit(raw_outputs) > 0.5
        labels = np.asarray(lab) == 1

        return {
            "hamming_loss": hamming_loss(labels, predictions),
            "accuracy": accuracy_score(labels, predictions),
            "f1_macro": f1_score(labels, predictions, average="macro"),
            "f1_micro": f1_score(labels, predictions, average="micro"),
            "f1_weighted": f1_score(labels, predictions, average="weighted"),
            "class_f1_scores": f1_score(labels, predictions, average=None).tolist(),
        }

    def multilabel_stratified_sampling(self, data, n_splits=4, random_state=42):
        """Yield ``(train_data, test_data)`` DataFrame pairs for each fold.

        Stratification uses a single surrogate class per row: the position of
        the first maximal label column (``argmax``). A row with no active
        label therefore falls into class 0.
        """
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        # Convert multilabel targets to multiclass targets
        targets = data.iloc[:, 1:].values.argmax(axis=1)

        for train_index, test_index in skf.split(data["text"], targets):
            yield data.iloc[train_index], data.iloc[test_index]

    def _record_trial(self, row):
        """Append one result row to ``self.results_df`` and rewrite the TSV file."""
        self.results_df.loc[len(self.results_df)] = row
        self.results_df.to_csv(self.result_filename, sep="\t", index=False)

    def objective(self, trial):
        """Optuna objective: 4-fold cross-validated mean Hamming loss.

        Samples the learning rate, epoch count and per-device batch size,
        trains a fresh model on every fold, records the fold-averaged metrics
        through ``_record_trial`` and returns the mean Hamming loss.

        Requires the state set up by ``hyperparameterSearch``.
        """
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 9e-5, log=True)
        num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
        # Per-device value. Original author's note: times two since the
        # system uses 2 GPUs.
        per_device_train_batch_size = trial.suggest_categorical(
            "per_device_train_batch_size", [4, 8, 16]
        )

        per_fold = {key: [] for key in self._FOLD_METRIC_KEYS}

        # Start measuring the runtime
        start_time = time.time()

        for train_data, test_data in self.multilabel_stratified_sampling(
            self.data, n_splits=4, random_state=2
        ):
            train_dataset = self.preprocess_data(train_data, self.tokenizer)
            test_dataset = self.preprocess_data(test_data, self.tokenizer)

            model = self.create_model()

            training_args = TrainingArguments(
                output_dir="output",
                learning_rate=learning_rate,
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=16,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                logging_dir="logs",
                logging_steps=100,
                logging_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1_micro",
                fp16=True,
                report_to="none",
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=self.data_collator,
                tokenizer=self.tokenizer,
                compute_metrics=self.compute_metrics,
            )

            print(
                f"Using the following hyperparameters: lr={learning_rate}"
                f" - epochs={num_train_epochs} - batch={per_device_train_batch_size}"
            )

            trainer.train()
            eval_metrics = trainer.evaluate()

            for key in self._FOLD_METRIC_KEYS:
                per_fold[key].append(eval_metrics[key])

            # Drop this fold's model before the next one is built so its
            # memory can be reclaimed instead of piling up across folds.
            del trainer, model
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Calculate runtime
        runtime = time.time() - start_time

        mean_hamming = np.mean(per_fold["eval_hamming_loss"])

        # Store the results; values follow the order of RESULT_COLUMNS.
        self._record_trial([
            trial.number,
            learning_rate,
            num_train_epochs,
            per_device_train_batch_size,
            runtime,
            np.mean(per_fold["eval_loss"]),
            mean_hamming,
            np.mean(per_fold["eval_accuracy"]),
            np.mean(per_fold["eval_f1_micro"]),
            np.mean(per_fold["eval_f1_macro"]),
            np.mean(per_fold["eval_f1_weighted"]),
            # Per-label F1 averaged across folds.
            [sum(col) / len(col) for col in zip(*per_fold["eval_class_f1_scores"])],
        ])

        return mean_hamming

    def hyperparameterSearch(self, result_filename, dataset_filename, model_name, num_trials):
        """Run an Optuna study that minimizes the cross-validated Hamming loss.

        Args:
            result_filename: Path of the TSV file rewritten after every trial
                with the accumulated per-trial results.
            dataset_filename: Tab-separated dataset; its first column is used
                as index and discarded, the next column is the text and the
                remaining columns are labels.
            model_name: Name or path handed to ``from_pretrained`` for both
                tokenizer and model.
            num_trials: Number of Optuna trials to run.
        """
        # Load data
        self.data = pd.read_csv(dataset_filename, delimiter="\t", index_col=0).reset_index(drop=True)
        self.data.columns = ["text"] + [f"aspect_{i}" for i in range(1, self.data.shape[1])]
        self.model_name = model_name

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Keep the results table and its destination on the instance so that
        # `objective` does not depend on notebook-level globals.
        self.result_filename = result_filename
        self.results_df = pd.DataFrame(columns=list(self.RESULT_COLUMNS))

        # Optuna optimization
        study = optuna.create_study(direction="minimize")
        study.optimize(self.objective, n_trials=num_trials)

        # Print best hyperparameters
        print("Best trial:")
        trial = study.best_trial
        print("Value:", trial.value)
        print("Params:", trial.params)

In [None]:
# Search configuration. Checkpoints noted by the original author:
#   deepset/gbert-large
#   dbmdz/bert-base-german-uncased
#   distilbert-base-german-cased
model_name = "deepset/gbert-large"
num_trials = 50

# Input dataset (TSV) and the TSV file that receives the per-trial results.
file_path = "./data/complete_re_df_cat_att_pol.tsv"
filename_result = "optuna_50_gbert_cat_att_pol.tsv"

# Run the Optuna hyperparameter search.
absa = MultiLabelABSA()
absa.hyperparameterSearch(
    result_filename=filename_result,
    dataset_filename=file_path,
    model_name=model_name,
    num_trials=num_trials,
)