In [4]:
!pip install datasets
!pip install optuna

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[K     |████████████████████████████████| 390 kB 6.3 MB/s eta 0:00:01
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[K     |████████████████████████████████| 224 kB 48.3 MB/s eta 0:00:01
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-2.0.19-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 46.2 MB/s eta 0:00:01
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 15.6 MB/s  eta 0:00:01
Collecting greenlet!=0.4.17; platform_machine == "aarch64" or (platform_machine == "ppc64le" or (platform_machine == "x86_64" or (platform_machine == "amd64" or (platform_machine == "AMD64" or (platform

In [11]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from scipy.special import expit
import torch
from torch.utils.data import Dataset as TorchDataset
from transformers import DataCollatorWithPadding
import optuna
import concurrent.futures
import time
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, hamming_loss, precision_score, recall_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.optimization")

class CustomDataset(TorchDataset):
    """Torch dataset that pairs tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from feature name to an indexable tensor; entry
            ``idx`` of every value belongs to example ``idx``.
        labels: Indexable tensor of labels; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of detached tensor copies.

        The dict holds one entry per encoding key (in the encodings' own
        order) followed by a ``"label"`` entry.
        """
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            # Copy so that callers mutating the sample never touch the stored tensors.
            sample[feature_name] = feature_values[idx].clone().detach()
        sample["label"] = self.labels[idx].clone().detach()
        return sample

class MultiLabelABSA:
    def __init__(self, data_path, result_path, model_name):
        self.data_path = data_path
        self.result_path = result_path
        self.model_id = model_name
    
    def preprocess_data(self, data, tokenizer):
        texts = data["text"].tolist()
        labels = data.iloc[:, 1:].astype(float).values.tolist()
        labels = torch.tensor(labels, dtype=torch.float32)
        encodings = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
        return CustomDataset(encodings, labels)

    def create_model(self):
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_id,
            num_labels=self.data.shape[1] - 1,
            problem_type="multi_label_classification"
        )
        return model

    def compute_metrics(self, eval_pred):
        predictions, lab = eval_pred

        predictions = (expit(predictions) > 0.5)
        labels = [l==1 for l in lab]

        accuracy = accuracy_score(labels, predictions)

        f1_macro = f1_score(labels, predictions, average="macro", zero_division=0)
        f1_micro = f1_score(labels, predictions, average="micro", zero_division=0)
        f1_weighted = f1_score(labels, predictions, average="weighted", zero_division=0)

        class_f1_scores = f1_score(labels, predictions, average=None, zero_division=0)

        hamming = hamming_loss(labels, predictions)

        return {
            "hamming_loss": hamming,
            "accuracy": accuracy,
            "f1_macro": f1_macro,
            "f1_micro": f1_micro,
            "f1_weighted": f1_weighted,
            "class_f1_scores": class_f1_scores.tolist(),
        }

    def multilabel_stratified_sampling(self, data, n_splits=4, random_state=42):
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

        # Convert multilabel targets to multiclass targets
        targets = data.iloc[:, 1:].values.argmax(axis=1)

        for train_index, test_index in skf.split(data["text"], targets):
            train_data, test_data = data.iloc[train_index], data.iloc[test_index]
            yield train_data, test_data

    def objective(self, trial):        
        learning_rate = trial.suggest_float("learning_rate", self.hyperparameters["learning_rate"][0], self.hyperparameters["learning_rate"][1], log=True)
        num_train_epochs = trial.suggest_int("num_train_epochs", self.hyperparameters["epochs"][0], self.hyperparameters["epochs"][1])
        per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", self.hyperparameters["batch_size"]) # Times two since the system uses 2 GPUs

        f1_micro_scores = []
        f1_macro_scores = []
        f1_weighted_scores = []
        accuracy_scores = []
        class_f1_scores = []
        loss = []
        hamming = []

        # Start measuring the runtime
        start_time = time.time()

        for train_data, test_data in self.multilabel_stratified_sampling(self.data, n_splits = 4, random_state = 2):
            train_dataset = self.preprocess_data(train_data, self.tokenizer)
            test_dataset = self.preprocess_data(test_data, self.tokenizer)

            model = self.create_model()

            training_args = TrainingArguments(
                output_dir="output",
                learning_rate=learning_rate,
                num_train_epochs=num_train_epochs,
                per_device_train_batch_size=per_device_train_batch_size,
                per_device_eval_batch_size=16,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                logging_dir="logs",
                logging_steps=100,
                logging_strategy="epoch",
                load_best_model_at_end=True,
                metric_for_best_model="f1_micro",
                fp16=True,
                report_to="none"
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                data_collator=self.data_collator,
                tokenizer=self.tokenizer,
                compute_metrics=self.compute_metrics
            )

            print("Using the following hyperparameters: lr=" + str(learning_rate) + " - epochs=" + str(num_train_epochs) + " - batch=" + str(per_device_train_batch_size))

            trainer.train()
            eval_metrics = trainer.evaluate()

            f1_micro_scores.append(eval_metrics["eval_f1_micro"])
            f1_macro_scores.append(eval_metrics["eval_f1_macro"])
            f1_weighted_scores.append(eval_metrics["eval_f1_weighted"])
            accuracy_scores.append(eval_metrics["eval_accuracy"])
            class_f1_scores.append(eval_metrics["eval_class_f1_scores"])
            loss.append(eval_metrics["eval_loss"])
            hamming.append(eval_metrics["eval_hamming_loss"])


        # Calculate runtime
        runtime = time.time() - start_time

        # Store the results in the DataFrame
        self.results_df.loc[len(self.results_df)] = [
            trial.number,
            learning_rate,
            num_train_epochs,
            per_device_train_batch_size,
            runtime,
            np.mean(loss),
            np.mean(hamming),
            np.mean(accuracy_scores),
            np.mean(f1_micro_scores),
            np.mean(f1_macro_scores),
            np.mean(f1_weighted_scores),
            [sum(col) / len(col) for col in zip(*class_f1_scores)],
        ]

        # Save the results as a TSV file
        self.results_df.to_csv(self.result_path, sep="\t",index=False)

        return np.mean(hamming)
            
    def hyperparameterSearch(self, hp_config):
        # Load data
        self.hyperparameters = hp_config
        
        self.data = pd.read_csv(self.data_path, delimiter="\t", index_col=0).reset_index(drop=True)
        self.data.columns = ["text"] + [f"aspect_{i}" for i in range(1, self.data.shape[1])]

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

        # Update the results_df DataFrame
        self.results_df = pd.DataFrame(columns=["trial", "learning_rate", "num_train_epochs", "per_device_train_batch_size", "runtime", 
                                           "loss", "hamming_loss", "accuracy", "f1_micro", "f1_macro", "f1_weighted", "class_f1_scores"])

        # Optuna optimization
        study = optuna.create_study(direction="minimize")
        study.optimize(self.objective, n_trials=self.hyperparameters['num_trials'])

In [None]:
# Which model to tune, where to read the data and where to write the trial results.
model_name = "deepset/gbert-large"
data_path = "./data/complete_re_df_cat_att_pol.tsv"
result_path = "optuna_50_gbert_cat_att_pol.tsv"

# Model identifiers listed by the author (presumably the candidates to try):
# deepset/gbert-large
# dbmdz/bert-base-german-uncased
# distilbert-base-german-cased 

# Search space: [low, high] bounds for epochs and learning rate,
# explicit choices for the per-device batch size.
hyperparameters = dict(
    num_trials=50,
    epochs=[2, 5],
    batch_size=[4, 8, 16],  # Times the number of GPUs
    learning_rate=[1e-5, 9e-5],
)

absa = MultiLabelABSA(data_path=data_path, result_path=result_path, model_name=model_name)
absa.hyperparameterSearch(hp_config=hyperparameters)