In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpqjqr3_vs', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-28 13:52:20.898244: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 13:52:20.929785: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'MRBO'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "books_MR",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "books",
            "target_domain": "MR",
            "domain_adapter_name": "mlm_union_mr",
            "task_adapter_name": "task_BOMRUni",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-BOMRUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="BOMRUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.46875
val/f1: 0.6382978558540344
val/taskclf_loss: 1.0739184617996216
val/loss: 1.7595893144607544
val/mlm_loss: 2.4452600479125977


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.893750011920929
val/f1: 0.8961753249168396
val/taskclf_loss: 0.3620692193508148
val/loss: 1.282037615776062
val/mlm_loss: 2.2020058631896973


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9200612902641296
val/taskclf_loss: 0.22171473503112793
val/loss: 1.160902738571167
val/mlm_loss: 2.100090503692627


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9138299226760864
val/taskclf_loss: 0.23137736320495605
val/loss: 1.2225149869918823
val/mlm_loss: 2.2136526107788086


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9263297915458679
val/taskclf_loss: 0.2243112176656723
val/loss: 1.2116845846176147
val/mlm_loss: 2.1990578174591064


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379092454910278
val/taskclf_loss: 0.19478635489940643
val/loss: 1.2555183172225952
val/mlm_loss: 2.3162503242492676


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379092454910278
val/taskclf_loss: 0.18331141769886017
val/loss: 1.2710250616073608
val/mlm_loss: 2.358738660812378


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379092454910278
val/taskclf_loss: 0.18757469952106476
val/loss: 1.1311912536621094
val/mlm_loss: 2.07480788230896


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379092454910278
val/taskclf_loss: 0.18806146085262299
val/loss: 1.1245592832565308
val/mlm_loss: 2.0610568523406982


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9441531300544739
val/taskclf_loss: 0.18807151913642883
val/loss: 1.0917876958847046
val/mlm_loss: 1.9955040216445923


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9375
val/f1: 0.9379092454910278
val/taskclf_loss: 0.19103939831256866
val/loss: 1.323960781097412
val/mlm_loss: 2.4568822383880615


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_12/checkpoints/task-BOMRUni-epoch=08-val_loss=1.09.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_12/checkpoints/BOMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.24456702172756195, 'source_test/accuracy': 0.9134615659713745, 'source_test/f1': 0.9128592610359192, 'source_test/f1_macro': 0.911518931388855, 'source_test/f1_micro': 0.9134615659713745, 'target_test/loss': 0.4860301613807678, 'target_test/accuracy': 0.8245192766189575, 'target_test/f1': 0.8252638578414917, 'target_test/f1_macro': 0.8198986649513245, 'target_test/f1_micro': 0.8245192766189575}]
Best checkpoint path: checkpoints/lightning_logs/version_12/checkpoints/task-BOMRUni-epoch=08-val_loss=1.09.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_12/checkpoints/BOMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.24106143414974213, 'source_test/accuracy': 0.9134615659713745, 'source_test/f1': 0.9130666255950928, 'source_test/f1_macro': 0.9114319086074829, 'source_test/f1_micro': 0.9134615659713745, 'target_test/loss': 0.47656431794166565, 'target_test/accuracy': 0.8197115659713745, 'target_test/f1': 0.820267915725708, 'target_test/f1_macro': 0.815007746219635, 'target_test/f1_micro': 0.8197115659713745}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.2308657467365265, 'source_test/accuracy': 0.9086538553237915, 'source_test/f1': 0.9080750942230225, 'source_test/f1_macro': 0.9061116576194763, 'source_test/f1_micro': 0.9086538553237915, 'target_test/loss': 0.4622054696083069, 'target_test/accuracy': 0.8197115659713745, 'target_test/f1': 0.820434033870697, 'target_test/f1_macro': 0.8150073289871216, 'target_test/f1_micro': 0.8197115659713745}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.453125
val/f1: 0.6139858961105347
val/taskclf_loss: 1.096156358718872
val/loss: 1.627814531326294
val/mlm_loss: 2.159472703933716


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8769897818565369
val/taskclf_loss: 0.3563483655452728
val/loss: 1.4322398900985718
val/mlm_loss: 2.508131504058838


Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.9014660716056824
val/taskclf_loss: 0.28288063406944275
val/loss: 1.188454508781433
val/mlm_loss: 2.0940282344818115


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9138913154602051
val/taskclf_loss: 0.2521074116230011
val/loss: 1.0408456325531006
val/mlm_loss: 1.829583764076233


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9188419580459595
val/taskclf_loss: 0.19830089807510376
val/loss: 1.199814796447754
val/mlm_loss: 2.201328754425049


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.912524402141571
val/taskclf_loss: 0.23530851304531097
val/loss: 1.1851776838302612
val/mlm_loss: 2.1350467205047607


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9140772223472595
val/taskclf_loss: 0.23024757206439972
val/loss: 1.1781202554702759
val/mlm_loss: 2.125992774963379


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9376603364944458
val/taskclf_loss: 0.20344319939613342
val/loss: 1.216905951499939
val/mlm_loss: 2.2303688526153564


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9376603364944458
val/taskclf_loss: 0.2102510929107666
val/loss: 1.214324951171875
val/mlm_loss: 2.2183988094329834


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9252725839614868
val/taskclf_loss: 0.21154780685901642
val/loss: 1.1529568433761597
val/mlm_loss: 2.0943658351898193


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.925000011920929
val/f1: 0.9252725839614868
val/taskclf_loss: 0.21062521636486053
val/loss: 1.2814816236495972
val/mlm_loss: 2.352337598800659


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_13/checkpoints/task-BOMRUni-epoch=02-val_loss=1.04.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_13/checkpoints/BOMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.27550700306892395, 'source_test/accuracy': 0.9134615659713745, 'source_test/f1': 0.9128262996673584, 'source_test/f1_macro': 0.9114868640899658, 'source_test/f1_micro': 0.9134615659713745, 'target_test/loss': 0.4968617558479309, 'target_test/accuracy': 0.8197115659713745, 'target_test/f1': 0.8208510875701904, 'target_test/f1_macro': 0.8141714334487915, 'target_test/f1_micro': 0.8197115659713745}]
Best checkpoint path: checkpoints/lightning_logs/version_13/checkpoints/task-BOMRUni-epoch=02-val_loss=1.04.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_13/checkpoints/BOMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2606770694255829, 'source_test/accuracy': 0.8870192766189575, 'source_test/f1': 0.8868027925491333, 'source_test/f1_macro': 0.8838786482810974, 'source_test/f1_micro': 0.8870192766189575, 'target_test/loss': 0.4777698218822479, 'target_test/accuracy': 0.7884615659713745, 'target_test/f1': 0.7905043959617615, 'target_test/f1_macro': 0.7817333340644836, 'target_test/f1_micro': 0.7884615659713745}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.26079031825065613, 'source_test/accuracy': 0.911057710647583, 'source_test/f1': 0.9106815457344055, 'source_test/f1_macro': 0.9088802933692932, 'source_test/f1_micro': 0.911057710647583, 'target_test/loss': 0.5243277549743652, 'target_test/accuracy': 0.7980769276618958, 'target_test/f1': 0.8010111451148987, 'target_test/f1_macro': 0.7899181246757507, 'target_test/f1_micro': 0.7980769276618958}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.40625
val/f1: 0.4985795319080353
val/taskclf_loss: 1.0878044366836548
val/loss: 2.103083372116089
val/mlm_loss: 3.1183624267578125


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8837844729423523
val/taskclf_loss: 0.32002687454223633
val/loss: 1.2838690280914307
val/mlm_loss: 2.247711181640625


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9326103329658508
val/taskclf_loss: 0.2212400883436203
val/loss: 1.0398225784301758
val/mlm_loss: 1.8584051132202148


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9138913154602051
val/taskclf_loss: 0.24732457101345062
val/loss: 1.3105783462524414
val/mlm_loss: 2.3738319873809814


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9445198178291321
val/taskclf_loss: 0.19073529541492462
val/loss: 1.2317208051681519
val/mlm_loss: 2.2727062702178955


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9190379977226257
val/taskclf_loss: 0.1996711641550064
val/loss: 1.140588402748108
val/mlm_loss: 2.08150577545166


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9315395355224609
val/taskclf_loss: 0.19493930041790009
val/loss: 1.2177006006240845
val/mlm_loss: 2.240462064743042


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9315395355224609
val/taskclf_loss: 0.19766728579998016
val/loss: 1.2327470779418945
val/mlm_loss: 2.267826795578003


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9252711534500122
val/taskclf_loss: 0.19921059906482697
val/loss: 1.186657428741455
val/mlm_loss: 2.1741044521331787


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9252711534500122
val/taskclf_loss: 0.19900844991207123
val/loss: 1.1620107889175415
val/mlm_loss: 2.1250133514404297


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9312500357627869
val/f1: 0.9315395355224609
val/taskclf_loss: 0.19864845275878906
val/loss: 1.1426392793655396
val/mlm_loss: 2.086630344390869


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_14/checkpoints/task-BOMRUni-epoch=01-val_loss=1.04.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_14/checkpoints/BOMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.26528745889663696, 'source_test/accuracy': 0.911057710647583, 'source_test/f1': 0.9106523394584656, 'source_test/f1_macro': 0.9086503386497498, 'source_test/f1_micro': 0.911057710647583, 'target_test/loss': 0.4786021411418915, 'target_test/accuracy': 0.822115421295166, 'target_test/f1': 0.8224901556968689, 'target_test/f1_macro': 0.817767858505249, 'target_test/f1_micro': 0.822115421295166}]
Best checkpoint path: checkpoints/lightning_logs/version_14/checkpoints/task-BOMRUni-epoch=01-val_loss=1.04.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_14/checkpoints/BOMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2540753185749054, 'source_test/accuracy': 0.889423131942749, 'source_test/f1': 0.8888266682624817, 'source_test/f1_macro': 0.8864423632621765, 'source_test/f1_micro': 0.889423131942749, 'target_test/loss': 0.4719845950603485, 'target_test/accuracy': 0.795673131942749, 'target_test/f1': 0.7968828678131104, 'target_test/f1_macro': 0.7903944253921509, 'target_test/f1_micro': 0.795673131942749}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.2607662081718445, 'source_test/accuracy': 0.9182692766189575, 'source_test/f1': 0.9179616570472717, 'source_test/f1_macro': 0.9158291816711426, 'source_test/f1_micro': 0.9182692766189575, 'target_test/loss': 0.4786612391471863, 'target_test/accuracy': 0.8149038553237915, 'target_test/f1': 0.8153713345527649, 'target_test/f1_macro': 0.8105397820472717, 'target_test/f1_micro': 0.8149038553237915}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.24456702172756195, 0.27550700306892395, 0.26528745889663696], 'source_test/accuracy': [0.9134615659713745, 0.9134615659713745, 0.911057710647583], 'source_test/f1': [0.9128592610359192, 0.9128262996673584, 0.9106523394584656], 'source_test/f1_macro': [0.911518931388855, 0.9114868640899658, 0.9086503386497498], 'source_test/f1_micro': [0.9134615659713745, 0.9134615659713745, 0.911057710647583], 'target_test/loss': [0.4860301613807678, 0.4968617558479309, 0.4786021411418915], 'target_test/accuracy': [0.8245192766189575, 0.8197115659713745, 0.822115421295166], 'target_test/f1': [0.8252638578414917, 0.8208510875701904, 0.8224901556968689], 'target_test/f1_macro': [0.8198986649513245, 0.8141714334487915, 0.817767858505249], 'target_test/f1_micro': [0.8245192766189575, 0.8197115659713745, 0.822115421295166]}), ('best_model', {'source_test/loss': [0.24106143414974213, 0.2606770694255829, 0.2540753185749054], 'source_test/accuracy': [0.9134615

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.26178716123104095, 'source_test/accuracy': 0.912660280863444, 'source_test/f1': 0.9121126333872477, 'source_test/f1_macro': 0.9105520447095236, 'source_test/f1_micro': 0.912660280863444, 'target_test/loss': 0.4871646861235301, 'target_test/accuracy': 0.822115421295166, 'target_test/f1': 0.8228683670361837, 'target_test/f1_macro': 0.817279318968455, 'target_test/f1_micro': 0.822115421295166}, 'best_model': {'source_test/loss': 0.25193794071674347, 'source_test/accuracy': 0.8966346581776937, 'source_test/f1': 0.896232028802236, 'source_test/f1_macro': 0.8939176400502523, 'source_test/f1_micro': 0.8966346581776937, 'target_test/loss': 0.47543957829475403, 'target_test/accuracy': 0.8012820879618326, 'target_test/f1': 0.8025517265001932, 'target_test/f1_macro': 0.7957118352254232, 'target_test/f1_micro': 0.8012820879618326}, 'epoch_saved': {'source_test/loss': 0.25080742438634235, 'source_test/accuracy': 0.912660280863444, 'source_test/f1'

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf