In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpilr_tz0w', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-28 12:06:57.400238: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-28 12:06:57.431117: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'BAMR'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "baby_MR",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "baby",
            "target_domain": "MR",
            "domain_adapter_name": "mlm_union_mr",
            "task_adapter_name": "BAMRUni",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-BAMRUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="BAMRUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.4375
val/f1: 0.6080378293991089
val/taskclf_loss: 1.1097078323364258
val/loss: 1.6515218019485474
val/mlm_loss: 2.159472703933716


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.8439604640007019
val/taskclf_loss: 0.4149326980113983
val/loss: 1.489432454109192
val/mlm_loss: 2.4967758655548096


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8688709139823914
val/taskclf_loss: 0.31212952733039856
val/loss: 1.2255337238311768
val/mlm_loss: 2.081850051879883


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.856055736541748
val/taskclf_loss: 0.31303611397743225
val/loss: 1.0972684621810913
val/mlm_loss: 1.8324863910675049


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8625532984733582
val/taskclf_loss: 0.2936737537384033
val/loss: 1.2733261585235596
val/mlm_loss: 2.1917502880096436


Validation: |                                                                                                 …

val/accuracy: 0.8500000238418579
val/f1: 0.850707471370697
val/taskclf_loss: 0.48732826113700867
val/loss: 1.3327009677886963
val/mlm_loss: 2.1252381801605225


Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.8446053862571716
val/taskclf_loss: 0.5645343661308289
val/loss: 1.3727556467056274
val/mlm_loss: 2.130463123321533


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566280603408813
val/taskclf_loss: 0.36356446146965027
val/loss: 1.3367737531661987
val/mlm_loss: 2.249157428741455


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566280603408813
val/taskclf_loss: 0.38848015666007996
val/loss: 1.339909553527832
val/mlm_loss: 2.231874704360962


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566280603408813
val/taskclf_loss: 0.408048152923584
val/loss: 1.2759088277816772
val/mlm_loss: 2.0895280838012695


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.856249988079071
val/f1: 0.8566280603408813
val/taskclf_loss: 0.41102147102355957
val/loss: 1.4167641401290894
val/mlm_loss: 2.3596479892730713


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/task-BAMRUni-epoch=02-val_loss=1.10.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/BAMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.22126220166683197, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9355105757713318, 'source_test/f1_macro': 0.9293885231018066, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.5666314363479614, 'target_test/accuracy': 0.7620192766189575, 'target_test/f1': 0.7622856497764587, 'target_test/f1_macro': 0.7569375038146973, 'target_test/f1_micro': 0.7620192766189575}]
Best checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/task-BAMRUni-epoch=02-val_loss=1.10.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/BAMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2089785486459732, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9161143898963928, 'source_test/f1_macro': 0.90845787525177, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.5439696907997131, 'target_test/accuracy': 0.75, 'target_test/f1': 0.7523181438446045, 'target_test/f1_macro': 0.7437962889671326, 'target_test/f1_micro': 0.75}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.2799225449562073, 'source_test/accuracy': 0.8942307829856873, 'source_test/f1': 0.8944777250289917, 'source_test/f1_macro': 0.8864961266517639, 'source_test/f1_micro': 0.8942307829856873, 'target_test/loss': 0.7494023442268372, 'target_test/accuracy': 0.723557710647583, 'target_test/f1': 0.7345360517501831, 'target_test/f1_macro': 0.7111196517944336, 'target_test/f1_micro': 0.723557710647583}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.40625
val/f1: 0.47630566358566284
val/taskclf_loss: 1.095911979675293
val/loss: 2.1397571563720703
val/mlm_loss: 3.1183624267578125


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8500000238418579
val/f1: 0.8501921892166138
val/taskclf_loss: 0.3968508243560791
val/loss: 1.3513721227645874
val/mlm_loss: 2.2462358474731445


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8561772704124451
val/taskclf_loss: 0.31877192854881287
val/loss: 1.117201805114746
val/mlm_loss: 1.8657299280166626


Validation: |                                                                                                 …

val/accuracy: 0.8500000238418579
val/f1: 0.850195050239563
val/taskclf_loss: 0.36768198013305664
val/loss: 1.410003662109375
val/mlm_loss: 2.3871803283691406


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8562885522842407
val/taskclf_loss: 0.32222920656204224
val/loss: 1.3336594104766846
val/mlm_loss: 2.2818753719329834


Validation: |                                                                                                 …

val/accuracy: 0.84375
val/f1: 0.8443613052368164
val/taskclf_loss: 0.5617190599441528
val/loss: 1.3472384214401245
val/mlm_loss: 2.083662748336792


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566654324531555
val/taskclf_loss: 0.3623936176300049
val/loss: 1.3308305740356445
val/mlm_loss: 2.2387404441833496


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8566654324531555
val/taskclf_loss: 0.3677801787853241
val/loss: 1.349737524986267
val/mlm_loss: 2.270322799682617


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8565239310264587
val/taskclf_loss: 0.3800317347049713
val/loss: 1.3091405630111694
val/mlm_loss: 2.180180311203003


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8565239310264587
val/taskclf_loss: 0.3774048984050751
val/loss: 1.278159499168396
val/mlm_loss: 2.12261700630188


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8500000238418579
val/f1: 0.8505447506904602
val/taskclf_loss: 0.37552186846733093
val/loss: 1.2548288106918335
val/mlm_loss: 2.079179286956787


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/task-BAMRUni-epoch=01-val_loss=1.12.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/BAMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.1937747746706009, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9352640509605408, 'source_test/f1_macro': 0.9292432069778442, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.5428446531295776, 'target_test/accuracy': 0.7836538553237915, 'target_test/f1': 0.7838585376739502, 'target_test/f1_macro': 0.7796016931533813, 'target_test/f1_micro': 0.7836538553237915}]
Best checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/task-BAMRUni-epoch=01-val_loss=1.12.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/BAMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.23590029776096344, 'source_test/accuracy': 0.9038462042808533, 'source_test/f1': 0.9040473103523254, 'source_test/f1_macro': 0.8958268165588379, 'source_test/f1_micro': 0.9038462042808533, 'target_test/loss': 0.5181047916412354, 'target_test/accuracy': 0.7620192766189575, 'target_test/f1': 0.763387143611908, 'target_test/f1_macro': 0.7569631934165955, 'target_test/f1_micro': 0.7620192766189575}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.18866018950939178, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9352640509605408, 'source_test/f1_macro': 0.9292432069778442, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.5188104510307312, 'target_test/accuracy': 0.790865421295166, 'target_test/f1': 0.7909207344055176, 'target_test/f1_macro': 0.7869389057159424, 'target_test/f1_micro': 0.790865421295166}]


Source dataset length: 1350
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.046875
val/f1: 0.01732456125319004
val/taskclf_loss: 1.1648356914520264
val/loss: 1.830263614654541
val/mlm_loss: 2.4541025161743164


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.831250011920929
val/f1: 0.8318657279014587
val/taskclf_loss: 0.39629408717155457
val/loss: 1.5180885791778564
val/mlm_loss: 2.5697708129882812


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8812188506126404
val/taskclf_loss: 0.3170977234840393
val/loss: 1.3635236024856567
val/mlm_loss: 2.344547986984253


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.856055736541748
val/taskclf_loss: 0.3422779142856598
val/loss: 1.3034042119979858
val/mlm_loss: 2.2044601440429688


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8690189719200134
val/taskclf_loss: 0.29843077063560486
val/loss: 1.3735932111740112
val/mlm_loss: 2.3815581798553467


Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8563904166221619
val/taskclf_loss: 0.44666776061058044
val/loss: 1.323683261871338
val/mlm_loss: 2.1458852291107178


Validation: |                                                                                                 …

val/accuracy: 0.831250011920929
val/f1: 0.833469569683075
val/taskclf_loss: 0.6789659857749939
val/loss: 1.3136646747589111
val/mlm_loss: 1.908694863319397


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8818427920341492
val/taskclf_loss: 0.35007452964782715
val/loss: 1.2297725677490234
val/mlm_loss: 2.0544896125793457


Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.863124668598175
val/taskclf_loss: 0.3768443763256073
val/loss: 1.3933343887329102
val/mlm_loss: 2.3462941646575928


Validation: |                                                                                                 …

val/accuracy: 0.8687500357627869
val/f1: 0.8693313598632812
val/taskclf_loss: 0.38474446535110474
val/loss: 1.2179043292999268
val/mlm_loss: 1.998991847038269


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.862500011920929
val/f1: 0.8632106781005859
val/taskclf_loss: 0.3967437744140625
val/loss: 1.2898285388946533
val/mlm_loss: 2.1270956993103027


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-BAMRUni-epoch=08-val_loss=1.22.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/BAMRUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.21652288734912872, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9354134798049927, 'source_test/f1_macro': 0.9293866753578186, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.569366991519928, 'target_test/accuracy': 0.7716346383094788, 'target_test/f1': 0.7716100215911865, 'target_test/f1_macro': 0.76749587059021, 'target_test/f1_micro': 0.7716346383094788}]
Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-BAMRUni-epoch=08-val_loss=1.22.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/BAMRUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.21147790551185608, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9354134798049927, 'source_test/f1_macro': 0.9293866753578186, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.5584709644317627, 'target_test/accuracy': 0.7716346383094788, 'target_test/f1': 0.7716100215911865, 'target_test/f1_macro': 0.76749587059021, 'target_test/f1_micro': 0.7716346383094788}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.36785098910331726, 'source_test/accuracy': 0.879807710647583, 'source_test/f1': 0.8810542225837708, 'source_test/f1_macro': 0.870366632938385, 'source_test/f1_micro': 0.879807710647583, 'target_test/loss': 0.7285793423652649, 'target_test/accuracy': 0.7355769276618958, 'target_test/f1': 0.7441346645355225, 'target_test/f1_macro': 0.7253203988075256, 'target_test/f1_micro': 0.7355769276618958}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.22126220166683197, 0.1937747746706009, 0.21652288734912872], 'source_test/accuracy': [0.9350962042808533, 0.9350962042808533, 0.9350962042808533], 'source_test/f1': [0.9355105757713318, 0.9352640509605408, 0.9354134798049927], 'source_test/f1_macro': [0.9293885231018066, 0.9292432069778442, 0.9293866753578186], 'source_test/f1_micro': [0.9350962042808533, 0.9350962042808533, 0.9350962042808533], 'target_test/loss': [0.5666314363479614, 0.5428446531295776, 0.569366991519928], 'target_test/accuracy': [0.7620192766189575, 0.7836538553237915, 0.7716346383094788], 'target_test/f1': [0.7622856497764587, 0.7838585376739502, 0.7716100215911865], 'target_test/f1_macro': [0.7569375038146973, 0.7796016931533813, 0.76749587059021], 'target_test/f1_micro': [0.7620192766189575, 0.7836538553237915, 0.7716346383094788]}), ('best_model', {'source_test/loss': [0.2089785486459732, 0.23590029776096344, 0.21147790551185608], 'source_test/accuracy': [0.9158

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.2105199545621872, 'source_test/accuracy': 0.9350962042808533, 'source_test/f1': 0.9353960355122884, 'source_test/f1_macro': 0.9293394684791565, 'source_test/f1_micro': 0.9350962042808533, 'target_test/loss': 0.559614360332489, 'target_test/accuracy': 0.7724359234174093, 'target_test/f1': 0.7725847363471985, 'target_test/f1_macro': 0.7680116891860962, 'target_test/f1_micro': 0.7724359234174093}, 'best_model': {'source_test/loss': 0.2187855839729309, 'source_test/accuracy': 0.9182692766189575, 'source_test/f1': 0.9185250600179037, 'source_test/f1_macro': 0.9112237890561422, 'source_test/f1_micro': 0.9182692766189575, 'target_test/loss': 0.5401818156242371, 'target_test/accuracy': 0.7612179716428121, 'target_test/f1': 0.762438436349233, 'target_test/f1_macro': 0.7560851176579794, 'target_test/f1_micro': 0.7612179716428121}, 'epoch_saved': {'source_test/loss': 0.2788112411896388, 'source_test/accuracy': 0.9030448993047079, 'source_test/f1

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf