In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmph_4dc8lt', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-29 19:31:18.694718: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-29 19:31:18.727133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'BOA'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "books_apparel",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "books",
            "target_domain": "apparel",
            "domain_adapter_name": "mlm_union_apparel",
            "task_adapter_name": "BOAUni",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-BOAUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="BOAUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.46875
val/f1: 0.6382978558540344
val/taskclf_loss: 1.105840802192688
val/loss: 1.5636675357818604
val/mlm_loss: 2.021494150161743


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.856249988079071
val/f1: 0.8580679297447205
val/taskclf_loss: 0.31851133704185486
val/loss: 1.340490460395813
val/mlm_loss: 2.362469434738159


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9319953918457031
val/taskclf_loss: 0.22736044228076935
val/loss: 1.154279112815857
val/mlm_loss: 2.081197738647461


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.93203204870224
val/taskclf_loss: 0.20540420711040497
val/loss: 1.1356010437011719
val/mlm_loss: 2.065797805786133


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9438919425010681
val/taskclf_loss: 0.18828053772449493
val/loss: 1.180930495262146
val/mlm_loss: 2.1735806465148926


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9188298583030701
val/taskclf_loss: 0.2629232406616211
val/loss: 1.2403630018234253
val/mlm_loss: 2.2178027629852295


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9133005142211914
val/taskclf_loss: 0.22487211227416992
val/loss: 1.167874813079834
val/mlm_loss: 2.110877752304077


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9375123977661133
val/taskclf_loss: 0.19533014297485352
val/loss: 1.1769386529922485
val/mlm_loss: 2.1585471630096436


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9313173294067383
val/taskclf_loss: 0.20365355908870697
val/loss: 1.1333469152450562
val/mlm_loss: 2.0630404949188232


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9190523028373718
val/taskclf_loss: 0.21173778176307678
val/loss: 1.1893093585968018
val/mlm_loss: 2.1668808460235596


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9437807202339172
val/taskclf_loss: 0.1948898583650589
val/loss: 1.124892234802246
val/mlm_loss: 2.0548949241638184


`Trainer.fit` stopped: `max_epochs=10` reached.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_18/checkpoints/task-BOAUni-epoch=09-val_loss=1.12.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_18/checkpoints/BOAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.27715274691581726, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9155707955360413, 'source_test/f1_macro': 0.9130429029464722, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.3863564431667328, 'target_test/accuracy': 0.8966346383094788, 'target_test/f1': 0.8962592482566833, 'target_test/f1_macro': 0.893308699131012, 'target_test/f1_micro': 0.8966346383094788}]
Best checkpoint path: checkpoints/lightning_logs/version_18/checkpoints/task-BOAUni-epoch=09-val_loss=1.12.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_18/checkpoints/BOAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.27715274691581726, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9155707955360413, 'source_test/f1_macro': 0.9130429029464722, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.3863564431667328, 'target_test/accuracy': 0.8966346383094788, 'target_test/f1': 0.8962592482566833, 'target_test/f1_macro': 0.893308699131012, 'target_test/f1_micro': 0.8966346383094788}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.3096839487552643, 'source_test/accuracy': 0.8918269276618958, 'source_test/f1': 0.8915635943412781, 'source_test/f1_macro': 0.8888198137283325, 'source_test/f1_micro': 0.8918269276618958, 'target_test/loss': 0.3104872405529022, 'target_test/accuracy': 0.911057710647583, 'target_test/f1': 0.9105370044708252, 'target_test/f1_macro': 0.907889187335968, 'target_test/f1_micro': 0.911057710647583}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.359375
val/f1: 0.41716843843460083
val/taskclf_loss: 1.1013450622558594
val/loss: 1.5626087188720703
val/mlm_loss: 2.0238723754882812


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.862500011920929
val/f1: 0.8643714785575867
val/taskclf_loss: 0.32461434602737427
val/loss: 1.2131372690200806
val/mlm_loss: 2.1016602516174316


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9378722310066223
val/taskclf_loss: 0.2099606990814209
val/loss: 1.1680272817611694
val/mlm_loss: 2.126093626022339


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379457831382751
val/taskclf_loss: 0.20114384591579437
val/loss: 1.1599754095077515
val/mlm_loss: 2.118807077407837


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9314153790473938
val/taskclf_loss: 0.20406703650951385
val/loss: 1.112794280052185
val/mlm_loss: 2.0215213298797607


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9194130301475525
val/taskclf_loss: 0.2188248634338379
val/loss: 1.155664086341858
val/mlm_loss: 2.092503309249878


Validation: |                                                                                                 …

val/accuracy: 0.893750011920929
val/f1: 0.8950662016868591
val/taskclf_loss: 0.37640345096588135
val/loss: 1.2661341428756714
val/mlm_loss: 2.155864715576172


Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8840110898017883
val/taskclf_loss: 0.34290793538093567
val/loss: 1.193067193031311
val/mlm_loss: 2.043226480484009


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9188665747642517
val/taskclf_loss: 0.2537907660007477
val/loss: 1.1481307744979858
val/mlm_loss: 2.042470693588257


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9190395474433899
val/taskclf_loss: 0.24649839103221893
val/loss: 1.1443852186203003
val/mlm_loss: 2.0422720909118652


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.925000011920929
val/f1: 0.9252956509590149
val/taskclf_loss: 0.2534042298793793
val/loss: 1.1290290355682373
val/mlm_loss: 2.0046539306640625


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_19/checkpoints/task-BOAUni-epoch=03-val_loss=1.11.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_19/checkpoints/BOAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.3180871903896332, 'source_test/accuracy': 0.9038462042808533, 'source_test/f1': 0.9036710262298584, 'source_test/f1_macro': 0.9007618427276611, 'source_test/f1_micro': 0.9038462042808533, 'target_test/loss': 0.4584193825721741, 'target_test/accuracy': 0.8725962042808533, 'target_test/f1': 0.8761993050575256, 'target_test/f1_macro': 0.864202082157135, 'target_test/f1_micro': 0.8725962042808533}]
Best checkpoint path: checkpoints/lightning_logs/version_19/checkpoints/task-BOAUni-epoch=03-val_loss=1.11.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_19/checkpoints/BOAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2452647089958191, 'source_test/accuracy': 0.9062500596046448, 'source_test/f1': 0.9055120944976807, 'source_test/f1_macro': 0.9039691090583801, 'source_test/f1_micro': 0.9062500596046448, 'target_test/loss': 0.30182531476020813, 'target_test/accuracy': 0.9014423489570618, 'target_test/f1': 0.9007194638252258, 'target_test/f1_macro': 0.8988335728645325, 'target_test/f1_micro': 0.9014423489570618}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.40781331062316895, 'source_test/accuracy': 0.879807710647583, 'source_test/f1': 0.8803419470787048, 'source_test/f1_macro': 0.8759679198265076, 'source_test/f1_micro': 0.879807710647583, 'target_test/loss': 0.35253405570983887, 'target_test/accuracy': 0.8870192766189575, 'target_test/f1': 0.8875463008880615, 'target_test/f1_macro': 0.8817285895347595, 'target_test/f1_micro': 0.8870192766189575}]


Source dataset length: 1440
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.09375
val/f1: 0.03696741908788681
val/taskclf_loss: 1.1342346668243408
val/loss: 1.6749227046966553
val/mlm_loss: 2.2156105041503906


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8812500238418579
val/f1: 0.8825437426567078
val/taskclf_loss: 0.2860433757305145
val/loss: 1.2341676950454712
val/mlm_loss: 2.1822919845581055


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9257515072822571
val/taskclf_loss: 0.23958027362823486
val/loss: 1.1666473150253296
val/mlm_loss: 2.093714475631714


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9257515072822571
val/taskclf_loss: 0.2216780185699463
val/loss: 1.1913923025131226
val/mlm_loss: 2.161106824874878


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9315380454063416
val/taskclf_loss: 0.2186005413532257
val/loss: 1.113396406173706
val/mlm_loss: 2.008192300796509


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9138792157173157
val/taskclf_loss: 0.3037137985229492
val/loss: 1.1794253587722778
val/mlm_loss: 2.0551366806030273


Validation: |                                                                                                 …

val/accuracy: 0.875
val/f1: 0.8763969540596008
val/taskclf_loss: 0.32471275329589844
val/loss: 1.1979938745498657
val/mlm_loss: 2.071274995803833


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9313562512397766
val/taskclf_loss: 0.21684765815734863
val/loss: 1.164310336112976
val/mlm_loss: 2.1117732524871826


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9313562512397766
val/taskclf_loss: 0.2205583155155182
val/loss: 1.1676266193389893
val/mlm_loss: 2.1146950721740723


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9251859784126282
val/taskclf_loss: 0.22092628479003906
val/loss: 1.1935341358184814
val/mlm_loss: 2.166141986846924


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9251859784126282
val/taskclf_loss: 0.22575633227825165
val/loss: 1.0956989526748657
val/mlm_loss: 1.9656416177749634


`Trainer.fit` stopped: `max_epochs=10` reached.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_20/checkpoints/task-BOAUni-epoch=09-val_loss=1.10.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_20/checkpoints/BOAUni-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.3076305389404297, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.9227647185325623, 'source_test/f1_macro': 0.9206826686859131, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.43344566226005554, 'target_test/accuracy': 0.8918269276618958, 'target_test/f1': 0.8922836184501648, 'target_test/f1_macro': 0.8875651955604553, 'target_test/f1_micro': 0.8918269276618958}]
Best checkpoint path: checkpoints/lightning_logs/version_20/checkpoints/task-BOAUni-epoch=09-val_loss=1.10.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_20/checkpoints/BOAUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.3076305389404297, 'source_test/accuracy': 0.9230769872665405, 'source_test/f1': 0.9227647185325623, 'source_test/f1_macro': 0.9206826686859131, 'source_test/f1_micro': 0.9230769872665405, 'target_test/loss': 0.43344566226005554, 'target_test/accuracy': 0.8918269276618958, 'target_test/f1': 0.8922836184501648, 'target_test/f1_macro': 0.8875651955604553, 'target_test/f1_micro': 0.8918269276618958}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.34967532753944397, 'source_test/accuracy': 0.8725962042808533, 'source_test/f1': 0.8738097548484802, 'source_test/f1_macro': 0.8672962784767151, 'source_test/f1_micro': 0.8725962042808533, 'target_test/loss': 0.3149769604206085, 'target_test/accuracy': 0.8822115659713745, 'target_test/f1': 0.8822149634361267, 'target_test/f1_macro': 0.87750643491745, 'target_test/f1_micro': 0.8822115659713745}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.27715274691581726, 0.3180871903896332, 0.3076305389404297], 'source_test/accuracy': [0.915865421295166, 0.9038462042808533, 0.9230769872665405], 'source_test/f1': [0.9155707955360413, 0.9036710262298584, 0.9227647185325623], 'source_test/f1_macro': [0.9130429029464722, 0.9007618427276611, 0.9206826686859131], 'source_test/f1_micro': [0.915865421295166, 0.9038462042808533, 0.9230769872665405], 'target_test/loss': [0.3863564431667328, 0.4584193825721741, 0.43344566226005554], 'target_test/accuracy': [0.8966346383094788, 0.8725962042808533, 0.8918269276618958], 'target_test/f1': [0.8962592482566833, 0.8761993050575256, 0.8922836184501648], 'target_test/f1_macro': [0.893308699131012, 0.864202082157135, 0.8875651955604553], 'target_test/f1_micro': [0.8966346383094788, 0.8725962042808533, 0.8918269276618958]}), ('best_model', {'source_test/loss': [0.27715274691581726, 0.2452647089958191, 0.3076305389404297], 'source_test/accuracy': [0.915865

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.3009568254152934, 'source_test/accuracy': 0.9142628709475199, 'source_test/f1': 0.9140021800994873, 'source_test/f1_macro': 0.9114958047866821, 'source_test/f1_micro': 0.9142628709475199, 'target_test/loss': 0.4260738293329875, 'target_test/accuracy': 0.8870192567507426, 'target_test/f1': 0.8882473905881246, 'target_test/f1_macro': 0.8816919922828674, 'target_test/f1_micro': 0.8870192567507426}, 'best_model': {'source_test/loss': 0.27668266495068866, 'source_test/accuracy': 0.9150641560554504, 'source_test/f1': 0.9146158695220947, 'source_test/f1_macro': 0.9125648935635885, 'source_test/f1_micro': 0.9150641560554504, 'target_test/loss': 0.37387580672899884, 'target_test/accuracy': 0.8966346383094788, 'target_test/f1': 0.8964207768440247, 'target_test/f1_macro': 0.8932358225186666, 'target_test/f1_micro': 0.8966346383094788}, 'epoch_saved': {'source_test/loss': 0.3557241956392924, 'source_test/accuracy': 0.881410280863444, 'source_test

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf