In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

#dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmplcocuttb', '/home/guest/Desktop/projects/third-experiments/SDA_experiments/modules']


2024-09-29 23:54:26.326624: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-29 23:54:26.358657: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [10, 100, 1000]  # List of seeds
project_name = 'SDA_mixed_edited'  # Replace with your wandb project name
domain = 'CA'  # Replace with the specific domain for this notebook
type = 'unipelt'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "camera_photo_apparel",
            "dataset_cache_dir": "./../../datasets",
            "pretrained_model_name": "bert-base-uncased",
            "padding": True,
            "source_domain": "camera_photo",
            "target_domain": "apparel",
            "domain_adapter_name": "mlm_unipelt_apparel",
            "task_adapter_name": "CAPelt",
            "padding": "max_length",
            "max_seq_length": 512,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-CAPelt-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="CAPelt-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            #precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 10




Source dataset length: 1437
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.03125
val/f1: 0.011363636702299118
val/taskclf_loss: 1.1402403116226196
val/loss: 1.5536015033721924
val/mlm_loss: 1.9661014080047607


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.9000000357627869
val/f1: 0.9015825390815735
val/taskclf_loss: 0.2680310606956482
val/loss: 1.22075617313385
val/mlm_loss: 2.171496629714966


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.931434154510498
val/taskclf_loss: 0.23651151359081268
val/loss: 1.2210445404052734
val/mlm_loss: 2.203526496887207


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9133575558662415
val/taskclf_loss: 0.23976722359657288
val/loss: 1.190489411354065
val/mlm_loss: 2.1392312049865723


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379278421401978
val/taskclf_loss: 0.2409256249666214
val/loss: 1.2046865224838257
val/mlm_loss: 2.1664397716522217


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9318413138389587
val/taskclf_loss: 0.24164417386054993
val/loss: 1.2100147008895874
val/mlm_loss: 2.176367998123169


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9318413138389587
val/taskclf_loss: 0.27204301953315735
val/loss: 1.1944559812545776
val/mlm_loss: 2.1149470806121826


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9381217956542969
val/taskclf_loss: 0.23262138664722443
val/loss: 1.1764487028121948
val/mlm_loss: 2.118309736251831


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.9502760767936707
val/taskclf_loss: 0.22019349038600922
val/loss: 1.1911648511886597
val/mlm_loss: 2.160113573074341


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9442083239555359
val/taskclf_loss: 0.22469452023506165
val/loss: 1.1672059297561646
val/mlm_loss: 2.1077537536621094


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.9502760767936707
val/taskclf_loss: 0.2206280678510666
val/loss: 1.1260322332382202
val/mlm_loss: 2.029550313949585


`Trainer.fit` stopped: `max_epochs=10` reached.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_33/checkpoints/task-CAPelt-epoch=09-val_loss=1.13.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_33/checkpoints/CAPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2682103216648102, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9159426093101501, 'source_test/f1_macro': 0.9130035042762756, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.3169304430484772, 'target_test/accuracy': 0.9134615659713745, 'target_test/f1': 0.9132021069526672, 'target_test/f1_macro': 0.9102993607521057, 'target_test/f1_micro': 0.9134615659713745}]
Best checkpoint path: checkpoints/lightning_logs/version_33/checkpoints/task-CAPelt-epoch=09-val_loss=1.13.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_33/checkpoints/CAPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2682103216648102, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9159426093101501, 'source_test/f1_macro': 0.9130035042762756, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.3169304430484772, 'target_test/accuracy': 0.9134615659713745, 'target_test/f1': 0.9132021069526672, 'target_test/f1_macro': 0.9102993607521057, 'target_test/f1_micro': 0.9134615659713745}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.3113294243812561, 'source_test/accuracy': 0.8990384936332703, 'source_test/f1': 0.8989828824996948, 'source_test/f1_macro': 0.8960968255996704, 'source_test/f1_micro': 0.8990384936332703, 'target_test/loss': 0.35281720757484436, 'target_test/accuracy': 0.9062500596046448, 'target_test/f1': 0.9055940508842468, 'target_test/f1_macro': 0.9037421345710754, 'target_test/f1_micro': 0.9062500596046448}]


Source dataset length: 1437
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.0
val/f1: 0.0
val/taskclf_loss: 1.1697442531585693
val/loss: 1.5555758476257324
val/mlm_loss: 1.9406037330627441


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.90625
val/f1: 0.9079031348228455
val/taskclf_loss: 0.29096490144729614
val/loss: 1.188110589981079
val/mlm_loss: 2.0833873748779297


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9128909111022949
val/taskclf_loss: 0.24793890118598938
val/loss: 1.1811916828155518
val/mlm_loss: 2.1125001907348633


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9316224455833435
val/taskclf_loss: 0.22809112071990967
val/loss: 1.1576863527297974
val/mlm_loss: 2.0853450298309326


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9439706802368164
val/taskclf_loss: 0.23577995598316193
val/loss: 1.1143625974655151
val/mlm_loss: 1.9911152124404907


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9316224455833435
val/taskclf_loss: 0.24935178458690643
val/loss: 1.2064228057861328
val/mlm_loss: 2.1614997386932373


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9379029273986816
val/taskclf_loss: 0.24748431146144867
val/loss: 1.1686642169952393
val/mlm_loss: 2.0879249572753906


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9439706802368164
val/taskclf_loss: 0.2439180463552475
val/loss: 1.1376994848251343
val/mlm_loss: 2.029618978500366


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9441690444946289
val/taskclf_loss: 0.231789693236351
val/loss: 1.1482585668563843
val/mlm_loss: 2.0628182888031006


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9441690444946289
val/taskclf_loss: 0.23674486577510834
val/loss: 1.2076025009155273
val/mlm_loss: 2.1764373779296875


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9375
val/f1: 0.9381014108657837
val/taskclf_loss: 0.2445722073316574
val/loss: 1.1422648429870605
val/mlm_loss: 2.0380873680114746


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_34/checkpoints/task-CAPelt-epoch=03-val_loss=1.11.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_34/checkpoints/CAPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.2915090322494507, 'source_test/accuracy': 0.911057710647583, 'source_test/f1': 0.9110167622566223, 'source_test/f1_macro': 0.9087215065956116, 'source_test/f1_micro': 0.911057710647583, 'target_test/loss': 0.32880836725234985, 'target_test/accuracy': 0.911057710647583, 'target_test/f1': 0.9104647040367126, 'target_test/f1_macro': 0.9083486199378967, 'target_test/f1_micro': 0.911057710647583}]
Best checkpoint path: checkpoints/lightning_logs/version_34/checkpoints/task-CAPelt-epoch=03-val_loss=1.11.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_34/checkpoints/CAPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.2922724187374115, 'source_test/accuracy': 0.889423131942749, 'source_test/f1': 0.8892552852630615, 'source_test/f1_macro': 0.8869093656539917, 'source_test/f1_micro': 0.889423131942749, 'target_test/loss': 0.29232892394065857, 'target_test/accuracy': 0.920673131942749, 'target_test/f1': 0.9200807809829712, 'target_test/f1_macro': 0.9182721376419067, 'target_test/f1_micro': 0.920673131942749}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 1000


Test Results on saved epoch: [{'source_test/loss': 0.27498239278793335, 'source_test/accuracy': 0.915865421295166, 'source_test/f1': 0.9158243536949158, 'source_test/f1_macro': 0.9134754538536072, 'source_test/f1_micro': 0.915865421295166, 'target_test/loss': 0.3245808184146881, 'target_test/accuracy': 0.911057710647583, 'target_test/f1': 0.9105122089385986, 'target_test/f1_macro': 0.9081137776374817, 'target_test/f1_micro': 0.911057710647583}]


Source dataset length: 1437
Target dataset length: 1440




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 122 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
12.9 M    Trainable params
109 M     Non-trainable params
122 M     Total params
489.615   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.109375
val/f1: 0.07668478041887283
val/taskclf_loss: 1.1249730587005615
val/loss: 1.7450101375579834
val/mlm_loss: 2.363755464553833


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (45) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9131447076797485
val/taskclf_loss: 0.24543502926826477
val/loss: 1.1698983907699585
val/mlm_loss: 2.092435598373413


Validation: |                                                                                                 …

val/accuracy: 0.949999988079071
val/f1: 0.9502760767936707
val/taskclf_loss: 0.21798650920391083
val/loss: 1.2552330493927002
val/mlm_loss: 2.290318727493286


Validation: |                                                                                                 …

val/accuracy: 0.918749988079071
val/f1: 0.9192798733711243
val/taskclf_loss: 0.2433781921863556
val/loss: 1.1631304025650024
val/mlm_loss: 2.0809667110443115


Validation: |                                                                                                 …

val/accuracy: 0.9125000238418579
val/f1: 0.9136649966239929
val/taskclf_loss: 0.2867593467235565
val/loss: 1.2073291540145874
val/mlm_loss: 2.125981092453003


Validation: |                                                                                                 …

val/accuracy: 0.925000011920929
val/f1: 0.9255115389823914
val/taskclf_loss: 0.27639296650886536
val/loss: 1.1812082529067993
val/mlm_loss: 2.0841386318206787


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9440639615058899
val/taskclf_loss: 0.22849026322364807
val/loss: 1.1533223390579224
val/mlm_loss: 2.0762276649475098


Validation: |                                                                                                 …

val/accuracy: 0.9312500357627869
val/f1: 0.9318413138389587
val/taskclf_loss: 0.2591048777103424
val/loss: 1.1860437393188477
val/mlm_loss: 2.1110517978668213


Validation: |                                                                                                 …

val/accuracy: 0.9375
val/f1: 0.9377402663230896
val/taskclf_loss: 0.2506927251815796
val/loss: 1.1831682920455933
val/mlm_loss: 2.113701343536377


Validation: |                                                                                                 …

val/accuracy: 0.9437500238418579
val/f1: 0.9439668655395508
val/taskclf_loss: 0.23516236245632172
val/loss: 1.2045475244522095
val/mlm_loss: 2.1719131469726562


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.9375
val/f1: 0.937908947467804
val/taskclf_loss: 0.25788071751594543
val/loss: 1.209692358970642
val/mlm_loss: 2.1595208644866943


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Best checkpoint path: checkpoints/lightning_logs/version_35/checkpoints/task-CAPelt-epoch=05-val_loss=1.15.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_35/checkpoints/CAPelt-epoch=05.ckpt


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.33771997690200806, 'source_test/accuracy': 0.9062500596046448, 'source_test/f1': 0.9062237739562988, 'source_test/f1_macro': 0.9041173458099365, 'source_test/f1_micro': 0.9062500596046448, 'target_test/loss': 0.35362640023231506, 'target_test/accuracy': 0.9086538553237915, 'target_test/f1': 0.9081823229789734, 'target_test/f1_macro': 0.9057879447937012, 'target_test/f1_micro': 0.9086538553237915}]
Best checkpoint path: checkpoints/lightning_logs/version_35/checkpoints/task-CAPelt-epoch=05-val_loss=1.15.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_35/checkpoints/CAPelt-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.




Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.3310769200325012, 'source_test/accuracy': 0.8870192766189575, 'source_test/f1': 0.8869894742965698, 'source_test/f1_macro': 0.8838693499565125, 'source_test/f1_micro': 0.8870192766189575, 'target_test/loss': 0.3292313516139984, 'target_test/accuracy': 0.9062500596046448, 'target_test/f1': 0.9058850407600403, 'target_test/f1_macro': 0.9027958512306213, 'target_test/f1_micro': 0.9062500596046448}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.3310769200325012, 'source_test/accuracy': 0.8870192766189575, 'source_test/f1': 0.8869894742965698, 'source_test/f1_macro': 0.8838693499565125, 'source_test/f1_micro': 0.8870192766189575, 'target_test/loss': 0.3292313516139984, 'target_test/accuracy': 0.9062500596046448, 'target_test/f1': 0.9058850407600403, 'target_test/f1_macro': 0.9027958512306213, 'target_test/f1_micro': 0.9062500596046448}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.2682103216648102, 0.2915090322494507, 0.33771997690200806], 'source_test/accuracy': [0.915865421295166, 0.911057710647583, 0.9062500596046448], 'source_test/f1': [0.9159426093101501, 0.9110167622566223, 0.9062237739562988], 'source_test/f1_macro': [0.9130035042762756, 0.9087215065956116, 0.9041173458099365], 'source_test/f1_micro': [0.915865421295166, 0.911057710647583, 0.9062500596046448], 'target_test/loss': [0.3169304430484772, 0.32880836725234985, 0.35362640023231506], 'target_test/accuracy': [0.9134615659713745, 0.911057710647583, 0.9086538553237915], 'target_test/f1': [0.9132021069526672, 0.9104647040367126, 0.9081823229789734], 'target_test/f1_macro': [0.9102993607521057, 0.9083486199378967, 0.9057879447937012], 'target_test/f1_micro': [0.9134615659713745, 0.911057710647583, 0.9086538553237915]}), ('best_model', {'source_test/loss': [0.2682103216648102, 0.2922724187374115, 0.3310769200325012], 'source_test/accuracy': [0.91586542

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.299146443605423, 'source_test/accuracy': 0.911057730515798, 'source_test/f1': 0.9110610485076904, 'source_test/f1_macro': 0.9086141188939413, 'source_test/f1_micro': 0.911057730515798, 'target_test/loss': 0.3331217368443807, 'target_test/accuracy': 0.911057710647583, 'target_test/f1': 0.910616377989451, 'target_test/f1_macro': 0.9081453084945679, 'target_test/f1_micro': 0.911057710647583}, 'best_model': {'source_test/loss': 0.29718655347824097, 'source_test/accuracy': 0.8974359432856241, 'source_test/f1': 0.8973957896232605, 'source_test/f1_macro': 0.8945940732955933, 'source_test/f1_micro': 0.8974359432856241, 'target_test/loss': 0.31283023953437805, 'target_test/accuracy': 0.9134615858395895, 'target_test/f1': 0.9130559762318929, 'target_test/f1_macro': 0.9104557832082113, 'target_test/f1_micro': 0.9134615858395895}, 'epoch_saved': {'source_test/loss': 0.3057962457338969, 'source_test/accuracy': 0.9006410638491312, 'source_test/f1':

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf