In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmp6emvt_03', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-26 13:15:38.763656: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-26 13:15:38.830464: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'SF'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "government_telephone",
            "source_domain": "government",
            "target_domain": "telephone",
            "domain_adapter_name": "mlm_union_TE",
            "task_adapter_name": "GTEUni",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-GTEUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="GTEUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Source genre: government
Target genre: telephone
Number of target samples: 75013


Source genre: government
Target genre: telephone
Number of target samples: 75013


Source dataset length: 69615
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/amp.py:55: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.296875
val/f1: 0.3507066071033478
val/taskclf_loss: 1.0997314453125
val/loss: 1.2874629497528076
val/mlm_loss: 1.8145205974578857


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8117027282714844
val/f1: 0.8120768666267395
val/taskclf_loss: 0.48258623480796814
val/loss: 0.8055803179740906
val/mlm_loss: 1.7123891115188599


Validation: |                                                                                                 …

val/accuracy: 0.8189341425895691
val/f1: 0.8190726637840271
val/taskclf_loss: 0.4750020503997803
val/loss: 0.7917118668556213
val/mlm_loss: 1.6808775663375854


Validation: |                                                                                                 …

val/accuracy: 0.8266820907592773
val/f1: 0.8271040320396423
val/taskclf_loss: 0.47179216146469116
val/loss: 0.7895498871803284
val/mlm_loss: 1.6816574335098267


Validation: |                                                                                                 …

val/accuracy: 0.8258566856384277
val/f1: 0.8258665800094604
val/taskclf_loss: 0.5472965240478516
val/loss: 0.8475981950759888
val/mlm_loss: 1.6906980276107788


Validation: |                                                                                                 …

val/accuracy: 0.8225498199462891
val/f1: 0.8220021724700928
val/taskclf_loss: 0.572243332862854
val/loss: 0.8625514507293701
val/mlm_loss: 1.6775940656661987


Validation: |                                                                                                 …

val/accuracy: 0.8216459155082703
val/f1: 0.8214823603630066
val/taskclf_loss: 0.6149164438247681
val/loss: 0.8983663320541382
val/mlm_loss: 1.6941546201705933


Validation: |                                                                                                 …

val/accuracy: 0.837607741355896
val/f1: 0.8378259539604187
val/taskclf_loss: 0.6203445792198181
val/loss: 0.8984482884407043
val/mlm_loss: 1.6792271137237549


Validation: |                                                                                                 …

val/accuracy: 0.8362378478050232
val/f1: 0.8362195491790771
val/taskclf_loss: 0.6470850110054016
val/loss: 0.9138908386230469
val/mlm_loss: 1.6629506349563599


Validation: |                                                                                                 …

val/accuracy: 0.8374000191688538
val/f1: 0.8373647928237915
val/taskclf_loss: 0.6821923851966858
val/loss: 0.9417064189910889
val/mlm_loss: 1.6702945232391357


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8399826884269714
val/f1: 0.8398417830467224
val/taskclf_loss: 0.6769310235977173
val/loss: 0.9316508173942566
val/mlm_loss: 1.64677894115448


Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-GTEUni-epoch=02-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/GTEUni-epoch=05.ckpt


Source genre: government
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.6843253374099731, 'source_test/accuracy': 0.8380616307258606, 'source_test/f1': 0.836942732334137, 'source_test/f1_macro': 0.8336036205291748, 'source_test/f1_micro': 0.8380616307258606, 'target_test/loss': 0.9464944005012512, 'target_test/accuracy': 0.7629608511924744, 'target_test/f1': 0.762649416923523, 'target_test/f1_macro': 0.7531947493553162, 'target_test/f1_micro': 0.7629608511924744}]
Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-GTEUni-epoch=02-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/GTEUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.4764903485774994, 'source_test/accuracy': 0.8310052156448364, 'source_test/f1': 0.8304678201675415, 'source_test/f1_macro': 0.8257171511650085, 'source_test/f1_micro': 0.8310052156448364, 'target_test/loss': 0.6524326801300049, 'target_test/accuracy': 0.7507200241088867, 'target_test/f1': 0.7514182925224304, 'target_test/f1_macro': 0.7391757369041443, 'target_test/f1_micro': 0.7507200241088867}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.6326131820678711, 'source_test/accuracy': 0.8209245204925537, 'source_test/f1': 0.819132387638092, 'source_test/f1_macro': 0.816200852394104, 'source_test/f1_micro': 0.8209245204925537, 'target_test/loss': 0.8421339988708496, 'target_test/accuracy': 0.749351978302002, 'target_test/f1': 0.7483696341514587, 'target_test/f1_macro': 0.7392754554748535, 'target_test/f1_micro': 0.749351978302002}]
Batch size: 32


Source genre: government


Target genre: telephone
Number of target samples: 75013


Source genre: government
Target genre: telephone
Number of target samples: 75013
Source dataset length: 69615
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.265625
val/f1: 0.3014119267463684
val/taskclf_loss: 1.11285400390625
val/loss: 1.3127405643463135
val/mlm_loss: 1.8739242553710938


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8117027282714844
val/f1: 0.8118407130241394
val/taskclf_loss: 0.4873972237110138
val/loss: 0.8052641749382019
val/mlm_loss: 1.6976782083511353


Validation: |                                                                                                 …

val/accuracy: 0.8188835382461548
val/f1: 0.8189265727996826
val/taskclf_loss: 0.47605156898498535
val/loss: 0.7942184209823608
val/mlm_loss: 1.6874747276306152


Validation: |                                                                                                 …

val/accuracy: 0.8245653510093689
val/f1: 0.8245497941970825
val/taskclf_loss: 0.48338502645492554
val/loss: 0.7967241406440735
val/mlm_loss: 1.6764265298843384


Validation: |                                                                                                 …

val/accuracy: 0.8230663537979126
val/f1: 0.8229361176490784
val/taskclf_loss: 0.5188309550285339
val/loss: 0.8258998990058899
val/mlm_loss: 1.6879987716674805


Validation: |                                                                                                 …

val/accuracy: 0.8104113936424255
val/f1: 0.8097793459892273
val/taskclf_loss: 0.6119846105575562
val/loss: 0.8922297358512878
val/mlm_loss: 1.6790201663970947


Validation: |                                                                                                 …

val/accuracy: 0.8346377015113831
val/f1: 0.8344698548316956
val/taskclf_loss: 0.5835063457489014
val/loss: 0.8684980273246765
val/mlm_loss: 1.6686147451400757


Validation: |                                                                                                 …

val/accuracy: 0.8346377015113831
val/f1: 0.8345300555229187
val/taskclf_loss: 0.6052293181419373
val/loss: 0.8817938566207886
val/mlm_loss: 1.6582510471343994


Validation: |                                                                                                 …

val/accuracy: 0.8339920043945312
val/f1: 0.8339654207229614
val/taskclf_loss: 0.6225550174713135
val/loss: 0.8928689956665039
val/mlm_loss: 1.651777744293213


Validation: |                                                                                                 …

val/accuracy: 0.8360581398010254
val/f1: 0.8361034393310547
val/taskclf_loss: 0.6202629804611206
val/loss: 0.8946727514266968
val/mlm_loss: 1.6650806665420532


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8357998728752136
val/f1: 0.8358374834060669
val/taskclf_loss: 0.6211380958557129
val/loss: 0.8914002776145935
val/mlm_loss: 1.6501635313034058


Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-GTEUni-epoch=01-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/GTEUni-epoch=05.ckpt


Source genre: government
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.6262930035591125, 'source_test/accuracy': 0.8479982614517212, 'source_test/f1': 0.8471413254737854, 'source_test/f1_macro': 0.844122588634491, 'source_test/f1_micro': 0.8479982614517212, 'target_test/loss': 0.8754822611808777, 'target_test/accuracy': 0.7685052156448364, 'target_test/f1': 0.769526481628418, 'target_test/f1_macro': 0.7574337124824524, 'target_test/f1_micro': 0.7685052156448364}]
Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-GTEUni-epoch=01-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/GTEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.49072155356407166, 'source_test/accuracy': 0.8125719428062439, 'source_test/f1': 0.8116192817687988, 'source_test/f1_macro': 0.8078840970993042, 'source_test/f1_micro': 0.8125719428062439, 'target_test/loss': 0.6465293765068054, 'target_test/accuracy': 0.7513681054115295, 'target_test/f1': 0.7508292198181152, 'target_test/f1_macro': 0.7424495220184326, 'target_test/f1_micro': 0.7513681054115295}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.5856896638870239, 'source_test/accuracy': 0.8424538969993591, 'source_test/f1': 0.8413702249526978, 'source_test/f1_macro': 0.8383342027664185, 'source_test/f1_micro': 0.8424538969993591, 'target_test/loss': 0.8160120248794556, 'target_test/accuracy': 0.7629608511924744, 'target_test/f1': 0.763457179069519, 'target_test/f1_macro': 0.7532247304916382, 'target_test/f1_micro': 0.7629608511924744}]
Batch size: 32


Source genre: government


Target genre: telephone
Number of target samples: 75013


Source genre: government


Target genre: telephone
Number of target samples: 75013
Source dataset length: 69615
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.359375
val/f1: 0.5128205418586731
val/taskclf_loss: 1.0996475219726562
val/loss: 1.287732720375061
val/mlm_loss: 1.8157832622528076


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8120900988578796
val/f1: 0.8124966621398926
val/taskclf_loss: 0.4808879792690277
val/loss: 0.7998692393302917
val/mlm_loss: 1.6954121589660645


Validation: |                                                                                                 …

val/accuracy: 0.8204837441444397
val/f1: 0.8206235766410828
val/taskclf_loss: 0.46846649050712585
val/loss: 0.7937875986099243
val/mlm_loss: 1.7071295976638794


Validation: |                                                                                                 …

val/accuracy: 0.8197594285011292
val/f1: 0.8198452591896057
val/taskclf_loss: 0.49857595562934875
val/loss: 0.8077304363250732
val/mlm_loss: 1.6756844520568848


Validation: |                                                                                                 …

val/accuracy: 0.821516752243042
val/f1: 0.8210176229476929
val/taskclf_loss: 0.5159682631492615
val/loss: 0.8211743831634521
val/mlm_loss: 1.6780433654785156


Validation: |                                                                                                 …

val/accuracy: 0.8236333727836609
val/f1: 0.8238328099250793
val/taskclf_loss: 0.5686737298965454
val/loss: 0.8617677092552185
val/mlm_loss: 1.6846317052841187


Validation: |                                                                                                 …

val/accuracy: 0.8343513607978821
val/f1: 0.8341134786605835
val/taskclf_loss: 0.5935001373291016
val/loss: 0.8786341547966003
val/mlm_loss: 1.6791503429412842


Validation: |                                                                                                 …

val/accuracy: 0.8346096277236938
val/f1: 0.8343644142150879
val/taskclf_loss: 0.6175510287284851
val/loss: 0.8943632245063782
val/mlm_loss: 1.6715160608291626


Validation: |                                                                                                 …

val/accuracy: 0.8364174962043762
val/f1: 0.8361226916313171
val/taskclf_loss: 0.6254634261131287
val/loss: 0.8962023854255676
val/mlm_loss: 1.6563043594360352


Validation: |                                                                                                 …

val/accuracy: 0.8346096277236938
val/f1: 0.8343281745910645
val/taskclf_loss: 0.6202266216278076
val/loss: 0.8924216628074646
val/mlm_loss: 1.6566113233566284


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8342222571372986
val/f1: 0.833896279335022
val/taskclf_loss: 0.6231246590614319
val/loss: 0.8956043720245361
val/mlm_loss: 1.6605936288833618


Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-GTEUni-epoch=01-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/GTEUni-epoch=05.ckpt


Source genre: government
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.5972306132316589, 'source_test/accuracy': 0.8505184054374695, 'source_test/f1': 0.849670946598053, 'source_test/f1_macro': 0.845848798751831, 'source_test/f1_micro': 0.8505184054374695, 'target_test/loss': 0.881098747253418, 'target_test/accuracy': 0.7653369903564453, 'target_test/f1': 0.7651310563087463, 'target_test/f1_macro': 0.7550639510154724, 'target_test/f1_micro': 0.7653369903564453}]
Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-GTEUni-epoch=01-val_loss=0.79.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/GTEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.47496041655540466, 'source_test/accuracy': 0.8212845325469971, 'source_test/f1': 0.8207164406776428, 'source_test/f1_macro': 0.816295325756073, 'source_test/f1_micro': 0.8212845325469971, 'target_test/loss': 0.630818247795105, 'target_test/accuracy': 0.7498559951782227, 'target_test/f1': 0.7497701048851013, 'target_test/f1_macro': 0.739126980304718, 'target_test/f1_micro': 0.7498559951782227}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.5791696906089783, 'source_test/accuracy': 0.846990168094635, 'source_test/f1': 0.8457463383674622, 'source_test/f1_macro': 0.8430465459823608, 'source_test/f1_micro': 0.846990168094635, 'target_test/loss': 0.8375287055969238, 'target_test/accuracy': 0.7599366307258606, 'target_test/f1': 0.7587699890136719, 'target_test/f1_macro': 0.7497910261154175, 'target_test/f1_micro': 0.7599366307258606}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.6843253374099731, 0.6262930035591125, 0.5972306132316589], 'source_test/accuracy': [0.8380616307258606, 0.8479982614517212, 0.8505184054374695], 'source_test/f1': [0.836942732334137, 0.8471413254737854, 0.849670946598053], 'source_test/f1_macro': [0.8336036205291748, 0.844122588634491, 0.845848798751831], 'source_test/f1_micro': [0.8380616307258606, 0.8479982614517212, 0.8505184054374695], 'target_test/loss': [0.9464944005012512, 0.8754822611808777, 0.881098747253418], 'target_test/accuracy': [0.7629608511924744, 0.7685052156448364, 0.7653369903564453], 'target_test/f1': [0.762649416923523, 0.769526481628418, 0.7651310563087463], 'target_test/f1_macro': [0.7531947493553162, 0.7574337124824524, 0.7550639510154724], 'target_test/f1_micro': [0.7629608511924744, 0.7685052156448364, 0.7653369903564453]}), ('best_model', {'source_test/loss': [0.4764903485774994, 0.49072155356407166, 0.47496041655540466], 'source_test/accuracy': [0.8310052156

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.6359496514002482, 'source_test/accuracy': 0.8455260992050171, 'source_test/f1': 0.8445850014686584, 'source_test/f1_macro': 0.8411916693051656, 'source_test/f1_micro': 0.8455260992050171, 'target_test/loss': 0.901025136311849, 'target_test/accuracy': 0.7656010190645853, 'target_test/f1': 0.7657689849535624, 'target_test/f1_macro': 0.7552308042844137, 'target_test/f1_micro': 0.7656010190645853}, 'best_model': {'source_test/loss': 0.48072410623232525, 'source_test/accuracy': 0.8216205636660258, 'source_test/f1': 0.820934514204661, 'source_test/f1_macro': 0.8166321913401285, 'source_test/f1_micro': 0.8216205636660258, 'target_test/loss': 0.6432601014773051, 'target_test/accuracy': 0.750648041566213, 'target_test/f1': 0.7506725390752157, 'target_test/f1_macro': 0.7402507464090983, 'target_test/f1_micro': 0.750648041566213}, 'epoch_saved': {'source_test/loss': 0.5991575121879578, 'source_test/accuracy': 0.8367895285288492, 'source_test/f1'

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf