In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpvtr85na_', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-12 09:12:28.351896: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-12 09:12:28.386104: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'FTE'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "fiction_telephone",
            "source_domain": "fiction",
            "target_domain": "telephone",
            "domain_adapter_name": "mlm_union_TE",
            "task_adapter_name": "task_FTEUni",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-FTEUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="FTEUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Source genre: fiction
Target genre: telephone
Number of target samples: 75013


Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Source genre: fiction
Target genre: telephone
Number of target samples: 75013


Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Source dataset length: 69613
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.3125
val/f1: 0.41830360889434814
val/taskclf_loss: 1.1079940795898438
val/loss: 1.2935667037963867
val/mlm_loss: 1.8145488500595093


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7362726926803589
val/f1: 0.743891179561615
val/taskclf_loss: 0.6307324767112732
val/loss: 0.9169570803642273
val/mlm_loss: 1.7205126285552979


Validation: |                                                                                                 …

val/accuracy: 0.7712394595146179
val/f1: 0.7752214074134827
val/taskclf_loss: 0.5679687857627869
val/loss: 0.8624223470687866
val/mlm_loss: 1.6890795230865479


Validation: |                                                                                                 …

val/accuracy: 0.7872012853622437
val/f1: 0.7888181209564209
val/taskclf_loss: 0.564666211605072
val/loss: 0.8594585657119751
val/mlm_loss: 1.6870671510696411


Validation: |                                                                                                 …

val/accuracy: 0.7990814447402954
val/f1: 0.7992250323295593
val/taskclf_loss: 0.5708420276641846
val/loss: 0.866032600402832
val/mlm_loss: 1.6947592496871948


Validation: |                                                                                                 …

val/accuracy: 0.7939667105674744
val/f1: 0.7935055494308472
val/taskclf_loss: 0.6368880867958069
val/loss: 0.9115481376647949
val/mlm_loss: 1.682637095451355


Validation: |                                                                                                 …

val/accuracy: 0.7967795133590698
val/f1: 0.7964602112770081
val/taskclf_loss: 0.6484302282333374
val/loss: 0.9246012568473816
val/mlm_loss: 1.6999316215515137


Validation: |                                                                                                 …

val/accuracy: 0.8055099844932556
val/f1: 0.8052289485931396
val/taskclf_loss: 0.7003073692321777
val/loss: 0.9583159685134888
val/mlm_loss: 1.6826567649841309


Validation: |                                                                                                 …

val/accuracy: 0.8047351837158203
val/f1: 0.8040657639503479
val/taskclf_loss: 0.7589889764785767
val/loss: 0.9977221488952637
val/mlm_loss: 1.6679483652114868


Validation: |                                                                                                 …

val/accuracy: 0.8027982115745544
val/f1: 0.802155077457428
val/taskclf_loss: 0.7894044518470764
val/loss: 1.022173285484314
val/mlm_loss: 1.6756551265716553


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8049934506416321
val/f1: 0.8045727014541626
val/taskclf_loss: 0.7938934564590454
val/loss: 1.0199469327926636
val/mlm_loss: 1.6545758247375488


Best checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/task-FTEUni-epoch=02-val_loss=0.86.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/FTEUni-epoch=05.ckpt


Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/392702 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9815 [00:00<?, ? examples/s]

Source genre: fiction


Target genre: telephone
Number of target samples: 75013


Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7908204197883606, 'source_test/accuracy': 0.8064035773277283, 'source_test/f1': 0.8060191869735718, 'source_test/f1_macro': 0.79946368932724, 'source_test/f1_micro': 0.8064035773277283, 'target_test/loss': 0.8754003643989563, 'target_test/accuracy': 0.784490168094635, 'target_test/f1': 0.7830209732055664, 'target_test/f1_macro': 0.7746514081954956, 'target_test/f1_micro': 0.784490168094635}]
Best checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/task-FTEUni-epoch=02-val_loss=0.86.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_6/checkpoints/FTEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.5391191840171814, 'source_test/accuracy': 0.7897465229034424, 'source_test/f1': 0.7906677722930908, 'source_test/f1_macro': 0.7804479002952576, 'source_test/f1_micro': 0.7897465229034424, 'target_test/loss': 0.5956705808639526, 'target_test/accuracy': 0.7686011791229248, 'target_test/f1': 0.7705775499343872, 'target_test/f1_macro': 0.7572407722473145, 'target_test/f1_micro': 0.7686011791229248}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.6568295955657959, 'source_test/accuracy': 0.8031393885612488, 'source_test/f1': 0.802668035030365, 'source_test/f1_macro': 0.7939618825912476, 'source_test/f1_micro': 0.8031393885612488, 'target_test/loss': 0.7170677185058594, 'target_test/accuracy': 0.7670890688896179, 'target_test/f1': 0.7660312056541443, 'target_test/f1_macro': 0.756636917591095, 'target_test/f1_micro': 0.7670890688896179}]
Batch size: 32


Source genre: fiction


Target genre: telephone
Number of target samples: 75013


Source genre: fiction
Target genre: telephone
Number of target samples: 75013
Source dataset length: 69613
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.34375
val/f1: 0.36637043952941895
val/taskclf_loss: 1.0937652587890625
val/loss: 1.2986761331558228
val/mlm_loss: 1.8739491701126099


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7406631708145142
val/f1: 0.7483473420143127
val/taskclf_loss: 0.6226915121078491
val/loss: 0.9065423011779785
val/mlm_loss: 1.7034331560134888


Validation: |                                                                                                 …

val/accuracy: 0.7757591009140015
val/f1: 0.779018223285675
val/taskclf_loss: 0.5550349354743958
val/loss: 0.8543416261672974
val/mlm_loss: 1.6946239471435547


Validation: |                                                                                                 …

val/accuracy: 0.7975318431854248
val/f1: 0.7987691760063171
val/taskclf_loss: 0.5268011093139648
val/loss: 0.8306008577346802
val/mlm_loss: 1.6834970712661743


Validation: |                                                                                                 …

val/accuracy: 0.8049148321151733
val/f1: 0.8057895302772522
val/taskclf_loss: 0.5536131858825684
val/loss: 0.8532254099845886
val/mlm_loss: 1.694365382194519


Validation: |                                                                                                 …

val/accuracy: 0.8005243539810181
val/f1: 0.8000423908233643
val/taskclf_loss: 0.6118856072425842
val/loss: 0.8940145373344421
val/mlm_loss: 1.686071515083313


Validation: |                                                                                                 …

val/accuracy: 0.7922598719596863
val/f1: 0.7909041047096252
val/taskclf_loss: 0.6924561858177185
val/loss: 0.9547162652015686
val/mlm_loss: 1.6909929513931274


Validation: |                                                                                                 …

val/accuracy: 0.8065430521965027
val/f1: 0.8061197996139526
val/taskclf_loss: 0.7167954444885254
val/loss: 0.9676215648651123
val/mlm_loss: 1.671797752380371


Validation: |                                                                                                 …

val/accuracy: 0.806464433670044
val/f1: 0.8065922856330872
val/taskclf_loss: 0.7421043515205383
val/loss: 0.9836860299110413
val/mlm_loss: 1.6619093418121338


Validation: |                                                                                                 …

val/accuracy: 0.8039098978042603
val/f1: 0.8032647371292114
val/taskclf_loss: 0.7879830002784729
val/loss: 1.0212610960006714
val/mlm_loss: 1.6761726140975952


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.807418942451477
val/f1: 0.8074370622634888
val/taskclf_loss: 0.7794278860092163
val/loss: 1.0104494094848633
val/mlm_loss: 1.6590262651443481


Best checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/task-FTEUni-epoch=02-val_loss=0.83.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/FTEUni-epoch=05.ckpt


Source genre: fiction
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7736843228340149, 'source_test/accuracy': 0.8084197044372559, 'source_test/f1': 0.8082702159881592, 'source_test/f1_macro': 0.7988354563713074, 'source_test/f1_micro': 0.8084197044372559, 'target_test/loss': 0.8318930864334106, 'target_test/accuracy': 0.7867463231086731, 'target_test/f1': 0.7865643501281738, 'target_test/f1_macro': 0.7749903202056885, 'target_test/f1_micro': 0.7867463231086731}]
Best checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/task-FTEUni-epoch=02-val_loss=0.83.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_7/checkpoints/FTEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.5185837745666504, 'source_test/accuracy': 0.7922667264938354, 'source_test/f1': 0.7933343648910522, 'source_test/f1_macro': 0.7812694311141968, 'source_test/f1_micro': 0.7922667264938354, 'target_test/loss': 0.5626596808433533, 'target_test/accuracy': 0.7809619903564453, 'target_test/f1': 0.7826908230781555, 'target_test/f1_macro': 0.7680380940437317, 'target_test/f1_micro': 0.7809619903564453}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.6869057416915894, 'source_test/accuracy': 0.7963229417800903, 'source_test/f1': 0.7946922183036804, 'source_test/f1_macro': 0.7882605195045471, 'source_test/f1_micro': 0.7963229417800903, 'target_test/loss': 0.7438775897026062, 'target_test/accuracy': 0.7653369903564453, 'target_test/f1': 0.7642714977264404, 'target_test/f1_macro': 0.7551387548446655, 'target_test/f1_micro': 0.7653369903564453}]
Batch size: 32


Source genre: fiction
Target genre: telephone
Number of target samples: 75013


Source genre: fiction
Target genre: telephone
Number of target samples: 75013
Source dataset length: 69613
Target dataset length: 24796


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.3125
val/f1: 0.4761904776096344
val/taskclf_loss: 1.1005020141601562
val/loss: 1.2883528470993042
val/mlm_loss: 1.8157306909561157


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.7544297575950623
val/f1: 0.7605006694793701
val/taskclf_loss: 0.5998108386993408
val/loss: 0.8892225027084351
val/mlm_loss: 1.701724886894226


Validation: |                                                                                                 …

val/accuracy: 0.7771795392036438
val/f1: 0.7811137437820435
val/taskclf_loss: 0.560742974281311
val/loss: 0.8627852201461792
val/mlm_loss: 1.710747480392456


Validation: |                                                                                                 …

val/accuracy: 0.7987446188926697
val/f1: 0.8001789450645447
val/taskclf_loss: 0.5290506482124329
val/loss: 0.8307128548622131
val/mlm_loss: 1.6776081323623657


Validation: |                                                                                                 …

val/accuracy: 0.7993902564048767
val/f1: 0.799527108669281
val/taskclf_loss: 0.555846631526947
val/loss: 0.8518314361572266
val/mlm_loss: 1.6827878952026367


Validation: |                                                                                                 …

val/accuracy: 0.796341598033905
val/f1: 0.7953355312347412
val/taskclf_loss: 0.6299053430557251
val/loss: 0.9094774723052979
val/mlm_loss: 1.6943563222885132


Validation: |                                                                                                 …

val/accuracy: 0.7941688299179077
val/f1: 0.7928155064582825
val/taskclf_loss: 0.7112720012664795
val/loss: 0.9713773727416992
val/mlm_loss: 1.7016046047210693


Validation: |                                                                                                 …

val/accuracy: 0.8056110739707947
val/f1: 0.8051657676696777
val/taskclf_loss: 0.701208233833313
val/loss: 0.9585813879966736
val/mlm_loss: 1.681138515472412


Validation: |                                                                                                 …

val/accuracy: 0.8086315989494324
val/f1: 0.8082531094551086
val/taskclf_loss: 0.7381894588470459
val/loss: 0.9816109538078308
val/mlm_loss: 1.6649997234344482


Validation: |                                                                                                 …

val/accuracy: 0.8053528070449829
val/f1: 0.8049733638763428
val/taskclf_loss: 0.7700621485710144
val/loss: 1.005462884902954
val/mlm_loss: 1.666333794593811


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8064363598823547
val/f1: 0.8058813214302063
val/taskclf_loss: 0.7829911112785339
val/loss: 1.0155975818634033
val/mlm_loss: 1.668623685836792


Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-FTEUni-epoch=02-val_loss=0.83.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/FTEUni-epoch=05.ckpt


Source genre: fiction
Target genre: telephone
Number of target samples: 75013


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.7918956875801086, 'source_test/accuracy': 0.8043874502182007, 'source_test/f1': 0.8044919371604919, 'source_test/f1_macro': 0.7961285710334778, 'source_test/f1_micro': 0.8043874502182007, 'target_test/loss': 0.8570380210876465, 'target_test/accuracy': 0.7877544164657593, 'target_test/f1': 0.7870309948921204, 'target_test/f1_macro': 0.779535710811615, 'target_test/f1_micro': 0.7877544164657593}]
Best checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/task-FTEUni-epoch=02-val_loss=0.83.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_8/checkpoints/FTEUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.5209733247756958, 'source_test/accuracy': 0.8008352518081665, 'source_test/f1': 0.8009685277938843, 'source_test/f1_macro': 0.7920703291893005, 'source_test/f1_micro': 0.8008352518081665, 'target_test/loss': 0.5520986318588257, 'target_test/accuracy': 0.7837221622467041, 'target_test/f1': 0.78693026304245, 'target_test/f1_macro': 0.7707148790359497, 'target_test/f1_micro': 0.7837221622467041}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.7153010964393616, 'source_test/accuracy': 0.7950748801231384, 'source_test/f1': 0.7939404845237732, 'source_test/f1_macro': 0.7884384393692017, 'source_test/f1_micro': 0.7950748801231384, 'target_test/loss': 0.762600302696228, 'target_test/accuracy': 0.7711213231086731, 'target_test/f1': 0.7692725658416748, 'target_test/f1_macro': 0.7637867331504822, 'target_test/f1_micro': 0.7711213231086731}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.7908204197883606, 0.7736843228340149, 0.7918956875801086], 'source_test/accuracy': [0.8064035773277283, 0.8084197044372559, 0.8043874502182007], 'source_test/f1': [0.8060191869735718, 0.8082702159881592, 0.8044919371604919], 'source_test/f1_macro': [0.79946368932724, 0.7988354563713074, 0.7961285710334778], 'source_test/f1_micro': [0.8064035773277283, 0.8084197044372559, 0.8043874502182007], 'target_test/loss': [0.8754003643989563, 0.8318930864334106, 0.8570380210876465], 'target_test/accuracy': [0.784490168094635, 0.7867463231086731, 0.7877544164657593], 'target_test/f1': [0.7830209732055664, 0.7865643501281738, 0.7870309948921204], 'target_test/f1_macro': [0.7746514081954956, 0.7749903202056885, 0.779535710811615], 'target_test/f1_micro': [0.784490168094635, 0.7867463231086731, 0.7877544164657593]}), ('best_model', {'source_test/loss': [0.5391191840171814, 0.5185837745666504, 0.5209733247756958], 'source_test/accuracy': [0.7897465229

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.7854668100674947, 'source_test/accuracy': 0.8064035773277283, 'source_test/f1': 0.8062604467074076, 'source_test/f1_macro': 0.7981425722440084, 'source_test/f1_micro': 0.8064035773277283, 'target_test/loss': 0.8547771573066711, 'target_test/accuracy': 0.7863303025563558, 'target_test/f1': 0.7855387727419535, 'target_test/f1_macro': 0.7763924797375997, 'target_test/f1_micro': 0.7863303025563558}, 'best_model': {'source_test/loss': 0.5262254277865092, 'source_test/accuracy': 0.7942828337351481, 'source_test/f1': 0.7949902216593424, 'source_test/f1_macro': 0.7845958868662516, 'source_test/f1_micro': 0.7942828337351481, 'target_test/loss': 0.5701429645220438, 'target_test/accuracy': 0.7777617772420248, 'target_test/f1': 0.7800662120183309, 'target_test/f1_macro': 0.7653312484423319, 'target_test/f1_micro': 0.7777617772420248}, 'epoch_saved': {'source_test/loss': 0.6863454778989156, 'source_test/accuracy': 0.7981790701548258, 'source_test/

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf