In [1]:

import torch
import gc
torch.cuda.empty_cache()
gc.collect()


0

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Initialize the console

# Step 2: Import necessary libraries
from typing import Optional, Dict, Any
import os
import torch
import pytorch_lightning as pl
import pandas as pd
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from rich.traceback import install
from pytorch_lightning.loggers import WandbLogger
import torch.nn as nn
import torch.optim as optim
from transformers import AutoConfig
from adapters import AutoAdapterModel, AdapterConfig
from adapters.composition import Stack
from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint,EarlyStopping
import torchmetrics

install(show_locals=True)

from setup import setup_src_path
print(setup_src_path())
import data.processed as processed
import config.config as config
import utils.setup as setup
import utils.functions as fn
from importlib import reload

from datasets import load_from_disk

print(config.Config.TXT_SAVE_PATH)
print(config.Config.MODEL_SAVE_PATH)

dataset = load_from_disk(f"../{config.Config.DATASETS_SAVE_PATH}/datasets")


['/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages', '/tmp/tmpycxsdb7r', '/home/guest/Desktop/projects/third-experiments/domain_adaptation_project/modules']


2024-09-23 20:47:33.721792: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-23 20:47:33.767781: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




./text-files/
./hp-model-


In [3]:
import torch
import os
import pytorch_lightning as pl
from transformers import  AutoConfig, DataCollatorForLanguageModeling
from collections import defaultdict
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
import torchmetrics

class JointDomainTaskAdapter(pl.LightningModule):
    def __init__(self, hparams,source_dataset_length,target_dataset_length):
        super(JointDomainTaskAdapter, self).__init__()
        self.save_hyperparameters(hparams)

        # Load config with hidden states output
        self.config = AutoConfig.from_pretrained(self.hparams["pretrained_model_name"])
        self.config.output_hidden_states = True
        self.model = AutoAdapterModel.from_pretrained(self.hparams["pretrained_model_name"], config=self.config)

        # Set reduction factor and leave_out layers
        self.reduction_factor = self.hparams.get("reduction_factor", 16)
        self.leave_out = self.hparams.get("leave_out", [])
        # if self.leave_out != "None":
        #     self.leave_out = self.leave_out.split(",")
        #     self.leave_out = [int(i) for i in self.leave_out]
        # else:
        #     self.leave_out = []

        # Load MLM adapter with head
        self.model.load_adapter(f"{self.hparams['saved_adapter_dir']}/{self.hparams['domain_adapter_name']}", with_head=True)

        # Add classification head for the task
        self.model.add_classification_head(f"{self.hparams['task_adapter_name']}", num_labels=self.hparams["num_classes"])

        # Set active adapters
        self.model.train_adapter(self.hparams['domain_adapter_name'])
        # Calculate alpha based on dataset lengths
        self.alpha = source_dataset_length / (source_dataset_length + target_dataset_length)

        # Initialize loss functions and metrics
        self.criterion = nn.CrossEntropyLoss()
        self.mlm_criterion = nn.CrossEntropyLoss()
        self.accuracy = torchmetrics.Accuracy(task='multiclass', num_classes=self.hparams["num_classes"])
        self.f1 = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="weighted")
        self.f1_macro = torchmetrics.F1Score(task='multiclass', num_classes=self.hparams["num_classes"], average="macro")
        self.f1_micro = torchmetrics.F1Score(task='multiclass',num_classes=self.hparams["num_classes"], average="micro")

        self.softmax = nn.Softmax(dim=1)
        self.validation_outputs = []
        self.test_outputs = []
        # Optimizer related variables
        self.learning_rate = self.hparams.get("learning_rate", 1e-4)
        self.scheduler_factor = self.hparams.get("scheduler_factor", 0.1)
        self.scheduler_patience = self.hparams.get("scheduler_patience", 0.05)
        self.scheduler_threshold = self.hparams.get("scheduler_threshold", 0.0001)
        self.scheduler_cooldown = self.hparams.get("scheduler_cooldown", 0)
        self.scheduler_eps = self.hparams.get("scheduler_eps", 1e-8)

    def forward(self, input_ids, attention_mask=None, labels=None, task=None):
        if task == "mlm":
            self.model.active_head= self.hparams['domain_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        elif task == "classification":
            self.model.active_head= self.hparams['task_adapter_name']
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        else:
            raise ValueError("Task must be either 'mlm' or 'classification'.")
        return outputs

    def training_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha

        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))

        metrics = {
            "train/accuracy": accuracy,
            "train/f1": f1,
            "train/taskclf_loss": task_loss,
            "train/loss": loss,
            "train/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return loss

    def validation_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        mlm_labels = batch["mlm_labels"]

        # Calculate dynamic alpha based on the lengths of source and target data
        alpha = self.alpha
        # Classification task
        cls_outputs = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits = cls_outputs.logits
        task_loss = self.criterion(cls_logits, source_labels)

        # MLM task
        mlm_outputs = self(input_ids=target_input_ids, attention_mask=target_attention_mask, labels=mlm_labels, task="mlm")
        mlm_loss = mlm_outputs.loss

        # Combine losses
        loss = alpha * task_loss + (1 - alpha) * mlm_loss

        accuracy = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        f1 = self.f1(source_labels, torch.argmax(self.softmax(cls_logits), dim=1))
        self.validation_outputs.append({
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
                })
        
        metrics = {
            "val/accuracy": accuracy,
            "val/f1": f1,
            "val/taskclf_loss": task_loss,
            "val/loss": loss,
            "val/mlm_loss": mlm_loss
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        return metrics
    def on_validation_epoch_start(self):
        self.validation_outputs = []
    
    def on_validation_epoch_end(self):
        outputs= self.validation_outputs

        avg_loss = torch.stack([x["val/loss"] for x in outputs]).mean()
        avg_task_loss = torch.stack([x["val/taskclf_loss"] for x in outputs]).mean()
        avg_mlm_loss = torch.stack([x["val/mlm_loss"] for x in outputs]).mean()
        avg_accuracy = torch.stack([x["val/accuracy"] for x in outputs]).mean()
        avg_f1 = torch.stack([x["val/f1"] for x in outputs]).mean()
        print(f"val/accuracy: {avg_accuracy}")
        print(f"val/f1: {avg_f1}")
        print(f"val/taskclf_loss: {avg_task_loss}")
        print(f"val/loss: {avg_loss}")
        print(f"val/mlm_loss: {avg_mlm_loss}")
        metrics = {
            "val/avg_loss": avg_loss,
            "val/avg_taskclf_loss": avg_task_loss,
            "val/avg_mlm_loss": avg_mlm_loss,
            "val/avg_accuracy": avg_accuracy,
            "val/avg_f1": avg_f1,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
        self.log("val_loss", avg_loss)

    def test_step(self, batch, batch_idx):
        # Separate source and target data processing
        source_input_ids = batch["source_input_ids"]
        source_attention_mask = batch["source_attention_mask"]
        source_labels = batch["label_source"]

        target_input_ids = batch["target_input_ids"]
        target_attention_mask = batch["target_attention_mask"]
        target_labels = batch["label_target"]

        # Classification task for source data
        cls_outputs_source = self(input_ids=source_input_ids, attention_mask=source_attention_mask, task="classification")
        cls_logits_source = cls_outputs_source.logits
        task_loss_source = self.criterion(cls_logits_source, source_labels)

        # Classification task for target data
        cls_outputs_target = self(input_ids=target_input_ids, attention_mask=target_attention_mask, task="classification")
        cls_logits_target = cls_outputs_target.logits
        task_loss_target = self.criterion(cls_logits_target, target_labels)

        # Combine losses (though typically you would evaluate them separately)
        loss = task_loss_source + task_loss_target

        accuracy_source = self.accuracy(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_source = self.f1(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_macro_source = self.f1_macro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
        f1_micro_source = self.f1_micro(source_labels, torch.argmax(self.softmax(cls_logits_source), dim=1))
    
        accuracy_target = self.accuracy(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_target = self.f1(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_macro_target = self.f1_macro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))
        f1_micro_target = self.f1_micro(target_labels, torch.argmax(self.softmax(cls_logits_target), dim=1))

        metrics = {
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)

        self.test_outputs.append({
            "source_test/loss": task_loss_source,
            "source_test/accuracy": accuracy_source,
            "source_test/f1": f1_source,
            "source_test/f1_macro": f1_macro_source,
            "source_test/f1_micro": f1_micro_source,
            "target_test/loss": task_loss_target,
            "target_test/accuracy": accuracy_target,
            "target_test/f1": f1_target,
            "target_test/f1_macro": f1_macro_target,
            "target_test/f1_micro": f1_micro_target,
        })
        return metrics
    def on_test_epoch_start(self):
        self.test_outputs = []

    def on_test_epoch_end(self):
        outputs=  self.test_outputs

        avg_loss_source = torch.stack([x["source_test/loss"] for x in outputs]).mean()
        avg_task_loss_target = torch.stack([x["target_test/loss"] for x in outputs]).mean()
        avg_accuracy_source = torch.stack([x["source_test/accuracy"] for x in outputs]).mean()
        avg_f1_source = torch.stack([x["source_test/f1"] for x in outputs]).mean()
        avg_f1_macro_source = torch.stack([x["source_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_source = torch.stack([x["source_test/f1_micro"] for x in outputs]).mean()

        avg_accuracy_target = torch.stack([x["target_test/accuracy"] for x in outputs]).mean()
        avg_f1_target = torch.stack([x["target_test/f1"] for x in outputs]).mean()
        avg_f1_macro_target = torch.stack([x["target_test/f1_macro"] for x in outputs]).mean()
        avg_f1_micro_target = torch.stack([x["target_test/f1_micro"] for x in outputs]).mean()

        metrics = {
            "source_test/loss": avg_loss_source,
            "target_test/loss": avg_task_loss_target,
            "source_test/accuracy": avg_accuracy_source,
            "source_test/f1": avg_f1_source,
            "source_test/f1_macro": avg_f1_macro_source,
            "source_test/f1_micro": avg_f1_micro_source,
            "target_test/accuracy": avg_accuracy_target,
            "target_test/f1": avg_f1_target,
            "target_test/f1_macro": avg_f1_macro_target,
            "target_test/f1_micro": avg_f1_micro_target,
        }

        for key, val in metrics.items():
            self.log(name=key, value=val)
    def save_adapter(self, location, adapter_name):
        self.model.save_adapter(location, adapter_name)

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams["learning_rate"])
        lr_scheduler = {
            'scheduler': optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, threshold=0.0001, cooldown=0, min_lr=1e-8),
            'monitor': 'val_loss'
        }
        return [optimizer], [lr_scheduler]


In [4]:
import wandb

wandb.login()
# Wandb setup and training loop
seeds = [42, 10, 100]  # List of seeds
project_name = 'mixed_edited'  # Replace with your wandb project name
domain = 'GF'  # Replace with the specific domain for this notebook
type = 'union'  # Replace with the specific type for this notebook

# Initialize results dictionary
results = {
    "last_epoch": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "best_model": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    },
    "epoch_saved": {
        "source_test/loss": [],
        "source_test/accuracy": [],
        "source_test/f1": [],
        "source_test/f1_macro": [],
        "source_test/f1_micro": [],
        "target_test/loss": [],
        "target_test/accuracy": [],
        "target_test/f1": [],
        "target_test/f1_macro": [],
        "target_test/f1_micro": [],
    }
}

best_val_loss = float('inf')
best_model = None
best_model_path = ""

[34m[1mwandb[0m: Currently logged in as: [33mmrawhani5[0m ([33mmrawhani[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
reload(processed)
for seed in seeds:
    #wandb.init(project=project_name, name=f'{domain}_{type}_run_with_seed_{seed}', config={'seed': seed})

    try:
        seed_everything(seed)

        hparams = {
            "source_target": "government_fiction",
            "source_domain": "government",
            "target_domain": "fiction",
            "domain_adapter_name": "mlm_union_F",
            "task_adapter_name": "GFUni",
            "pretrained_model_name": "bert-base-uncased",
            "padding": "max_length",
            "max_seq_length": 128,
            "bsz": 32,
            "num_classes": 3,
            "learning_rate": 1e-4,
            "reduction_factor": 16,
            "mode": "domain",
            "saved_adapter_dir": "../../saved/adapters",
        }

        save_dir = "checkpoints"
        save_epoch_3 = 6  # Save model at the 3rd epoch
        
        dm = processed.DataModuleSourceTargetMixed(hparams)
        dm.setup('fit')
        dm.setup("test")
        source_length, target_length = dm.get_dataset_lengths()
        print(f"Source dataset length: {source_length}")
        print(f"Target dataset length: {target_length}")
        model = JointDomainTaskAdapter(hparams,source_length,target_length)

        checkpoint_callback = ModelCheckpoint(
            filename="task-GFUni-{epoch:02d}-{val_loss:.2f}",
            save_top_k=1,
            monitor="val_loss",
            mode="min",
        )
        save_model_callback_epoch = ModelCheckpoint(
            filename="GFUni-{epoch:02d}",
            every_n_epochs=save_epoch_3,
            save_top_k=-1,
        )

        #wandb_logger = WandbLogger()

    except Exception as e:
        print(f"Error during preprocessing : {e}")

    try:
        train_loader = dm.train_dataloader()
        val_loader = dm.val_dataloader()
        trainer = Trainer(
            max_epochs=10,
            accelerator="auto",
            precision=16,
            
            default_root_dir="checkpoints",
            #logger=wandb_logger,
            callbacks=[checkpoint_callback, save_model_callback_epoch],
            limit_train_batches=1.0,
            limit_val_batches=1.0,
            limit_test_batches=1.0,
        )

        trainer.fit(model, train_loader, val_loader)
        print(f"Best checkpoint path: {checkpoint_callback.best_model_path}")
        print(f"Saved epoch checkpoint path: {save_model_callback_epoch.best_model_path}")
    except Exception as e:
        print(f"Error during training : {e}")

    try:
        dm.setup("test")
        test_loader = dm.test_dataloader()
        test_results_last = trainer.test(model, test_loader)
        print("Test Results Last Epoch:", test_results_last)

        # Collect results for last epoch model
        for key, value in test_results_last[0].items():
            results["last_epoch"][key].append(value)

        # Paths to the saved checkpoints
        best_checkpoint_path = checkpoint_callback.best_model_path
        saved_epoch_checkpoint_path = save_model_callback_epoch.best_model_path
        print(f"Best checkpoint path: {best_checkpoint_path}")
        print(f"Saved epoch checkpoint path: {saved_epoch_checkpoint_path}")

        best_model = JointDomainTaskAdapter.load_from_checkpoint(best_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_best = trainer.test(best_model, test_loader)
        print("Test Results on Best Model:", test_results_best)
        for key, value in test_results_best[0].items():
            results["best_model"][key].append(value)

        saved_epoch_model = JointDomainTaskAdapter.load_from_checkpoint(saved_epoch_checkpoint_path,source_dataset_length=source_length, target_dataset_length=target_length)
        test_results_saved_epoch = trainer.test(saved_epoch_model, test_loader)
        print("Test Results on saved epoch:", test_results_saved_epoch)
        for key, value in test_results_saved_epoch[0].items():
            results["epoch_saved"][key].append(value)

    except Exception as e:
        print(f"Error during testing: {e}")

    #wandb.finish()

Seed set to 42




Batch size: 32


Source genre: government
Target genre: fiction
Number of target samples: 69613


Source genre: government
Target genre: fiction
Number of target samples: 69613


Source dataset length: 69615
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  state_dict = torch.load(weights_file, map_location="cpu")


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/connector.py:563: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
Using 16bit Automatic Mixed Precision (AMP)


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/amp.py:55: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.
GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


eee



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


Sanity Checking: |                                                                                            …

val/accuracy: 0.46875
val/f1: 0.46796971559524536
val/taskclf_loss: 1.0929336547851562
val/loss: 1.185624122619629
val/mlm_loss: 1.5908902883529663


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8083452582359314
val/f1: 0.8090802431106567
val/taskclf_loss: 0.4941158890724182
val/loss: 0.6913479566574097
val/mlm_loss: 1.5536962747573853


Validation: |                                                                                                 …

val/accuracy: 0.8222915530204773
val/f1: 0.822269082069397
val/taskclf_loss: 0.4821905195713043
val/loss: 0.679812490940094
val/mlm_loss: 1.54386568069458


Validation: |                                                                                                 …

val/accuracy: 0.8269908428192139
val/f1: 0.8275786638259888
val/taskclf_loss: 0.48394495248794556
val/loss: 0.680866539478302
val/mlm_loss: 1.5418572425842285


Validation: |                                                                                                 …

val/accuracy: 0.8204837441444397
val/f1: 0.8200271725654602
val/taskclf_loss: 0.535743772983551
val/loss: 0.7236207127571106
val/mlm_loss: 1.5450663566589355


Validation: |                                                                                                 …

val/accuracy: 0.8188049793243408
val/f1: 0.8185510039329529
val/taskclf_loss: 0.5762070417404175
val/loss: 0.7541297674179077
val/mlm_loss: 1.532052755355835


Validation: |                                                                                                 …

val/accuracy: 0.8354125022888184
val/f1: 0.8354068398475647
val/taskclf_loss: 0.5786231160163879
val/loss: 0.7507041692733765
val/mlm_loss: 1.503085970878601


Validation: |                                                                                                 …

val/accuracy: 0.8361087441444397
val/f1: 0.836269199848175
val/taskclf_loss: 0.6132460236549377
val/loss: 0.7786994576454163
val/mlm_loss: 1.5021041631698608


Validation: |                                                                                                 …

val/accuracy: 0.8361087441444397
val/f1: 0.8363271951675415
val/taskclf_loss: 0.6391647458076477
val/loss: 0.8019211888313293
val/mlm_loss: 1.5135337114334106


Validation: |                                                                                                 …

val/accuracy: 0.8374000191688538
val/f1: 0.8375435471534729
val/taskclf_loss: 0.6328111886978149
val/loss: 0.7957180738449097
val/mlm_loss: 1.5079877376556396


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8374785780906677
val/f1: 0.8377047181129456
val/taskclf_loss: 0.6313465237617493
val/loss: 0.79234778881073
val/mlm_loss: 1.496286153793335


Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-GFUni-epoch=01-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/GFUni-epoch=05.ckpt


Source genre: government


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Target genre: fiction
Number of target samples: 69613


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.6319634318351746, 'source_test/accuracy': 0.8477582335472107, 'source_test/f1': 0.847724437713623, 'source_test/f1_macro': 0.8426411151885986, 'source_test/f1_micro': 0.8477582335472107, 'target_test/loss': 0.8976799249649048, 'target_test/accuracy': 0.7608006596565247, 'target_test/f1': 0.7611796259880066, 'target_test/f1_macro': 0.7496859431266785, 'target_test/f1_micro': 0.7608006596565247}]
Best checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/task-GFUni-epoch=01-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_9/checkpoints/GFUni-epoch=05.ckpt


/home/guest/.cache/pypoetry/virtualenvs/third-experments-xuKQSur9-py3.8/lib/python3.8/site-packages/lightning_fabric/utilities/cloud_io.py:57: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.4804317355155945, 'source_test/accuracy': 0.8137240409851074, 'source_test/f1': 0.8132922053337097, 'source_test/f1_macro': 0.8082744479179382, 'source_test/f1_micro': 0.8137240409851074, 'target_test/loss': 0.6946434378623962, 'target_test/accuracy': 0.7320708632469177, 'target_test/f1': 0.7316169738769531, 'target_test/f1_macro': 0.7217115163803101, 'target_test/f1_micro': 0.7320708632469177}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 10


Test Results on saved epoch: [{'source_test/loss': 0.584538459777832, 'source_test/accuracy': 0.8447340726852417, 'source_test/f1': 0.8443819880485535, 'source_test/f1_macro': 0.839821457862854, 'source_test/f1_micro': 0.8447340726852417, 'target_test/loss': 0.8366706967353821, 'target_test/accuracy': 0.7592886090278625, 'target_test/f1': 0.7587594985961914, 'target_test/f1_macro': 0.7502334713935852, 'target_test/f1_micro': 0.7592886090278625}]
Batch size: 32


Source genre: government
Target genre: fiction
Number of target samples: 69613


Source genre: government
Target genre: fiction
Number of target samples: 69613
Source dataset length: 69615
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.296875
val/f1: 0.304953396320343
val/taskclf_loss: 1.1084747314453125
val/loss: 1.2093687057495117
val/mlm_loss: 1.650502324104309


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.8122192621231079
val/f1: 0.8126157522201538
val/taskclf_loss: 0.48552387952804565
val/loss: 0.6838462948799133
val/mlm_loss: 1.5509623289108276


Validation: |                                                                                                 …

val/accuracy: 0.8200963139533997
val/f1: 0.8199272155761719
val/taskclf_loss: 0.4860477149486542
val/loss: 0.6805318593978882
val/mlm_loss: 1.5308657884597778


Validation: |                                                                                                 …

val/accuracy: 0.8230663537979126
val/f1: 0.823126494884491
val/taskclf_loss: 0.49780285358428955
val/loss: 0.6920371651649475
val/mlm_loss: 1.5412788391113281


Validation: |                                                                                                 …

val/accuracy: 0.824666440486908
val/f1: 0.8241720795631409
val/taskclf_loss: 0.5295073390007019
val/loss: 0.7165496945381165
val/mlm_loss: 1.534346342086792


Validation: |                                                                                                 …

val/accuracy: 0.8213090300559998
val/f1: 0.8212035894393921
val/taskclf_loss: 0.5712742209434509
val/loss: 0.747934877872467
val/mlm_loss: 1.5203403234481812


Validation: |                                                                                                 …

val/accuracy: 0.8332172632217407
val/f1: 0.8328245282173157
val/taskclf_loss: 0.589290976524353
val/loss: 0.7613932490348816
val/mlm_loss: 1.5138674974441528


Validation: |                                                                                                 …

val/accuracy: 0.8366252183914185
val/f1: 0.8361808657646179
val/taskclf_loss: 0.6060149073600769
val/loss: 0.7738457918167114
val/mlm_loss: 1.5076448917388916


Validation: |                                                                                                 …

val/accuracy: 0.8343008756637573
val/f1: 0.8340089321136475
val/taskclf_loss: 0.6259379386901855
val/loss: 0.7893071174621582
val/mlm_loss: 1.5035982131958008


Validation: |                                                                                                 …

val/accuracy: 0.8362378478050232
val/f1: 0.8360635042190552
val/taskclf_loss: 0.6204273700714111
val/loss: 0.7868953943252563
val/mlm_loss: 1.5147360563278198


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8366252183914185
val/f1: 0.8364721536636353
val/taskclf_loss: 0.6214289665222168
val/loss: 0.7861636877059937
val/mlm_loss: 1.5064257383346558


Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-GFUni-epoch=01-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/GFUni-epoch=05.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Source genre: government
Target genre: fiction
Number of target samples: 69613


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.6274845004081726, 'source_test/accuracy': 0.8376775979995728, 'source_test/f1': 0.836969792842865, 'source_test/f1_macro': 0.8319224715232849, 'source_test/f1_micro': 0.8376775979995728, 'target_test/loss': 0.918418288230896, 'target_test/accuracy': 0.7560003995895386, 'target_test/f1': 0.7553254961967468, 'target_test/f1_macro': 0.7463375926017761, 'target_test/f1_micro': 0.7560003995895386}]
Best checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/task-GFUni-epoch=01-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_10/checkpoints/GFUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.507007896900177, 'source_test/accuracy': 0.8031393885612488, 'source_test/f1': 0.8030901551246643, 'source_test/f1_macro': 0.797439694404602, 'source_test/f1_micro': 0.8031393885612488, 'target_test/loss': 0.6660726070404053, 'target_test/accuracy': 0.7292866706848145, 'target_test/f1': 0.7283289432525635, 'target_test/f1_macro': 0.7204146981239319, 'target_test/f1_micro': 0.7292866706848145}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Seed set to 100


Test Results on saved epoch: [{'source_test/loss': 0.5958074331283569, 'source_test/accuracy': 0.8336453437805176, 'source_test/f1': 0.832648515701294, 'source_test/f1_macro': 0.8284580707550049, 'source_test/f1_micro': 0.8336453437805176, 'target_test/loss': 0.8680083751678467, 'target_test/accuracy': 0.7542482614517212, 'target_test/f1': 0.7536938190460205, 'target_test/f1_macro': 0.7447036504745483, 'target_test/f1_micro': 0.7542482614517212}]
Batch size: 32


Source genre: government


Target genre: fiction
Number of target samples: 69613


Source genre: government
Target genre: fiction
Number of target samples: 69613
Source dataset length: 69615
Target dataset length: 15922


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 16bit Automatic Mixed Precision (AMP)


GPU available: True (cuda), used: True


TPU available: False, using: 0 TPU cores


IPU available: False, using: 0 IPUs


HPU available: False, using: 0 HPUs


`Trainer(limit_train_batches=1.0)` was configured so 100% of the batches per epoch will be used..


`Trainer(limit_val_batches=1.0)` was configured so 100% of the batches will be used..


`Trainer(limit_test_batches=1.0)` was configured so 100% of the batches will be used..


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | BertAdapterModel   | 119 M 
1 | criterion     | CrossEntropyLoss   | 0     
2 | mlm_criterion | CrossEntropyLoss   | 0     
3 | accuracy      | MulticlassAccuracy | 0     
4 | f1            | MulticlassF1Score  | 0     
5 | f1_macro      | MulticlassF1Score  | 0     
6 | f1_micro      | MulticlassF1Score  | 0     
7 | softmax       | Softmax            | 0     
-----------------------------------------------------
9.5 M     Trainable params
109 M     Non-trainable params
119 M     Total params
476.013   Total estimated model params size (MB)


eee


Sanity Checking: |                                                                                            …

val/accuracy: 0.359375
val/f1: 0.5128205418586731
val/taskclf_loss: 1.0960769653320312
val/loss: 1.2105311155319214
val/mlm_loss: 1.710953712463379


Training: |                                                                                                   …

Validation: |                                                                                                 …

val/accuracy: 0.809249222278595
val/f1: 0.8097748160362244
val/taskclf_loss: 0.4871044158935547
val/loss: 0.6843880414962769
val/mlm_loss: 1.546962022781372


Validation: |                                                                                                 …

val/accuracy: 0.8211293816566467
val/f1: 0.8216399550437927
val/taskclf_loss: 0.4777289628982544
val/loss: 0.6761963963508606
val/mlm_loss: 1.5439460277557373


Validation: |                                                                                                 …

val/accuracy: 0.8221118450164795
val/f1: 0.8222125172615051
val/taskclf_loss: 0.4817976951599121
val/loss: 0.6758634448051453
val/mlm_loss: 1.5243680477142334


Validation: |                                                                                                 …

val/accuracy: 0.8209215998649597
val/f1: 0.8209747076034546
val/taskclf_loss: 0.5138604044914246
val/loss: 0.7029078006744385
val/mlm_loss: 1.5294705629348755


Validation: |                                                                                                 …

val/accuracy: 0.818080723285675
val/f1: 0.8173865675926208
val/taskclf_loss: 0.5838539600372314
val/loss: 0.7595039010047913
val/mlm_loss: 1.5274900197982788


Validation: |                                                                                                 …

val/accuracy: 0.817047655582428
val/f1: 0.8168485164642334
val/taskclf_loss: 0.6130168437957764
val/loss: 0.784454882144928
val/mlm_loss: 1.5340253114700317


Validation: |                                                                                                 …

val/accuracy: 0.8317687511444092
val/f1: 0.8316859602928162
val/taskclf_loss: 0.6273956298828125
val/loss: 0.7926763892173767
val/mlm_loss: 1.5153255462646484


Validation: |                                                                                                 …

val/accuracy: 0.8335765600204468
val/f1: 0.8336717486381531
val/taskclf_loss: 0.651809811592102
val/loss: 0.8097189664840698
val/mlm_loss: 1.5001376867294312


Validation: |                                                                                                 …

val/accuracy: 0.833318293094635
val/f1: 0.8331993222236633
val/taskclf_loss: 0.6728180646896362
val/loss: 0.8273575305938721
val/mlm_loss: 1.5030428171157837


Validation: |                                                                                                 …

`Trainer.fit` stopped: `max_epochs=10` reached.


val/accuracy: 0.8348678946495056
val/f1: 0.834876298904419
val/taskclf_loss: 0.6693304181098938
val/loss: 0.8225919604301453
val/mlm_loss: 1.4926899671554565


Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-GFUni-epoch=02-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/GFUni-epoch=05.ckpt


Source genre: government
Target genre: fiction
Number of target samples: 69613


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results Last Epoch: [{'source_test/loss': 0.6507255434989929, 'source_test/accuracy': 0.8376775979995728, 'source_test/f1': 0.8375648260116577, 'source_test/f1_macro': 0.8318676352500916, 'source_test/f1_micro': 0.8376775979995728, 'target_test/loss': 0.9916231036186218, 'target_test/accuracy': 0.7628167867660522, 'target_test/f1': 0.762233316898346, 'target_test/f1_macro': 0.7535643577575684, 'target_test/f1_micro': 0.7628167867660522}]
Best checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/task-GFUni-epoch=02-val_loss=0.68.ckpt
Saved epoch checkpoint path: checkpoints/lightning_logs/version_11/checkpoints/GFUni-epoch=05.ckpt


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on Best Model: [{'source_test/loss': 0.47182056307792664, 'source_test/accuracy': 0.8286049962043762, 'source_test/f1': 0.8283261656761169, 'source_test/f1_macro': 0.8215165734291077, 'source_test/f1_micro': 0.8286049962043762, 'target_test/loss': 0.6613497138023376, 'target_test/accuracy': 0.74138343334198, 'target_test/f1': 0.741722583770752, 'target_test/f1_macro': 0.7314764261245728, 'target_test/f1_micro': 0.74138343334198}]


Some weights of BertAdapterModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['heads.default.3.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |                                                                                                    …

Test Results on saved epoch: [{'source_test/loss': 0.6288599371910095, 'source_test/accuracy': 0.8172762989997864, 'source_test/f1': 0.8166100978851318, 'source_test/f1_macro': 0.811604380607605, 'source_test/f1_micro': 0.8172762989997864, 'target_test/loss': 0.8675877451896667, 'target_test/accuracy': 0.74138343334198, 'target_test/f1': 0.740641713142395, 'target_test/f1_macro': 0.7324385046958923, 'target_test/f1_micro': 0.74138343334198}]


In [6]:
results.items()

dict_items([('last_epoch', {'source_test/loss': [0.6319634318351746, 0.6274845004081726, 0.6507255434989929], 'source_test/accuracy': [0.8477582335472107, 0.8376775979995728, 0.8376775979995728], 'source_test/f1': [0.847724437713623, 0.836969792842865, 0.8375648260116577], 'source_test/f1_macro': [0.8426411151885986, 0.8319224715232849, 0.8318676352500916], 'source_test/f1_micro': [0.8477582335472107, 0.8376775979995728, 0.8376775979995728], 'target_test/loss': [0.8976799249649048, 0.918418288230896, 0.9916231036186218], 'target_test/accuracy': [0.7608006596565247, 0.7560003995895386, 0.7628167867660522], 'target_test/f1': [0.7611796259880066, 0.7553254961967468, 0.762233316898346], 'target_test/f1_macro': [0.7496859431266785, 0.7463375926017761, 0.7535643577575684], 'target_test/f1_micro': [0.7608006596565247, 0.7560003995895386, 0.7628167867660522]}), ('best_model', {'source_test/loss': [0.4804317355155945, 0.507007896900177, 0.47182056307792664], 'source_test/accuracy': [0.813724040

In [7]:
# Calculate mean and standard deviation for each scenario
mean_results = {scenario: {key: np.mean(values) for key, values in metrics.items()} for scenario, metrics in results.items()}
std_results = {scenario: {key: np.std(values) for key, values in metrics.items()} for scenario, metrics in results.items()}

# Log mean and standard deviation results to wandb
# wandb.init(project=project_name, name=f'{domain}_mean_results')
# for scenario in mean_results:
#     for key, value in mean_results[scenario].items():
#         wandb.log({f"{scenario}/{key}": value})
#         wandb.log({f"{scenario}/{key}_std": std_results[scenario][key]})
# wandb.finish()

print("Mean Results:", mean_results)
print("Standard Deviation Results:", std_results)

# # Save the best model's adapter
# if model:
#     adapter_save_path = f"../../saved/adapter_after_run/{hparams['task_adapter_name']}"
#     model.save_adapter(adapter_save_path, hparams['task_adapter_name'])
#     print(f"Adapter saved to {adapter_save_path}")
# else:
#     print("No best model to save.")

Mean Results: {'last_epoch': {'source_test/loss': 0.6367244919141134, 'source_test/accuracy': 0.8410378098487854, 'source_test/f1': 0.8407530188560486, 'source_test/f1_macro': 0.8354770739873251, 'source_test/f1_micro': 0.8410378098487854, 'target_test/loss': 0.9359071056048075, 'target_test/accuracy': 0.7598726153373718, 'target_test/f1': 0.7595794796943665, 'target_test/f1_macro': 0.7498626311620077, 'target_test/f1_micro': 0.7598726153373718}, 'best_model': {'source_test/loss': 0.48642006516456604, 'source_test/accuracy': 0.8151561419169108, 'source_test/f1': 0.8149028420448303, 'source_test/f1_macro': 0.8090769052505493, 'source_test/f1_micro': 0.8151561419169108, 'target_test/loss': 0.6740219195683798, 'target_test/accuracy': 0.7342469890912374, 'target_test/f1': 0.7338895003000895, 'target_test/f1_macro': 0.7245342135429382, 'target_test/f1_micro': 0.7342469890912374}, 'epoch_saved': {'source_test/loss': 0.6030686100323995, 'source_test/accuracy': 0.8318852384885153, 'source_test

In [8]:
print('dones')

dones


In [9]:
best_val_loss

inf