In [1]:
from abc import ABCMeta, abstractmethod
import torch


class BaseDivergence(metaclass=ABCMeta):
    """Abstract interface for a divergence measure between two samples.

    Subclasses must override :meth:`calculate`. Because ``calculate`` is
    declared abstract, this class cannot be instantiated directly.
    """

    @abstractmethod
    def calculate(self, source_sample: torch.Tensor, target_sample: torch.Tensor):
        """Compute the divergence between ``source_sample`` and ``target_sample``.

        The base implementation has no body and therefore returns ``None``.
        """

In [2]:
import torch


def coral(source: torch.Tensor, target: torch.Tensor):
    """
    CORAL-style second-order discrepancy between two feature matrices.

    :param source: torch.Tensor
        Source-domain features; dimension 1 is taken as the feature dimension ``d``.
    :param target: torch.Tensor
        Target-domain features with the same feature dimension.
    :return
        torch.Tensor
            Scalar tensor: the mean squared element-wise difference between the
            two (unnormalised) scatter matrices, divided by ``4 * d * d``.
    """
    feature_dim = source.shape[1]

    def _scatter(features):
        # Mean-centre along dim 0, then form the d x d scatter matrix.
        # No division by the number of rows is applied.
        centred = features.mean(dim=0, keepdim=True) - features
        return centred.t() @ centred

    gap = _scatter(source) - _scatter(target)

    # Mean (not sum) of the squared entries, scaled by 1 / (4 d^2).
    return (gap * gap).mean() / (4 * feature_dim * feature_dim)

class Coral(BaseDivergence):
    """``BaseDivergence`` implementation backed by the module-level ``coral`` function."""

    def __init__(self):
        """Nothing to configure; the measure has no parameters."""

    def __call__(self, source_sample: torch.Tensor, target_sample: torch.Tensor):
        """Make the instance callable; simply forwards to :meth:`calculate`."""
        return self.calculate(source_sample, target_sample)

    def calculate(self, source_sample: torch.Tensor, target_sample: torch.Tensor):
        """
        Compute the CORAL divergence between two samples.

        :param source_sample: torch.Tensor
            Expected layout: batch_size, embedding_dimension
        :param target_sample: torch.Tensor
            Expected layout: batch_size, embedding_dimension

        :return:
            Whatever ``coral`` returns for the two samples.

        :raises AssertionError: if the two samples do not have identical sizes.
        """
        # Both samples must agree in every dimension, including batch size.
        assert source_sample.size() == target_sample.size()

        return coral(source_sample, target_sample)

In [3]:
import torch
import torch.nn as nn
from transformers import  AutoConfig
from adapters import AutoAdapterModel,AdapterConfig
class DomainAdapter(nn.Module):
    """Pretrained model with a single trainable "pfeiffer" adapter for domain adaptation.

    The wrapped model is configured to return hidden states for every layer so
    that a divergence criterion can be applied to them.
    """

    def __init__(self, pretrained_model_name, source_target, reduction_factor=16, leave_out=None, loss_fn=None):
        """Load the pretrained model and attach the domain adapter.

        Args:
            pretrained_model_name: Name or path passed to ``from_pretrained``.
            source_target: Suffix used to build the adapter name
                ``domain_adapter_{source_target}``.
            reduction_factor: Forwarded to the adapter configuration.
            leave_out: Forwarded to the adapter configuration. ``None``
                (the default) is treated as an empty list.
            loss_fn: Callable taking ``(source_features, target_features)``;
                stored as ``self.criterion``.
        """
        super().__init__()

        # Use a fresh list per instance instead of a shared mutable default.
        leave_out = [] if leave_out is None else list(leave_out)

        self.config = AutoConfig.from_pretrained(pretrained_model_name)
        self.config.output_hidden_states = True  # To get layer-wise outputs

        self.model = AutoAdapterModel.from_pretrained(pretrained_model_name, config=self.config)

        # Configure the adapter and select it for training.
        adapter_name = f"domain_adapter_{source_target}"
        adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=reduction_factor, leave_out=leave_out)
        self.model.add_adapter(adapter_name, config=adapter_config)
        self.model.train_adapter(adapter_name)

        # Loss function (e.g., Coral, CMD, etc.)
        self.criterion = loss_fn

    def forward(self, input_ids, attention_mask=None):
        """Run the wrapped model and return its per-layer hidden states.

        Returns:
            ``outputs.hidden_states`` without its first entry (the embeddings
            output), so one entry per remaining layer.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.hidden_states[1:]  # Exclude embeddings output

    def compute_loss(self, source_input_ids, target_input_ids, source_attention_mask=None, target_attention_mask=None):
        """Apply the criterion to the last-layer hidden states of both domains.

        Raises:
            ValueError: if no ``loss_fn`` was supplied at construction time.
        """
        # Fail before the two forward passes rather than with an opaque
        # "'NoneType' object is not callable" afterwards.
        if self.criterion is None:
            raise ValueError("DomainAdapter was created without a loss_fn; cannot compute a loss.")

        source_outputs = self.forward(source_input_ids, source_attention_mask)
        target_outputs = self.forward(target_input_ids, target_attention_mask)

        # The criterion receives the final hidden state as-is (no pooling).
        # NOTE(review): the `coral` function in this notebook calls `.t()`, which
        # only accepts tensors with at most 2 dimensions; if the hidden state is
        # (batch, seq, hidden) it must be pooled before using Coral here — confirm.
        return self.criterion(source_outputs[-1], target_outputs[-1])

    def save_adapter(self, location, adapter_name):
        """Module to save adapter.
        Args:
            location str: Location where to save adapter.
            adapter_name: Name of adapter to be saved.
        """
        self.model.save_adapter(location, adapter_name)



  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# Make the project's source directories importable, then import project modules.
# The printed value is whatever setup_src_path() returns.
# NOTE(review): the recorded output looks like sys.path with the project's
# `modules` directory appended — presumably the helper extends sys.path; confirm.
from setup import setup_src_path
print(setup_src_path())
# These imports must come after the call above, since they are project-local packages.
import data.processed as processed
import config.config as config
# Rebinds the name `setup` to utils.setup (distinct from the top-level `setup`
# module that setup_src_path was imported from).
import utils.setup as setup
import utils.functions as fn
# reload() is used in later cells to pick up on-disk edits to project modules.
from importlib import reload

['/home/guest/Desktop/projects/intial-experments/domain_adaptation_project/notebooks', '/usr/lib/python38.zip', '/usr/lib/python3.8', '/usr/lib/python3.8/lib-dynload', '', '/home/guest/.cache/pypoetry/virtualenvs/intial-experments-_CPDD38x-py3.8/lib/python3.8/site-packages', '/tmp/tmpbzmfgnlh', '/home/guest/Desktop/projects/intial-experments/domain_adaptation_project/modules']


2024-03-19 16:34:37.485136: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-19 16:34:37.519233: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Re-import the project modules so that edits made on disk since the first
# import are picked up without restarting the kernel.
reload(config)
reload(processed)
# Returns three objects. Later cells index `tokenized_data` and `loaded_data`
# by string keys and call `save_to_disk` on `unsupervised_target`; their exact
# structure is defined in data.processed (not visible here).
tokenized_data,loaded_data,unsupervised_target = processed.tokenize_and_load_datasets()


Map:   0%|          | 0/75013 [00:00<?, ? examples/s]

Map:   0%|          | 0/8335 [00:00<?, ? examples/s]

Map:   0%|          | 0/69615 [00:00<?, ? examples/s]

Map:   0%|          | 0/7735 [00:00<?, ? examples/s]

In [6]:
# Pull the four train/eval loader objects out of `loaded_data` in one unpacking step.
source, source_eval, target, target_eval = (
    loaded_data[loader_key]
    for loader_key in (
        'source_loader',
        'source_loader_eval',
        'target_loader',
        'target_loader_eval',
    )
)

In [7]:
# Display the dataset behind the target loader (features and row count).
target.dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 69615
})

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
# Assuming your loss criterion for domain adaptation (e.g., CMD, Coral) is defined
# criterion = YourDivergenceLoss()
# Assuming the model class DomainAdapter is already defined
# Build the adapter-augmented model with Coral as the divergence criterion.
# The adapter is registered under the name "domain_adapter_telephone-travel".
model = DomainAdapter("distilbert-base-uncased", "telephone-travel", reduction_factor=16, leave_out=[], loss_fn=Coral())


# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# All parameters are handed to Adam; the printed summary below reports how
# many of them are actually trainable.
optimizer = optim.Adam(model.parameters(), lr=1e-5)
# Cut the learning rate by 10x when the monitored value stops decreasing.
# NOTE(review): with patience=10 and the training cell's num_epochs = 10, this
# scheduler likely never reduces the LR in this notebook — confirm intent.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=10,
     threshold=0.0001,
            threshold_mode="rel",
            cooldown=0,
            eps=1e-8,
    )


# Project helper; the recorded output shows trainable vs. total parameter counts.
fn.print_trainable_parameters(model)

  _torch_pytree._register_pytree_node(


trainable params: 1039392 || all params: 67432794 || trainable%: 1.5413746611181498


In [9]:
# Count the batches yielded by one full pass over the target loader.
# The counter is deliberately NOT called `sum`: assigning to that name would
# shadow the builtin `sum` for every later cell in the kernel.
num_target_batches = 0
for _ in target:
    num_target_batches += 1

print(num_target_batches)

2175


In [11]:
# Sanity check: report every target batch whose first dimension is not 32.
# NOTE(review): 32 is hard-coded here; presumably it mirrors the batch size
# configured where the loaders are built — confirm against data.processed/config.
for i, batch in enumerate(target):
    # Assuming batch is a dictionary with 'input_ids' as one of the keys.
    # Adjust the key as necessary based on your data structure.
    batch_size = batch['input_ids'].shape[0]
    
    if batch_size != 32:
        print(f"Batch {i} is incomplete with size {batch_size}.")
  

In [12]:
import torch
from tqdm import tqdm

num_epochs = 10  # Adjust according to your needs


def _layerwise_divergence(model, source_batch, target_batch, device):
    """Sum the model's criterion over every returned layer for one batch pair.

    Source and target inputs are concatenated along dim 0 and sent through the
    model in a single forward pass. Each returned layer output is split back
    into its source and target rows, averaged over dim 1, and passed to
    ``model.criterion.calculate``. The per-layer values are added together.

    Args:
        model: Module whose call returns an iterable of per-layer outputs and
            which exposes ``criterion.calculate(source_sample=, target_sample=)``.
        source_batch: Mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        target_batch: Mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The summed divergence over all layers.
    """
    source_input_ids = source_batch["input_ids"].to(device)
    source_attention_mask = source_batch["attention_mask"].to(device)
    target_input_ids = target_batch["input_ids"].to(device)
    target_attention_mask = target_batch["attention_mask"].to(device)

    input_ids = torch.cat([source_input_ids, target_input_ids], dim=0)
    attention_mask = torch.cat([source_attention_mask, target_attention_mask], dim=0)

    # Forward pass for source and target through model
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Split by the actual per-domain batch sizes. Halving the concatenated
    # batch would silently put source rows into the target chunk whenever the
    # two batches differ in size.
    section_sizes = [source_input_ids.shape[0], target_input_ids.shape[0]]

    divergence = 0
    for layer_output in outputs:
        src_feature, trg_feature = torch.split(layer_output, section_sizes, dim=0)
        # Per the original comments the features are
        # [batch_size, seq_length, hidden_dim]; averaging over dim 1 gives
        # [batch_size, hidden_dim].
        # NOTE(review): the mean covers every position of dim 1 and does not use
        # attention_mask; if padded positions should be excluded, a masked mean
        # is needed — confirm intent.
        src_feature = torch.mean(src_feature, dim=1)
        trg_feature = torch.mean(trg_feature, dim=1)
        divergence += model.criterion.calculate(
            source_sample=src_feature, target_sample=trg_feature
        )
    return divergence


for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_train_loss = 0

    # zip() stops at the shorter loader, so each step pairs one source batch
    # with one target batch.
    for source_batch, target_batch in tqdm(zip(source, target)):
        optimizer.zero_grad()

        divergence = _layerwise_divergence(model, source_batch, target_batch, device)
        divergence.backward()
        optimizer.step()

        total_train_loss += divergence.item()

    num_batches_processed = min(len(source), len(target))
    avg_train_loss = total_train_loss / num_batches_processed
    print(f'Epoch {epoch}, Average Training Loss: {avg_train_loss}')

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0
    val_divergences = []  # Collect individual divergences for averaging

    with torch.no_grad():
        for source_batch, target_batch in tqdm(zip(source_eval, target_eval)):
            divergence = _layerwise_divergence(model, source_batch, target_batch, device)

            total_val_loss += divergence.item()
            val_divergences.append(divergence.item())

    num_val_batches_processed = min(len(source_eval), len(target_eval))

    avg_val_loss = total_val_loss / num_val_batches_processed
    mean_divergence = torch.tensor(val_divergences).mean().item()
    print(f'Epoch {epoch}, Average Validation Loss: {avg_val_loss}, Mean Divergence: {mean_divergence}')

    # The plateau scheduler monitors the average validation loss.
    scheduler.step(avg_val_loss)


2175it [07:03,  5.13it/s]


Epoch 0, Average Training Loss: 1.220327337316628e-07


241it [00:29,  8.25it/s]


Epoch 0, Average Validation Loss: 1.2349430278391083e-07, Mean Divergence: 1.2349430278391083e-07


2175it [07:10,  5.06it/s]


Epoch 1, Average Training Loss: 9.720088541563388e-08


241it [00:29,  8.21it/s]


Epoch 1, Average Validation Loss: 9.829448807870232e-08, Mean Divergence: 9.829449254539213e-08


2175it [07:09,  5.07it/s]


Epoch 2, Average Training Loss: 7.705479171345255e-08


241it [00:29,  8.20it/s]


Epoch 2, Average Validation Loss: 7.849066525364584e-08, Mean Divergence: 7.849065752907336e-08


2175it [07:10,  5.05it/s]


Epoch 3, Average Training Loss: 6.243782860729911e-08


241it [00:29,  8.22it/s]


Epoch 3, Average Validation Loss: 6.493652955877925e-08, Mean Divergence: 6.493653614825234e-08


2175it [07:09,  5.06it/s]


Epoch 4, Average Training Loss: 5.222361213166148e-08


241it [00:29,  8.19it/s]


Epoch 4, Average Validation Loss: 5.3843873079848325e-08, Mean Divergence: 5.384387691265147e-08


2175it [07:10,  5.06it/s]


Epoch 5, Average Training Loss: 4.4636659819795423e-08


241it [00:29,  8.24it/s]


Epoch 5, Average Validation Loss: 4.691999360731748e-08, Mean Divergence: 4.691999322403717e-08


2175it [07:05,  5.11it/s]


Epoch 6, Average Training Loss: 3.919606673539368e-08


241it [00:27,  8.67it/s]


Epoch 6, Average Validation Loss: 4.1431330797682956e-08, Mean Divergence: 4.1431327701957343e-08


2175it [07:00,  5.17it/s]


Epoch 7, Average Training Loss: 3.4848341762401745e-08


241it [00:29,  8.24it/s]


Epoch 7, Average Validation Loss: 3.625936531754361e-08, Mean Divergence: 3.625936528806051e-08


2175it [07:09,  5.06it/s]


Epoch 8, Average Training Loss: 3.132534382284471e-08


241it [00:29,  8.20it/s]


Epoch 8, Average Validation Loss: 3.2742865234258685e-08, Mean Divergence: 3.274286441978802e-08


2175it [07:09,  5.06it/s]


Epoch 9, Average Training Loss: 2.8428099611967983e-08


241it [00:29,  8.25it/s]

Epoch 9, Average Validation Loss: 3.0689200705829156e-08, Mean Divergence: 3.0689204066902676e-08





In [30]:
# Show which adapter setup is currently active on the wrapped model.
model.model.active_adapters

Stack[domain_adapter_telephone-travel]

In [13]:
# Persist the trained adapter. The second argument is the adapter's registered
# name (hyphenated "telephone-travel", as built in DomainAdapter.__init__);
# the output directory name uses an underscore instead.
model.save_adapter(f"{config.Config.ADAPTER_SAVE_PATH}/domain_adapter_telephone_travel", "domain_adapter_telephone-travel")

In [14]:
# Attach a 3-label classification head named "task-test-after-coral" to the
# wrapped model so it can be evaluated on the labelled target test set.
model.model.add_classification_head(
    "task-test-after-coral",
    num_labels=3,
  )

In [15]:
# Evaluate the adapter-augmented model on the target-domain test loader.
target_test = loaded_data['test_target_loader']

# NOTE(review): the variables are named `*_before` but the messages say
# "after adaptation" — confirm which is intended.
# NOTE(review): the classification head was added in the preceding cell and no
# visible cell trains it before this evaluation, so these scores presumably
# reflect an untrained head — confirm.
accuracy_before, f1_before = fn.evaluate_model(model.model, target_test)
print(f"Accuracy after adaptation: {accuracy_before}")
print(f"F1 score after adaptation: {f1_before}")

Accuracy after adaptation: 0.32274590163934425
F1 score after adaptation: 0.2554702737690946


In [16]:
# Pick up any on-disk changes to the config before reading the save path.
reload(config)

datasets_dir = config.Config.DATASETS_SAVE_PATH

# (key in `tokenized_data`, directory name under the datasets save path)
for split_key, dir_name in (
    ('source', 'source_data'),
    ('eval_source', 'source_data_eval'),
    ('target', 'target_data'),
    ('eval_target', 'target_data_eval'),
    ('test_target', 'test_target_data'),
):
    tokenized_data[split_key].save_to_disk(f"{datasets_dir}/{dir_name}")

# The unsupervised target set is a separate object, saved alongside the splits.
unsupervised_target.save_to_disk(f"{datasets_dir}/unsupervised_target")


Saving the dataset (0/1 shards):   0%|          | 0/75013 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7735 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/69615 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7735 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/52350 [00:00<?, ? examples/s]