In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import os
from os import path
import sys
# Name of the directory that sits at the root of the git repository.
REPO_DIRNAME = "ExplanationPairSentencesTasks"

# Point to the git repository: walk up from the kernel's working directory
# until a directory named REPO_DIRNAME is found. The search is done on the
# path string, so the working directory is only changed once the repository
# has actually been located.
repo_root = os.getcwd()
while path.basename(repo_root) != REPO_DIRNAME:
    parent_dir = path.dirname(repo_root)
    if parent_dir == repo_root:
        # The parent of the filesystem root is the root itself: stop here
        # instead of looping forever when the kernel was started outside
        # the repository.
        raise FileNotFoundError(
            f"'{REPO_DIRNAME}' is not an ancestor of {os.getcwd()}; "
            "start the notebook from inside the repository"
        )
    repo_root = parent_dir
os.chdir(repo_root)
cwd = os.getcwd().split(os.path.sep)
print(f">> current directory : {os.getcwd()}")

# make the modules located in <repository>/src importable
src_dir = os.path.join(os.getcwd(), "src")
if src_dir not in sys.path:
    # re-running this cell must not stack duplicate entries
    sys.path.append(src_dir)

# cache and data cache
cache_path = path.join(os.getcwd() ,'.cache')
dataset_path = path.join(cache_path, 'dataset')
log_path = path.join(cache_path, 'logs')
model_path = path.join(cache_path, 'models')
print(f">> cache path : {cache_path}")
print(f">> model path : {model_path}")
print(f">> dataset path : {dataset_path}")
print(f">> logs path : {log_path}")

# import the different modules
from src.data_module.hatexplain import CLSTokenHateXPlainDM
from pur_attention_key_reg import AttitModel
from modules import metrics
from notebooks.attention_based.utils.ckp_config import *

# external librairies
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from tqdm.notebook import tqdm

from modules.metrics.geometry import cosine_sim, effective_rank

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f">> device : {DEVICE}")

>> current directory : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks
>> cache path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache
>> model path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\models
>> dataset path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\dataset
>> logs path : C:\Users\loicf\Documents\IRISA\ExplanationPairSentencesTasks\.cache\logs
>> device : cuda


In [4]:
%%capture
## work on the hatexplain dataset
# Accumulators for the cosine similarities: one entry per model depth
# (1 to 5 layers), each holding one zero-initialised slot per layer.
sim_k_dict, sim_v_dict = {}, {}
for depth in range(1, 6):
    sim_k_dict[f"n_layer={depth}"] = np.zeros(depth)
    sim_v_dict[f"n_layer={depth}"] = np.zeros(depth)

# Settings forwarded to the HateXplain data module.
# presumably n_data caps the number of examples that are loaded — TODO confirm
dm_kwargs = {
    "cache_path": dataset_path,
    "batch_size": 32,
    "num_workers": 0,
    "n_data": 999,
}

dm = CLSTokenHateXPlainDM(**dm_kwargs)

# only the test split is needed in this notebook
dm.prepare_data()
dm.setup(stage="test")
test_dataloader = dm.test_dataloader()

# Keyword arguments handed to AttitModel.load_from_checkpoint for every depth.
# "num_layers" is only a placeholder here: it is overwritten before each model
# is loaded.
# NOTE(review): opt is "adadelta" while the checkpoints loaded in this cell come
# from run directories suffixed "_adam" — confirm this is intended.
model_args = {
    "cache_path": model_path,
    "mode": "exp",
    "vocab": dm.vocab,
    "lambda_entropy": 0,
    "lambda_supervise": 0,
    "lambda_lagrange": 0,
    "pretrained_vectors": "glove.840B.300d",
    "num_layers": 1,
    "num_heads": 1,
    "d_embedding": 300,
    "data": "hatexplain",
    "num_class": dm.num_class,
    "opt": "adadelta",
}
# Correct predictions on the test set, one counter per model depth (1 to 5 layers).
cpt = torch.tensor([0, 0, 0, 0, 0], device=DEVICE)
# Number of test examples scored by the most recently evaluated model; used
# below to normalise the accumulated similarities.
n_samples = 0

for n_layers in range(1, len(cpt) + 1):

    # update the args for the model
    model_args["num_layers"] = n_layers
    run_dir = os.path.join(log_path, "PurAttention", f"run=0_hatexplain_l=0{n_layers}_h=1_adam")
    ckp = os.path.join(run_dir, "checkpoints", "best.ckpt")
    hparams = os.path.join(run_dir, "hparams.yaml")

    # the model
    model = AttitModel.load_from_checkpoint(ckp, hparams_file=hparams, **model_args)
    model = model.eval()

    n_samples = 0
    with torch.no_grad():
        model = model.to(DEVICE)
        # Wrapping the dataloader itself lets tqdm take the number of batches
        # from len(): 999 examples in batches of 32 make 32 batches, whereas
        # int(999/32) is 31.
        pbar = tqdm(test_dataloader, desc="proceed the similarity metric")
        for batch in pbar:
            ids = batch["token_ids"].to(DEVICE)
            padding_mask = batch["padding_mask"].bool().to(DEVICE)
            output = model(ids=ids, mask=padding_mask)
            cl = output["logits"].argmax(dim=-1)
            cpt[n_layers - 1] += (cl == batch["y_true"].to(DEVICE)).sum().item()
            # one prediction per example: count what was really evaluated
            n_samples += cl.numel()
            key_emb, value_emb = output["key_embeddings"], output["value_embeddings"]
            # cosine similarity of the key / value embeddings on each layer
            for layer in range(n_layers):
                sim_k = cosine_sim(key_emb[layer], padding_mask, normalize="")
                sim_v = cosine_sim(value_emb[layer], padding_mask, normalize="")

                sim_k_dict[f"n_layer={n_layers}"][layer] += sim_k.sum().item()
                sim_v_dict[f"n_layer={n_layers}"][layer] += sim_v.sum().item()

    # release the GPU memory before the next model is loaded
    model = model.cpu()
    del model
    torch.cuda.empty_cache()

if n_samples == 0:
    raise RuntimeError("the test dataloader yielded no example: nothing to average")

# Turn the accumulated sums into per-example averages, using the number of
# examples that were actually scored.
for depth_key in sim_k_dict:
    sim_k_dict[depth_key] = sim_k_dict[depth_key] / n_samples
    sim_v_dict[depth_key] = sim_v_dict[depth_key] / n_samples

In [8]:
# Accuracy (in %) of each model depth: correct-prediction counts divided by 999,
# the n_data value given to the data module.
accuracy_ratio = cpt / 999
accuracy_ratio.cpu() * 100

tensor([62.4625, 62.4625, 63.6637, 63.9640, 62.7628])

In [6]:
# Key-embedding cosine similarity, accumulated over the test batches and
# normalised by the test-set size: one entry per model depth
# ("n_layer=<depth>"), one array slot per layer of that model.
sim_k_dict

{'n_layer=1': array([0.71326747]),
 'n_layer=2': array([0.69151901, 0.68288404]),
 'n_layer=3': array([0.69846342, 0.68784525, 0.84177555]),
 'n_layer=4': array([0.61357826, 0.61971813, 0.74860878, 0.83997919]),
 'n_layer=5': array([0.62424775, 0.64732869, 0.77722902, 0.91285675, 0.97317868]),
 'n_layer=6': array([0., 0., 0., 0., 0., 0.])}

In [7]:
# Value-embedding cosine similarity, accumulated over the test batches and
# normalised by the test-set size: one entry per model depth
# ("n_layer=<depth>"), one array slot per layer of that model.
sim_v_dict

{'n_layer=1': array([0.99999999]),
 'n_layer=2': array([1.00000002, 0.99999997]),
 'n_layer=3': array([0.99999998, 0.99999999, 0.99999999]),
 'n_layer=4': array([1.        , 0.99999998, 0.99999994, 0.99999998]),
 'n_layer=5': array([0.99999995, 0.99999999, 0.99999997, 0.99999999, 0.99999997]),
 'n_layer=6': array([0., 0., 0., 0., 0., 0.])}

Une première conclusion est que l'attention ne sert ici absolument à rien : les vecteurs de valeurs sont tout le temps au même endroit, leur similarité cosinus valant ≈ 1 sur chaque couche, quel que soit le nombre de couches du modèle.