In [1]:
%load_ext autoreload
%autoreload 2

# Setup Paths

In [2]:
import os
import pyrootutils
root = pyrootutils.setup_root(
    search_from='.',
    indicator=[".git", "pyproject.toml"],
    pythonpath=True,
    dotenv=True,
)
hydra_cfg_path = root / "text_classification_problems" / "configs"
os.chdir(root / "text_classification_problems")

In [3]:
import hydra
from hydra import compose, initialize

from pathlib import Path
import numpy as np
import torch
from core.grads import tree_to_device
from core.tracer import KNN, KNNGD, KNNGN
from sklearn.neighbors import KNeighborsClassifier
from text_classification_problems.datamodule import TextClassifierDataModule
from text_classification_problems.modelmodule import TextClassifierModel
from transformers import AutoTokenizer
from tqdm import tqdm

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Config  

In [4]:
'..' / hydra_cfg_path.relative_to(root)
with initialize(version_base=None, config_path= '../configs'):
    cfg = compose(config_name="tracing", return_hydra_config=True, overrides=["datamodule=snli", "tracer=gd"])

In [5]:
device = 'cuda:1'

# Load Data and Model

In [6]:
from datamodule import TextClassifierDataModule

In [7]:
checkpoint = torch.load("outputs/train/snli/multirun/flip0.2_bert/122_2023-01-08_00-01-22/checkpoints/epoch=00_val_acc=0.8712.ckpt", map_location=device)
datamodule_hparams = checkpoint["datamodule_hyper_parameters"] 
datamodule_hparams["use_denoised_data"] = False

In [8]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
dm = TextClassifierDataModule(
    data_root=os.environ["PYTORCH_DATASET_ROOT"],
    tokenizer=tokenizer,
    **datamodule_hparams,
)
dm.prepare_data()
dm.setup("tracing")

In [9]:
net = hydra.utils.instantiate(cfg.net, num_classes=dm.num_classes)
lit_model = TextClassifierModel(
    net=net,
    num_classes=dm.num_classes,
    lr=1e-3,
)
lit_model.load_state_dict(checkpoint["state_dict"])
net = lit_model.net
lit_model.eval()
lit_model.to(device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# np.savez_compressed('outputs/train/imdb/knn.npz', neibor_inds.numpy())

In [11]:
# neibor_inds = np.load('outputs/train/imdb/knn.npz')['arr_0']

## Tracing

In [12]:
from core.grads import RuntimeGradientExtractor
from core.tracer import GradientNormalize as GN, GradientCosin as GC, GradientBasedTracer as GD
import torch.nn.functional as F
import pandas as pd
from core.aggregation import cal_neibor_matrices
from text_classification_problems.convert_result import eval_ckpt
from text_classification_problems.run_tracing import register_BatchEncoding
import re

In [13]:
register_BatchEncoding()

## Comparation

In [14]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
colected_ks = list(range(50, 1001, 10))
sel_sizes = list(range(100, 5000, 100))
ckpt_path = "outputs/train/snli/multirun/flip0.2_bert/122_2023-01-08_00-01-22/checkpoints/epoch=00_val_acc=0.8712.ckpt"

In [None]:
result_df, all_scores = eval_ckpt(tokenizer,
    lit_model,
    ckpt_path,
    use_denoised_data=False,
    is_self_ref=True,
    sel_sizes=sel_sizes,
    colected_ks=colected_ks,
    use_cache=True,
    device=device)

In [None]:
result_df.to_csv(ckpt_path + ".csv")

In [None]:
torch.save(all_scores, ckpt_path + ".scores")

In [17]:
def get_best_ckpt(checkpoint_dir: Path):
    metrics = [(float(re.search('val_acc=([+-]?([0-9]*[.])?[0-9]+)', str(p)).group(1)),p) for p in checkpoint_dir.glob("epoch*.ckpt")]
    metrics.sort(reverse=True)
    # metrics = [(float(re.search('epoch=([+-]?([0-9]*[.])?[0-9]+)', str(p)).group(1)),p) for p in checkpoint_dir.glob("epoch*.ckpt")]
    # metrics.sort(reverse=False)

    best_ckpt_path = metrics[0][1]
    return best_ckpt_path

In [18]:
imdb_real_train_path = Path("outputs/imdb/flip0.2_bert/")
output_dirs = '''121_2023-01-03_19-26-09
122_2023-01-03_19-26-09
123_2023-01-03_19-26-09
124_2023-01-03_19-26-09
125_2023-01-03_19-26-09'''

best_ckpt_results = []
for run in output_dirs.split('\n'):
    run = imdb_real_train_path / run
    ckpt_path = get_best_ckpt(run / "checkpoints")
# for ckpt_path in imdb_real_train_path.rglob("checkpoints/epoch=*.ckpt"):
    ckpt_path = str(ckpt_path)
    print(ckpt_path)
    result_df, all_scores = eval_ckpt(tokenizer,
        lit_model,
        ckpt_path,
        use_denoised_data=True,
        is_self_ref=False,
        sel_sizes=sel_sizes,
        colected_ks=colected_ks,
        use_cache=True,
        device=device)
    result_df.to_csv(ckpt_path+".csv")
    torch.save(all_scores, ckpt_path + ".scores")

outputs/imdb/flip0.2_bert/121_2023-01-03_19-26-09/checkpoints/epoch=01_val_acc=0.8348.ckpt


RuntimeError: Error(s) in loading state_dict for TextClassifierModel:
	size mismatch for net.fc.weight: copying a param with shape torch.Size([2, 768]) from checkpoint, the shape in current model is torch.Size([3, 768]).
	size mismatch for net.fc.bias: copying a param with shape torch.Size([2]) from checkpoint, the shape in current model is torch.Size([3]).