# Counterfactual Workflow

1. Calculate the importance of words
2. Load a masked language model
3. Generate the counterfactual (CF) dataset with the masked language model

In [3]:
# Import all packages used in the teacher's scripts
import dataclasses
import shutil
import yaml
from pathlib import Path
from enum import Enum
import json

import wandb
import numpy as np
import pandas as pd

from sklearn.metrics import (
    precision_recall_fscore_support,
    multilabel_confusion_matrix,
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
    roc_auc_score,
    average_precision_score, matthews_corrcoef
)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import compute_class_weight

import torch
from sklearn.utils.extmath import softmax
from torch.cuda.amp import autocast
from torch.nn import CrossEntropyLoss

from datasets import load_dataset, concatenate_datasets
import transformers
import transformers.adapters.composition as ac
from transformers import (
    AdapterConfig,
    AdapterTrainer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    EarlyStoppingCallback,
    HfArgumentParser,
    MultiLingAdapterArguments,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed, TrainerCallback, XLMRobertaTokenizer,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version

from root import DATA_DIR, AUGMENTED_DIR
from utils.custom_callbacks import CustomWandbCallback
from long import LongBert
from arguments.data_arguments import DataArguments, ProblemType, SegmentationType, DataAugmentationType, LegalArea, \
    OriginCanton, SubDataset, OriginRegion, Jurisdiction
from hierarchical.hier_bert.configuration_hier_bert import HierBertConfig
from hierarchical.hier_bert.modeling_hier_bert import HierBertForSequenceClassification
from hierarchical.hier_camembert.configuration_hier_camembert import HierCamembertConfig
from hierarchical.hier_camembert.modeling_hier_camembert import HierCamembertForSequenceClassification
from hierarchical.hier_roberta.configuration_hier_roberta import HierRobertaConfig
from hierarchical.hier_roberta.modeling_hier_roberta import HierRobertaForSequenceClassification
from hierarchical.hier_xlm_roberta.configuration_hier_xlm_roberta import HierXLMRobertaConfig
from hierarchical.hier_xlm_roberta.modeling_hier_xlm_roberta import HierXLMRobertaForSequenceClassification
from arguments.model_arguments import ModelArguments, LabelImbalanceMethod, LongInputBertType, TrainType
from utils.sentencizer import get_sentencizer, combine_small_sentences, spacy_sentencize, get_spacy_sents


## Calculate importance of words
1. Load the model
2. Load and preprocess the data
3. Calculate the importance

### Load Model

In [4]:
def baseline():
    """Create the hierarchical XLM-RoBERTa sequence classifier.

    The label mapping is read from ``DATA_DIR / 'de' / 'labels.json'``. An
    ``xlm-roberta-base`` config carrying that mapping and the segment settings
    (4 segments of 512 tokens, transformer segment encoder) is converted into a
    ``HierXLMRobertaConfig``, which is then used to load
    ``HierXLMRobertaForSequenceClassification`` from the ``xlm-roberta-base``
    checkpoint.

    Returns:
        The instantiated model.
    """
    checkpoint = 'xlm-roberta-base'

    with open(DATA_DIR / 'de' / 'labels.json', 'r') as labels_file:
        raw_labels = json.load(labels_file)
    # JSON object keys are always strings, so the ids are converted back to int.
    id2label = {int(idx): name for idx, name in raw_labels['id2label'].items()}
    label2id = {name: int(idx) for name, idx in raw_labels['label2id'].items()}

    base_config = AutoConfig.from_pretrained(
        checkpoint,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        finetuning_task="text-classification",
        problem_type='single_label_classification',
        cache_dir=None,
        revision='main',
        use_auth_token=None,
        max_segments=4,
        max_segment_length=512,
        segment_encoder_type="transformer",
    )
    # Re-wrap the generic config in the hierarchical config class.
    hier_config = HierXLMRobertaConfig(**base_config.to_dict())

    return HierXLMRobertaForSequenceClassification.from_pretrained(
        checkpoint,
        from_tf=".ckpt" in checkpoint,
        config=hier_config,
        cache_dir=None,
        revision='main',
        use_auth_token=None,
    )



def load_model(model, folder, device='cuda:1'):
    """Load weights from ``<folder>/model.bin`` into ``model`` and move it to ``device``.

    Args:
        model: module whose parameters match the stored state dict.
        folder: directory expected to contain ``model.bin``.
        device: device the checkpoint tensors are mapped to and the model is
            moved to. Defaults to ``'cuda:1'``, the previously hard-coded value.

    Returns:
        None. ``model`` is modified in place.

    If the checkpoint file does not exist, a warning is issued and the model
    keeps its current weights. It is still moved to ``device`` so that code
    sending its inputs to that device does not hit a device mismatch.
    """
    import warnings

    model_path = Path(folder) / 'model.bin'
    if model_path.exists():
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        warnings.warn(f"No checkpoint found at {model_path}; the model keeps its current weights.")
    model.to(device)



In [5]:
# Build the architecture from the pretrained checkpoint, then overwrite its weights with the
# teacher model stored in 'sjp/teacher_model/model.bin' (no weights are loaded if that file is absent).
model = baseline()
load_model(model, 'sjp/teacher_model')

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing HierXLMRobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing HierXLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HierXLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HierXLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['

### Load And Preprocess Data
To simplify my work and reduce the running time, I use only the French training dataset throughout this notebook.

In [6]:
def load_data(lang='fr'):
    """Return the training split of one language as loaded by ``load_dataset``.

    Args:
        lang: name of the sub-directory of ``DATA_DIR`` that holds ``train.csv``
            (default ``'fr'``).
    """
    train_csv = DATA_DIR / lang / 'train.csv'
    splits = load_dataset("csv", data_files={'train': train_csv.as_posix()})
    return splits['train']

# Despite the plural name this holds a single split: the 'train' split for the default language 'fr'.
datasets = load_data()

Using custom data configuration default-949afda011810beb
Reusing dataset csv (/home/xxkx236/.cache/huggingface/datasets/csv/default-949afda011810beb/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Tokenizer for the same 'xlm-roberta-base' checkpoint the classifier is built from.
# It is used as a global by the preprocessing and importance functions below.
tokenizer = AutoTokenizer.from_pretrained(
        'xlm-roberta-base',
        do_lower_case=False,
        cache_dir=None,
        use_fast=True,
        revision='main',
        use_auth_token=None,
    )

def append_zero_segments(case_encodings, pad_token_id, max_segments=4, max_segment_length=512):
    """Fill a document's list of segments with filler segments up to ``max_segments``.

    Args:
        case_encodings: list of per-segment value lists (ids or mask values).
        pad_token_id: value written to every position of an added segment.
        max_segments: number of segments the result should have (default 4).
        max_segment_length: length of each added filler segment (default 512).

    Returns:
        A new list with the original segments followed by the filler segments.
        If ``case_encodings`` already has ``max_segments`` or more segments,
        nothing is added.
    """
    missing = max(0, max_segments - len(case_encodings))
    # Build every filler segment separately: ``[[pad] * n] * k`` would repeat one and the
    # same inner list k times, so changing one filler would silently change all of them.
    filler = [[pad_token_id] * max_segment_length for _ in range(missing)]
    return case_encodings + filler

def preprocess_function(batch):
    """Turn a batch of raw documents into segment-wise model inputs.

    Every text is tokenized without special tokens, cut into blocks of up to
    512 ids (at most 4 blocks, because the text is truncated to 4 * 512 ids)
    and decoded back into strings. Each block is then tokenized on its own,
    padded to 512 ids. Documents with fewer than 4 segments are filled up with
    all-padding segments via ``append_zero_segments``.

    Args:
        batch: mapping with a ``"text"`` list and, optionally, a ``"label"`` list.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``token_type_ids`` (one
        list of 4 segments per document) and, when the batch has labels,
        ``label`` holding the ids looked up in ``labels.json``.

    An empty document no longer raises ``IndexError``; it is encoded as 4
    all-padding segments.
    """
    max_segments, max_segment_length = 4, 512
    padding = "max_length"
    pad_id = tokenizer.pad_token_id

    # First pass: one long id sequence per document.
    flat = tokenizer(batch["text"], padding=padding, truncation=True,
                     max_length=max_segments * max_segment_length,
                     add_special_tokens=False)  # prevent the cls and sep tokens from being added twice

    segments = []
    for doc_ids in flat['input_ids']:
        # Keep only blocks that start with a real token, i.e. drop blocks made purely of padding.
        id_blocks = [doc_ids[start:start + max_segment_length]
                     for start in range(0, len(doc_ids), max_segment_length)
                     if doc_ids[start] != pad_id]
        if id_blocks:
            # The last kept block may be partially padded; strip those pad ids.
            id_blocks[-1] = [tok for tok in id_blocks[-1] if tok != pad_id]
        segments.append([
            tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(block))
            for block in id_blocks
        ])

    # Second pass: tokenize each segment separately.
    tokenized = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
    for case in segments:
        if case:
            case_encodings = tokenizer(case[:max_segments], padding=padding, truncation=True,
                                       max_length=max_segment_length, return_token_type_ids=True)
        else:
            # Empty document: nothing to tokenize, every segment becomes a filler segment.
            case_encodings = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
        tokenized['input_ids'].append(append_zero_segments(case_encodings['input_ids'], pad_id))
        tokenized['attention_mask'].append(append_zero_segments(case_encodings['attention_mask'], 0))
        tokenized['token_type_ids'].append(append_zero_segments(case_encodings['token_type_ids'], 0))

    if "label" in batch:
        # The label names are mapped with the mapping stored in the 'de' folder.
        with open(DATA_DIR / 'de' / 'labels.json', 'r') as f:
            label2id = {k: int(v) for k, v in json.load(f)['label2id'].items()}
        tokenized["label"] = [label2id[l] for l in batch["label"]]
    return tokenized

def preprocess_dataset(dataset):
    """Apply ``preprocess_function`` batch-wise and drop every original column except ``id``."""
    # "id" is kept for example-wise logging.
    drop_columns = list(filter(lambda name: name != "id", dataset.column_names))
    return dataset.map(
        preprocess_function,
        batched=True,
        load_from_cache_file=False,
        remove_columns=drop_columns,
    )

# NOTE: this rebinds `datasets` from the raw split to the tokenized one. The raw columns other
# than "id" are removed, so re-running this cell requires re-running the loading cell first.
datasets = preprocess_dataset(datasets)

  0%|          | 0/22 [00:00<?, ?ba/s]

### Calculate Importance
To reduce time, I only calculated the importance of the 500 words with the highest frequency

In [63]:
from collections import defaultdict
from tqdm import tqdm
import pickle

def get_logit(input_ids, attention_mask, token_type_ids):
    """Run the global ``model`` on a single document and return its logits.

    Args:
        input_ids: flat or nested list with 4 * 512 token ids.
        attention_mask: flat or nested list with 4 * 512 mask values.
        token_type_ids: flat or nested list with 4 * 512 type ids.

    Each argument is converted to a tensor of shape ``(1, 4, 512)``.

    Returns:
        The ``logits`` attribute of the model output, computed without
        gradient tracking.
    """
    # Follow the model's own device instead of a hard-coded 'cuda:1', so the inputs
    # always end up on the device the weights live on.
    device = next(model.parameters()).device

    def as_batch(values):
        # One document -> batch of size 1 with 4 segments of 512 positions.
        return torch.tensor(values).reshape(1, 4, 512).to(device)

    with torch.no_grad():
        return model(as_batch(input_ids), as_batch(attention_mask), as_batch(token_type_ids)).logits


def get_words(lang='fr', size=1, skip=200):
    """Select frequent tokens from the preprocessed global ``datasets``.

    Token ids are counted over every document. Per document the first id is
    ignored and counting stops at the first padding id. Tokens are ranked by
    descending count, the ``skip`` highest-ranked ones are discarded and the
    next ``size`` are returned.

    Args:
        lang: unused; kept so existing calls keep working.
        size: number of tokens to return.
        skip: number of top-ranked tokens to leave out (default 200, the
            previously hard-coded offset).

    Returns:
        dict mapping token string -> token id.
    """
    from collections import Counter

    pad_id = tokenizer.pad_token_id  # replaces the magic number 1 (the pad id of xlm-roberta-base)
    counts = Counter()
    for data in tqdm(datasets):
        flat_ids = [tok for segment in data['input_ids'] for tok in segment]
        # The first id is skipped; presumably it is the leading special token.
        for tok in flat_ids[1:]:
            if tok == pad_id:
                break
            counts[tok] += 1
    # most_common() sorts by count, descending, keeping first-seen order among ties.
    ranked = counts.most_common()
    selected_ids = [tok for tok, _ in ranked[skip:skip + size]]
    tokens = tokenizer.convert_ids_to_tokens(selected_ids)
    return dict(zip(tokens, selected_ids))

def dist(vec1, vec2):
    """Squared Euclidean distance: the sum of the squared element-wise differences."""
    delta = vec1 - vec2
    squared = delta ** 2
    return squared.sum()

def calc_importance():
    """Estimate how strongly each selected token influences the classifier output.

    For every token id returned by ``get_words()`` and every document in the
    global ``datasets`` that contains it, the first occurrence is replaced by
    the mask token. The squared distance (``dist``) between the logits of the
    original and of the masked document is recorded.

    Returns:
        dict mapping token id -> mean of the recorded distances. Token ids that
        occur in no document are absent from the result.
    """
    # Ids of tokens that are masked together with the target when they directly precede it.
    # The token list is empty, so that branch never fires at the moment.
    negtive_pronouns = tokenizer.convert_tokens_to_ids([])
    mask_id = tokenizer.mask_token_id
    word2importance = defaultdict(list)
    for ids in get_words().values():
        for d in tqdm(datasets):
            # Flatten the 4 segments into one id list.
            input_ids = [tok for segment in d['input_ids'] for tok in segment]
            try:
                idx = input_ids.index(ids)
            except ValueError:
                # The token does not occur in this document.
                continue
            mask_input_ids = list(input_ids)
            mask_input_ids[idx] = mask_id
            if idx > 0 and input_ids[idx-1] in negtive_pronouns:
                mask_input_ids[idx-1] = mask_id
            attention_mask = d['attention_mask']
            token_type_ids = d['token_type_ids']
            l1 = get_logit(input_ids, attention_mask, token_type_ids)
            l2 = get_logit(mask_input_ids, attention_mask, token_type_ids)
            word2importance[ids].append(dist(l1, l2))
    return {word: sum(importance) / len(importance) for word, importance in word2importance.items()}


# Compute the {token id: mean logit distance} mapping and display it as the cell output.
word2importance = calc_importance()
word2importance

{72403: tensor(0.0014, device='cuda:1')}

## Load Masked Language Model

In [None]:
from transformers import AutoModelForMaskedLM, BertForMaskedLM

# 'xlm-roberta-base' is an XLM-RoBERTa checkpoint: let the Auto class pick the matching
# masked-LM architecture (BertForMaskedLM does not match the parameter names of this checkpoint).
mlm = AutoModelForMaskedLM.from_pretrained('xlm-roberta-base')
# Token ids ordered from most to least important.
words = sorted(word2importance.items(), key=lambda x:x[1], reverse=True)
words = [word for word, imp in words]
positive_words = []  # not filled yet
negtive_words = []  # not filled yet
for word in words:
    for d in datasets:
        # Only the first segment of each document is searched.
        input_ids = d['input_ids'][0]
        try:
            idx = input_ids.index(word)
        except ValueError:
            # The token does not occur in the first segment of this document.
            continue
        mask_ids = list(input_ids)
        mask_ids[idx] = tokenizer.mask_token_id
        # Feed the *masked* ids as a (1, seq_len) tensor together with the matching attention mask.
        with torch.no_grad():
            logits = mlm(
                input_ids=torch.tensor([mask_ids]),
                attention_mask=torch.tensor([d['attention_mask'][0]]),
            ).logits
        # Vocabulary scores predicted for the masked position.
        possible_words = logits[0, idx]

**This method turned out to be too difficult to implement, so I tried a different approach instead.**

# Visualize Attention Layers Of The Fitted BERT Model
1. Load the model
2. Load the text data
3. Explain the model

In [3]:
# Import all packages used in the teacher's scripts
import dataclasses
import shutil
import yaml
from pathlib import Path
from enum import Enum
import json

import wandb
import numpy as np
import pandas as pd

from sklearn.metrics import (
    precision_recall_fscore_support,
    multilabel_confusion_matrix,
    classification_report,
    confusion_matrix,
    balanced_accuracy_score,
    roc_auc_score,
    average_precision_score, matthews_corrcoef
)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import compute_class_weight

import torch
from sklearn.utils.extmath import softmax
from torch.cuda.amp import autocast
from torch.nn import CrossEntropyLoss

from datasets import load_dataset, concatenate_datasets
import transformers
import transformers.adapters.composition as ac
from transformers import (
    AdapterConfig,
    AdapterTrainer,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    EarlyStoppingCallback,
    HfArgumentParser,
    MultiLingAdapterArguments,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed, TrainerCallback, XLMRobertaTokenizer,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version

from root import DATA_DIR, AUGMENTED_DIR
from utils.custom_callbacks import CustomWandbCallback
from long import LongBert
from arguments.data_arguments import DataArguments, ProblemType, SegmentationType, DataAugmentationType, LegalArea, \
    OriginCanton, SubDataset, OriginRegion, Jurisdiction
from hierarchical.hier_bert.configuration_hier_bert import HierBertConfig
from hierarchical.hier_bert.modeling_hier_bert import HierBertForSequenceClassification
from hierarchical.hier_camembert.configuration_hier_camembert import HierCamembertConfig
from hierarchical.hier_camembert.modeling_hier_camembert import HierCamembertForSequenceClassification
from hierarchical.hier_roberta.configuration_hier_roberta import HierRobertaConfig
from hierarchical.hier_roberta.modeling_hier_roberta import HierRobertaForSequenceClassification
from hierarchical.hier_xlm_roberta.configuration_hier_xlm_roberta import HierXLMRobertaConfig
from hierarchical.hier_xlm_roberta.modeling_hier_xlm_roberta import HierXLMRobertaForSequenceClassification
from arguments.model_arguments import ModelArguments, LabelImbalanceMethod, LongInputBertType, TrainType

# Load Model

In [4]:
# Label <-> id mapping read from the 'de' data folder; used as globals by baseline() below.
with open(DATA_DIR / 'de' / 'labels.json', 'r') as f:
    label_dict = json.load(f)
# JSON object keys are always strings, so the ids are converted back to int.
label_dict['id2label'] = dict((int(k), v) for k, v in label_dict['id2label'].items())
label_dict['label2id'] = dict((k, int(v)) for k, v in label_dict['label2id'].items())
label_list = [*label_dict["label2id"]]
num_labels = len(label_list)
    
    
def baseline():
    """Create the hierarchical sequence classifier with attention outputs enabled.

    Uses the module-level ``num_labels`` and ``label_dict``. An
    ``xlm-roberta-base`` config (4 segments of 512 tokens, transformer segment
    encoder, ``output_attentions=True``) is converted into the hierarchical
    config class that matches its ``model_type``, and the matching hierarchical
    model class is loaded from the ``xlm-roberta-base`` checkpoint.

    Returns:
        The instantiated model.

    Raises:
        ValueError: if the config's ``model_type`` has no hierarchical
            counterpart (previously this surfaced as an ``UnboundLocalError``).
    """
    checkpoint = 'xlm-roberta-base'

    config = AutoConfig.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        id2label=label_dict["id2label"],
        label2id=label_dict["label2id"],
        finetuning_task="text-classification",
        problem_type='single_label_classification',
        cache_dir=None,
        revision='main',
        use_auth_token=None,
        max_segments=4,
        max_segment_length=512,
        segment_encoder_type="transformer",
        output_attentions=True,
    )

    # model_type -> (hierarchical config class, hierarchical model class)
    hierarchical_classes = {
        'bert': (HierBertConfig, HierBertForSequenceClassification),
        'roberta': (HierRobertaConfig, HierRobertaForSequenceClassification),
        'xlm-roberta': (HierXLMRobertaConfig, HierXLMRobertaForSequenceClassification),
        'camembert': (HierCamembertConfig, HierCamembertForSequenceClassification),
    }
    try:
        config_class, model_class = hierarchical_classes[config.model_type]
    except KeyError:
        raise ValueError(
            f"No hierarchical model available for model type {config.model_type!r}; "
            f"supported types: {sorted(hierarchical_classes)}"
        ) from None
    config = config_class(**config.to_dict())

    return model_class.from_pretrained(
        checkpoint,
        from_tf=".ckpt" in checkpoint,
        config=config,
        cache_dir=None,
        revision='main',
        use_auth_token=None,
    )



def load_model(model, folder, map_location='cpu'):
    """Load weights from ``<folder>/model.bin`` into ``model`` in place.

    Args:
        model: module whose parameters match the stored state dict.
        folder: directory expected to contain ``model.bin``.
        map_location: device the checkpoint tensors are loaded onto before
            being copied into the model. This function does not move the
            model, and ``load_state_dict`` copies values into the existing
            parameters, so ``'cpu'`` gives the same result as the previously
            hard-coded ``'cuda:1'`` without requiring a second GPU.

    Returns:
        None. If the checkpoint file does not exist, a warning is issued and
        the model keeps its current weights.
    """
    import warnings

    model_path = Path(folder) / 'model.bin'
    if not model_path.exists():
        warnings.warn(f"No checkpoint found at {model_path}; the model keeps its current weights.")
        return
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=map_location))



In [5]:
# Build the classifier (its config requests attention outputs, see baseline()) and load the
# teacher weights from 'sjp/teacher_model/model.bin' if that file exists.
model = baseline()
load_model(model, 'sjp/teacher_model')
# NOTE(review): `return_token_type_ids=True` is passed to from_pretrained here; presumably it is
# meant as a default for later tokenizer calls - confirm that it has this effect.
tokenizer = AutoTokenizer.from_pretrained(
        'xlm-roberta-base',
        do_lower_case=False,
        cache_dir=None,
        use_fast=True,
        revision='main',
        use_auth_token=None,
        return_token_type_ids=True
    )

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing HierXLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing HierXLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HierXLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HierXLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['

In [6]:
# Only the `roberta` sub-module of the classifier is visualised (presumably the plain encoder
# without the hierarchical classification layers - TODO confirm).
bert = model.roberta
# NOTE(review): this raises the tokenizer's length limit only; confirm the encoder itself
# accepts sequences of that length before feeding long texts.
tokenizer.model_max_length = 2048

# Visualize Bert

In [None]:
from bertviz import model_view
from transformers import utils


# Silence transformers log messages below error level so they do not clutter the cell output.
utils.logging.set_verbosity_error()  


# A single example sentence (German) whose attention pattern is displayed.
input_text = "A.- Der 1945 geborene S._ meldete sich am 20. Januar 1997 bei der Invalidenversicherung wegen Schmerzen im Bereich des Rückens, der Hüfte und des rechten Beins zum Leistungsbezug an."
inputs = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize input text
outputs = bert(inputs)  # Run model
# Assumes the attention weights are the last element of the model output, which relies on
# output_attentions=True in the config - TODO confirm against the encoder's output layout.
attention = outputs[-1]  # Retrieve attention from model outputs
tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # Convert input ids to token strings
model_view(attention, tokens)  # Display model view