# IG Calc & attributions for Hallucinations

DeRose et al.

In [1]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
# !export CUDA_LAUNCH_BLOCKING=1
# !export TORCH_USE_CUDA_DSA=1

In [3]:
# set task and run notebook
possible_tasks = ['DM' 'PG' 'MT']
tasks_done = ["DM", "PG"]
task = "MT"

## Setup

### Install

In [4]:
%pip install captum



In [5]:
import json
import os
import random
import sys

In [6]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
from captum.attr import IntegratedGradients
from torch.nn.functional import softmax

import huggingface_hub
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

### Data Loading

In [7]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
data_dir = "/content/drive/MyDrive/SHROOM/data"

In [9]:
with open(f"{data_dir}/SHROOM_unlabeled-training-data-v2/train.model-aware.v2.json") as f:
  train_data = json.load(f)

with open(f"{data_dir}/SHROOM_dev-v2/val.model-aware.v2.json") as f:
  dev_data = json.load(f)

with open(f"{data_dir}/SHROOM_test-unlabeled/test.model-aware.json") as f:
  test_data = json.load(f)

with open(f"{data_dir}/SHROOM_dev-v2/val.model-agnostic.json") as f:
  agnostic_data = json.load(f)

In [10]:
def prep_df(json_data):
  print(json_data[0])
  print(json_data[0].keys())
  _df = pd.DataFrame(json_data)
  print(_df.task.unique())
  _df = _df.query(f"task == '{task}'")
  # _df = _df.query("task == 'PG'")
  _df = _df.reset_index()
  return _df

In [11]:
train_df = prep_df(train_data)
val_df = prep_df(dev_data)
test_df = prep_df(train_data)
agnostic_df = prep_df(agnostic_data)

{'hyp': 'Of or pertaining to the language of a particular area , or to a particular', 'tgt': 'Of or pertaining to everyday language , as opposed to standard , literary , liturgical , or scientific idiom .', 'src': 'There are blacktips , silvertips , bronze whalers , black whalers , spinner sharks , and bignose sharks . these of course are vernacular names , but this is one case where the scientific nomenclature does not clarify the species , since it is now being revised . What is the meaning of vernacular ?', 'ref': 'tgt', 'task': 'DM', 'model': 'ltg/flan-t5-definition-en-base'}
dict_keys(['hyp', 'tgt', 'src', 'ref', 'task', 'model'])
['DM' 'PG' 'MT']
{'hyp': 'A sloping top .', 'ref': 'tgt', 'src': 'The sides of the casket were covered with heavy black broadcloth , with velvet caps , presenting a deep contrast to the rich surmountings . What is the meaning of surmounting ?', 'tgt': 'A decorative feature that sits on top of something .', 'model': 'ltg/flan-t5-definition-en-base', 'task

## Model loading

In [14]:
import torch
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig,
    T5Config, T5ForConditionalGeneration,
    PegasusConfig, PegasusTokenizer, PegasusForConditionalGeneration
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

In [15]:
torch.cuda.empty_cache()

match task:
  case "DM":
    model_config = T5Config.from_pretrained("ltg/flan-t5-definition-en-base", output_attentions=True)
    model = T5ForConditionalGeneration.from_pretrained("ltg/flan-t5-definition-en-base", config=model_config).to(device)
    tokenizer = AutoTokenizer.from_pretrained("ltg/flan-t5-definition-en-base")
  case "PG":
    model_config = PegasusConfig.from_pretrained("tuner007/pegasus_paraphrase", output_attentions=True)
    model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase", config=model_config).to(device)
    #model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase").to(device)
    tokenizer = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
  case "MT":
    model_config = AutoConfig.from_pretrained("facebook/nllb-200-distilled-600M", output_attentions=True)
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", config=model_config).to(device)
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

## Attention flow

This is a first check if the calc works

In [16]:
from transformers import AutoTokenizer
from transformers import T5Config, T5ForConditionalGeneration

input_text = "The sides of the casket were covered with heavy black broadcloth, ... What is the meaning of surmounting ?"
inputs = tokenizer.encode_plus(input_text, return_tensors='pt', add_special_tokens=True)
encoder_input_ids = inputs['input_ids'].to(device)

decoder_input_text = "The meaning of surmounting is"
decoder_inputs = tokenizer.encode_plus(decoder_input_text, return_tensors='pt', add_special_tokens=True)
decoder_input_ids = decoder_inputs['input_ids'].to(device)

attention_mask = inputs['attention_mask'].to(device)

In [17]:
model.eval()
outputs = model(
    input_ids=encoder_input_ids,
    decoder_input_ids=decoder_input_ids,
    attention_mask=attention_mask
)

In [18]:
encoder_attention_layers = outputs.encoder_attentions
decoder_attention_layers = outputs.decoder_attentions
cross_attention_layers = outputs.cross_attentions

Compute Influence Scores for the Last Layer

In [19]:
last_layer_attention = decoder_attention_layers[-1]
token_influence_last_layer = last_layer_attention.sum(dim=1).mean(dim=0)

Propagate Influence Scores Backward:
Iteratively propagate scores backward through decoder layers, then through cross-attention to the encoder, and finally through encoder layers.

In [20]:
# Start with decoder layers
decoder_influences = [token_influence_last_layer]
for layer_attention in reversed(decoder_attention_layers[:-1]):
    influence = layer_attention.sum(dim=1).mean(dim=0) @ decoder_influences[-1]
    decoder_influences.append(influence)
decoder_influences = decoder_influences[::-1]

# prop. from decoder to encoder through cross-attention
last_decoder_influence = decoder_influences[-1]
cross_attention = cross_attention_layers[-1]
encoder_influence_from_decoder = cross_attention.sum(dim=1).mean(dim=0).transpose(-1, -2) @ last_decoder_influence

# continue with encoder layers
encoder_influences = [encoder_influence_from_decoder]
for layer_attention in reversed(encoder_attention_layers):
    influence = layer_attention.sum(dim=1).mean(dim=0) @ encoder_influences[-1]
    encoder_influences.append(influence)
encoder_influences = encoder_influences[::-1]


Apply Exponential Decay

In [21]:
decay_factor = 0.9
for i, influence_scores in enumerate(decoder_influences + encoder_influences):
    (decoder_influences + encoder_influences)[i] = (decay_factor ** i) * influence_scores


Aggregate and Normalize the Scores

In [22]:
# agg encoder influences
aggregated_influence_encoder = torch.stack(encoder_influences).sum(dim=0)
normalized_influence_encoder = (aggregated_influence_encoder / aggregated_influence_encoder.sum())[-1]

# agg decoder influences
aggregated_influence_decoder = torch.stack(decoder_influences).sum(dim=0)
normalized_influence_decoder = (aggregated_influence_decoder / aggregated_influence_decoder.sum())

In [23]:
normalized_influence_decoder

tensor([[1.1111e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0879e-01, 2.3259e-03, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0593e-01, 4.4717e-03, 7.0542e-04, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0369e-01, 5.3986e-03, 2.0180e-03, 2.9125e-06, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.0191e-01, 5.5545e-03, 3.6057e-03, 3.8207e-05, 7.0281e-06, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.8723e-02, 6.6751e-03, 5.5473e-03, 1.3329e-04, 3.2475e-05, 3.3770e-08,
         0.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.5104e-02, 8.2017e-03, 7.3994e-03, 3.1316e-04, 9.2433e-05, 5.8566e-07,
         4.2497e-09, 0.0000e+00, 0.0000e+00],
        [9.9748e-02, 6.2210e-03, 4.7090e-03, 2.6427e-04, 1.1589e-04, 1.8544e-05,
         1.0872e-05, 2.3922e-0

In [24]:
del decoder_input_ids, encoder_input_ids, attention_mask, outputs
torch.cuda.empty_cache()

## Building a classifier

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.decomposition import PCA
from torch.utils.data import DataLoader, TensorDataset, Dataset

In [26]:
class InfluenceDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=512):
        self.dataframe = dataframe
        self.dataframe.reset_index(inplace=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        try:
            src_encoding = self.tokenizer.encode_plus(
                row['src'],
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
            hyp_encoding = self.tokenizer.encode_plus(
                row['hyp'],
                max_length=self.max_len,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
            print(f"src: {row['src']}, hyp: {row['hyp']}")

            # default values in case of na => tensor of zeros, with shape (1, max_len)
            default_value = torch.zeros((1, self.max_len), dtype=torch.long)

            src_encoding = {'input_ids': default_value, 'attention_mask': default_value}
            hyp_encoding = {'input_ids': default_value, 'attention_mask': default_value}


        if "label" in row:
          label = 1 if row['label'] == "Hallucination" else 0
        # no label given (can be discared as not used for training but just for inference)
        else:
          label = "x"
        return {
            'encoder_input_ids': src_encoding['input_ids'].squeeze(),
            'decoder_input_ids': hyp_encoding['input_ids'].squeeze(),
            'attention_mask': src_encoding['attention_mask'].squeeze(),
            "label": label
        }


In [27]:
def get_influences(df, max_len=256):
  _df = df.copy()

  if max_len == "auto":
    max_len = len(max(list(_df.src.str.split(" "))))
    print("base max_len = ", max_len)
    max_len = int(max_len + 0.15 * max_len)+1 # 15% extra for tokenization puffer

  print("max_len =", max_len)

  dataset = InfluenceDataset(_df, tokenizer, max_len=max_len)
  # print(dataset[0])
  # adjust batch_size dep on the GPU
  dataloader = DataLoader(dataset, batch_size=8, shuffle=False)
  # print(dataloader)

  batched_influences = []
  all_influences = []
  labels = []


  # model.to(device)
  model.eval()
  curr_idx = 0
  for batch in tqdm(dataloader):

      # print(curr_idx)
      # curr_idx += 1

      labels.extend(np.array(batch['label']))

      with torch.no_grad():
        encoder_input_ids = batch['encoder_input_ids'].to(device)
        decoder_input_ids = batch['decoder_input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(
            input_ids=encoder_input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            # output_attentions=True
        )

      # print(batch['encoder_input_ids'].shape)

      # print(f"Shape of encoder_attention_layers: {len(outputs.encoder_attentions)}")
      # print(f"Shape of decoder_attention_layers: {len(outputs.decoder_attentions)}")
      # print(f"Shape of cross_attention_layers: {len(outputs.cross_attentions)}")

      encoder_attention_layers = outputs.encoder_attentions
      decoder_attention_layers = outputs.decoder_attentions
      cross_attention_layers = outputs.cross_attentions
      last_layer_attention = decoder_attention_layers[-1]
      token_influence_last_layer = last_layer_attention.sum(dim=1).mean(dim=0)

      # start with decoder layers
      decoder_influences = [token_influence_last_layer]
      for layer_attention in reversed(decoder_attention_layers[:-1]):
          influence = layer_attention.sum(dim=1).mean(dim=0) @ decoder_influences[-1]
          decoder_influences.append(influence)
      decoder_influences = decoder_influences[::-1]

      # propagate from decoder to encoder through cross-attention
      last_decoder_influence = decoder_influences[-1]
      cross_attention = cross_attention_layers[-1]
      encoder_influence_from_decoder = cross_attention.sum(dim=1).mean(dim=0).transpose(-1, -2) @ last_decoder_influence

      # continue with encoder layers
      encoder_influences = [encoder_influence_from_decoder]
      for layer_attention in reversed(encoder_attention_layers):
          influence = layer_attention.sum(dim=1).mean(dim=0) @ encoder_influences[-1]
          encoder_influences.append(influence)
      encoder_influences = encoder_influences[::-1]

      # decay and normalize influences for the entire batch
      decay_factor = 0.85
      for i, influence_scores in enumerate(decoder_influences + encoder_influences):
          (decoder_influences + encoder_influences)[i] = (decay_factor ** i) * influence_scores

      # print(batch['encoder_input_ids'].shape)

      # print("----")
      for idx in range(batch['encoder_input_ids'].shape[0]):
          # comp aggregated influence for the idx-th instance in the batch
          aggregated_influence_decoder = torch.stack([dec_inf[idx] for dec_inf in decoder_influences]).sum(dim=0)
          normalized_influence_decoder = (aggregated_influence_decoder / aggregated_influence_decoder.sum())

          # flatten and collect the influences for the idx-th instance in the batch
          influences_i = normalized_influence_decoder.view(-1)
          all_influences.append(influences_i.cpu().detach().numpy())



      del encoder_input_ids, decoder_input_ids, attention_mask, outputs  # Free up memory
      torch.cuda.empty_cache()

  influences_array = np.stack(all_influences)
  labels_array = np.array(labels)

  return influences_array, labels_array

In [28]:
X, y = get_influences(agnostic_df, max_len = 8)#"auto")

max_len = 8


100%|██████████| 24/24 [00:00<00:00, 25.64it/s]


In [29]:
print(X.shape)
print(y.shape)

(187, 8)
(187,)


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1160)

In [31]:
DO_PCA = True
PCA_COMPONENTS = 6
GRID_SEARCH = False

In [32]:
if DO_PCA:
  pca = PCA(n_components=PCA_COMPONENTS)
  X_train_ready = pca.fit_transform(X_train)
  X_test_ready = pca.transform(X_test)

else:
  X_train_ready = X_train.copy()
  X_test_ready = X_train.copy()

clf = SVC(kernel='rbf', random_state=1160)

if GRID_SEARCH:
  param_grid = {
      'C': [0.001, 0.008, 0.1, 1, 10, 100],
      'gamma': [0.0001, 0.001, 0.003, 0.2, 0.5, 0.75, 1]
  }

  grid_search = GridSearchCV(clf, param_grid, cv=4, scoring='accuracy', verbose=2, n_jobs=-1)
  grid_search.fit(X_train_ready, y_train)
  best_clf = grid_search.best_estimator_
  clf = best_clf

clf.fit(X_train_ready, y_train)
y_pred = clf.predict(X_test_ready)

# Evaluation
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# print(classification_report(y_test, y_pred))
# print(f'Best parameters: {grid_search.best_params_}')

Accuracy: 0.5789473684210527


In [44]:
def predict_for_set(df):
  _df = df.copy()
  # ['DM' 'PG' 'MT']
  match task:
    case "DM":
      max_len = "auto"
    case "PG":
      max_len = 8
    case "MT":
      max_len = 8

  X_values, y_values = get_influences(_df, max_len = max_len)
  pca = PCA(n_components=PCA_COMPONENTS)
  X_values_pca = pca.fit_transform(X_values)
  preds = clf.predict(X_values_pca)
  _df["class"] = list(map(lambda x: "Hallucination" if x == 1 else "Not Hallucination", preds))
  return _df

In [45]:
val_df_preds = predict_for_set(val_df)
val_df_preds.head(2)
val_df_preds.to_csv(f"{task}_validation_results.csv", index=False)

max_len = 8


100%|██████████| 24/24 [00:02<00:00, 10.58it/s]


In [46]:
accuracy_score(val_df_preds["label"], val_df_preds["class"])

0.5159574468085106

In [47]:
test_df.shape

(10000, 7)

In [48]:
test_df_preds = predict_for_set(
    test_df
)
print(test_df_preds.head(2))
test_df_preds.to_csv(
    f"{task}_test_results.csv", index=False
)

max_len = 8


100%|██████████| 1250/1250 [00:57<00:00, 21.59it/s]


   index                                                hyp  \
0  20000  These plants get their food from the sun throu...   
1  20001  Only the Stanley mining bank around the FIC we...   

                                                 tgt  \
0  Plants make their food from the sun by photosy...   
1  Money can be exchanged at the only bank in the...   

                                                 src     ref task  \
0  Үсемлекләр азыкны фотосинтез юлы белән кояштан...  either   MT   
1  Ilé ìfowópamọ́sí tó wà ní erékùsù ní Stanley k...  either   MT   

                              model              class  
0  facebook/nllb-200-distilled-600M  Not Hallucination  
1  facebook/nllb-200-distilled-600M  Not Hallucination  
