# Integrated Gradients (IG) calculation & attributions for hallucinations

Based on this PyTorch forum thread: https://discuss.pytorch.org/t/integrated-gradients-and-text-generation/144209, plus the answer from Jakob.

## Setup

### Install

In [None]:
%pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: captum
Successfully installed captum-0.7.0


In [None]:
import json
import os
import random
import sys

In [None]:
import pandas as pd
import numpy as np

import torch
from captum.attr import IntegratedGradients
from torch.nn.functional import softmax

import huggingface_hub
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5ForConditionalGeneration

### Data Loading

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive,
# which is where the data paths used further down point to.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive that the SHROOM JSON files are read from.
data_dir = "/content/drive/MyDrive/SHROOM/data"

In [None]:
# Read the model-aware train and dev splits into plain Python objects.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open(f"{data_dir}/SHROOM_unlabeled-training-data-v2/train.model-aware.v2.json", encoding="utf-8") as f:
  train_data = json.load(f)

with open(f"{data_dir}/SHROOM_dev-v2/val.model-aware.v2.json", encoding="utf-8") as f:
  dev_data = json.load(f)

In [None]:
def prep_df(json_data, task="DM"):
  """Turn a list of SHROOM records into a DataFrame restricted to one task.

  As a quick sanity check, the first record, its keys and the distinct
  values of the ``task`` column are printed before filtering.

  Args:
    json_data: Non-empty list of record dicts; every record is expected to
      carry a ``task`` key.
    task: Value of the ``task`` column to keep. Defaults to ``"DM"``, which
      is what this function filtered on before the parameter existed.

  Returns:
    A DataFrame holding only the rows whose ``task`` equals ``task``. The
    index is renumbered from 0 and the pre-filter row position is kept in
    an ``index`` column.
  """
  print(json_data[0])
  print(json_data[0].keys())
  _df = pd.DataFrame(json_data)
  print(_df["task"].unique())
  _df = _df[_df["task"] == task]
  # reset_index() without drop=True deliberately keeps the original row
  # position as a regular "index" column.
  _df = _df.reset_index()
  return _df

In [None]:
# Build the per-split frames; prep_df keeps only the 'DM' task rows and
# prints a short preview of each input while doing so.
train_df = prep_df(train_data)
val_df = prep_df(dev_data)

{'hyp': 'Of or pertaining to the language of a particular area , or to a particular', 'tgt': 'Of or pertaining to everyday language , as opposed to standard , literary , liturgical , or scientific idiom .', 'src': 'There are blacktips , silvertips , bronze whalers , black whalers , spinner sharks , and bignose sharks . these of course are vernacular names , but this is one case where the scientific nomenclature does not clarify the species , since it is now being revised . What is the meaning of vernacular ?', 'ref': 'tgt', 'task': 'DM', 'model': 'ltg/flan-t5-definition-en-base'}
dict_keys(['hyp', 'tgt', 'src', 'ref', 'task', 'model'])
['DM' 'PG' 'MT']
{'hyp': 'A sloping top .', 'ref': 'tgt', 'src': 'The sides of the casket were covered with heavy black broadcloth , with velvet caps , presenting a deep contrast to the rich surmountings . What is the meaning of surmounting ?', 'tgt': 'A decorative feature that sits on top of something .', 'model': 'ltg/flan-t5-definition-en-base', 'task

### Model loading

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from captum.attr import IntegratedGradients

# Single place to change the checkpoint; tokenizer and model must match.
MODEL_NAME = "ltg/flan-t5-definition-en-base"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
# Switch to evaluation mode (turns dropout off). The trailing `;` keeps the
# notebook from echoing the full module tree as cell output.
model.eval();

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

## Calculate IGs

In [None]:

def forward_with_last_token(encoder_input_ids, decoder_input_ids, attention_mask):
    """Return the logits of the last decoder position, starting from token ids.

    Used for greedy decoding only. Integer token ids cannot be interpolated,
    so this function must not be handed to Integrated Gradients directly.
    """
    outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, attention_mask=attention_mask)
    return outputs.logits[:, -1, :]


def _forward_from_embeddings(encoder_inputs_embeds, decoder_input_ids, attention_mask):
    """Return the logits of the last decoder position, starting from encoder embeddings.

    Integrated Gradients interpolates between a baseline and the input, which
    only makes sense in the continuous embedding space. Feeding interpolated
    (float) token ids to the embedding layer is what raised
    "Expected tensor for argument #1 'indices' ... Long, Int".
    """
    outputs = model(inputs_embeds=encoder_inputs_embeds, decoder_input_ids=decoder_input_ids, attention_mask=attention_mask)
    return outputs.logits[:, -1, :]


ig = IntegratedGradients(_forward_from_embeddings)

# example
data = {
    'src': 'The sides of the casket were covered with heavy black broadcloth, ... What is the meaning of surmounting ?'
}

encoder_input_ids = tokenizer.encode(data['src'], return_tensors='pt').to(device).long()
decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]], dtype=torch.long).to(device)

# One unpadded sequence: every source token must be visible to the encoder.
# (An all-zero mask would hide the whole input from attention.)
dummy_attention_mask = torch.ones_like(encoder_input_ids).long().to(device)

# Attribution is done over the encoder input embeddings. The baseline is the
# embedding of an all-pad sequence of the same length. Both are detached so
# that Captum receives plain leaf tensors.
embedding_layer = model.get_input_embeddings()
with torch.no_grad():
    encoder_inputs_embeds = embedding_layer(encoder_input_ids)
    baseline_inputs_embeds = embedding_layer(
        torch.full_like(encoder_input_ids, tokenizer.pad_token_id)
    )

MAX_RATIONALE_LEN = 30
rels_expl_sum = None

# attribution loop: greedy-decode one token per step and attribute it
for _ in range(MAX_RATIONALE_LEN):
    # Greedy choice of the next token; no gradients needed for this pass.
    with torch.no_grad():
        current_logits = forward_with_last_token(encoder_input_ids, decoder_input_ids, dummy_attention_mask)
    current_token_idx = torch.argmax(current_logits, dim=-1)

    # IG for the chosen token w.r.t. the encoder embeddings. The decoder
    # prefix and the mask are passed through unchanged as extra forward
    # arguments (Captum repeats them for every interpolation step).
    attributions = ig.attribute(
        inputs=encoder_inputs_embeds,
        baselines=baseline_inputs_embeds,
        additional_forward_args=(decoder_input_ids, dummy_attention_mask),
        target=current_token_idx.item(),
    )

    # Collapse the embedding dimension to one score per source token and
    # accumulate over generated tokens.
    token_attributions = attributions.sum(dim=-1).detach()
    if rels_expl_sum is None:
        rels_expl_sum = token_attributions
    else:
        rels_expl_sum += token_attributions

    # check for end of sequence token
    if current_token_idx.item() == tokenizer.eos_token_id:
        break

    # update decoder_input_ids for the next iteration
    decoder_input_ids = torch.cat([decoder_input_ids, current_token_idx.unsqueeze(0)], dim=-1)


# Scale into [-1, 1] by the largest magnitude. Dividing by the signed max
# would flip signs when all scores are negative and divide by zero when the
# max is 0.
normalized_attributions = rels_expl_sum / rels_expl_sum.abs().max().clamp_min(1e-12)

print("Normalized Attributions:", normalized_attributions)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)