# All map comparison

In this notebook, we make a script that generate HTML comparing the attention map for each data instance, given a folder.

## Folder setup

We define the folder set as following:

```
<root>
├── ProjectA
│   ├── A_map.json
│   ├── B_map.json
│   ├── C_map.json
│   └── ...
├── ProjectB
└── ...
```

We want to sample different heatmaps in ProjectA into ProjectA/html. Each output html file will have the file name **<instance_id>.html**

We assume that annotation map is found inside of attention_map from models


## Setting up

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.display import display, HTML
import sys
import os
from os import path

sys.path.append("./../src")

In [2]:
from modules.logger import init_logging
from modules.logger import log

init_logging(color=True)

In [3]:
!nvidia-smi

Thu Oct  5 19:08:30 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 108...  On   | 00000000:04:00.0 Off |                  N/A |
| 23%   22C    P8     8W / 250W |      1MiB / 11178MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

## Parameters

In [4]:
import platform

# Define root folder based on current node (local or server)
node = platform.node()
log.info(f'Current node: {node}')
if node == 'MAC-C02D80HRMD6':
    ROOT = '/Users/dunguyen/Developer/server_backup/historic/2023-06-05'
else:
    ROOT = '/home/dunguyen/RUNS'
    
# ==== Choose dataset ====    
DATASET = 'esnli'
# ========================
ROOT = ROOT + '/qualitative_result'
PROJECT = f'benchmark_explainers_{DATASET}'
MODEL_NAME = 'lstm_attention.run=0_lstm=1'

# Define all paths
MAPS = [
    { 'file_suffix': 'attention_map', 'display': 'Annotation Maps', 'column': 'a_true'},
    { 'file_suffix': 'attention_map', 'display': 'Attention Maps', 'column': 'a_hat'},
    { 'file_suffix': 'lime_map', 'display': 'LIME Maps', 'column': 'a_lime'},
    { 'file_suffix': 'grad_map', 'display': 'Gradient-based Maps', 'column': 'a_grad'},
    { 'file_suffix': 'shap_map', 'display': 'SHAP Maps', 'column': 'a_shap'},
]

# update file path
for m in MAPS:
    m['fpath'] = path.join(ROOT, PROJECT, MODEL_NAME + '.' + m['file_suffix'] + '.json')

05-10-2023 19:08:31 | [34m    INFO[0m [1m [4m 426039642.py:<cell line: 5>:5 [0m [34mCurrent node: grele-3.nancy.grid5000.fr[0m


In [5]:
import pandas as pd
import numpy as np

In [6]:
# Clean padding tokens in attention map files
df_attention = pd.read_json(MAPS[1]['fpath'])

def clean_padding(row):
    a_hat = np.array(row['a_hat'])
    padding_mask = np.array(row['padding_mask'])
    a_true = np.array(row['a_true'])
    a_heu = np.array(row['heuristic'])
    tokens = np.array(row['tokens.form'])
    a_hat_clean = a_hat[~padding_mask]
    a_true_clean = a_true[~padding_mask]
    row['a_hat'] = a_hat_clean.tolist()
    row['a_true'] = a_true_clean.tolist()
    row['heuristic'] = a_heu[~padding_mask].tolist()
    # row['tokens.form'] = tokens[~padding_mask].tolist()
    return row

if 'padding_mask' in df_attention.columns:
    df_attention = df_attention.apply(clean_padding, axis=1)
    df_attention = df_attention.drop(columns=['padding_mask'])
    df_attention.to_json(MAPS[1]['fpath'])
    
# Replace label 
if 'label_hat' not in df_attention.columns:
    label_itos = dict()
    if DATASET == 'hatexplain': 
        from data.hatexplain.dataset import HateXPlain
        label_itos = HateXPlain.LABEL_ITOS
    elif DATASET == 'yelphat':
        from data.yelp_hat.dataset import YelpHat
        label_itos = YelpHat.LABEL_ITOS
    elif DATASET == 'esnli':
        from data.esnli.dataset import ESNLI
        label_itos = ESNLI.LABEL_ITOS
    else:
        raise ValueError('Dataset not supported')
    
    #df_attention['label_hat'] = df_attention['y_hat'].apply(lambda x: label_itos[x])
    #df_attention['label_true'] = df_attention['y_true'].apply(lambda x: label_itos[x])
    df_attention['label_hat'] = df_attention['y_hat']
    df_attention['label_true'] = df_attention['y_hat']
    df_attention.to_json(MAPS[1]['fpath'])

In [7]:
# Treating eSNLI: fusion all together
def clean_padding_nli(row):
    """Clean padding tokens in attention map files"""
    for side in ['premise', 'hypothesis']:
        padding_mask = np.array(row['padding_mask.'+side])
        a_true = np.array(row['a_true.'+side])
        a_hat = np.array(row['a_hat.'+side])
        row['a_true.'+side] = a_true[~padding_mask].tolist()
        row['a_hat.'+side] = a_hat[~padding_mask].tolist()
    return row

if DATASET == 'esnli':
    
    # Clean padding mask in premise and hypothesis
    if 'padding_mask.premise' in df_attention.columns:
        log.debug(f'Cleaning padding tokens for eSNLI')
        df_attention = df_attention.apply(clean_padding_nli, axis=1)
        df_attention = df_attention.drop(columns=['padding_mask.premise', 'padding_mask.hypothesis'])
        df_attention.to_json(MAPS[1]['fpath'])
    
    # Normalize weights if this is not done in attention map
    max_vector = df_attention['a_hat.premise'].apply(lambda x: max(x))
    if (max_vector < 1).any():
        from modules.utils import rescale
        log.debug(f'Normalize attention map for eSNLI')
        df_attention['a_hat.premise'] = df_attention['a_hat.premise'].apply(lambda x: rescale(x).tolist())
        df_attention['a_hat.hypothesis'] = df_attention['a_hat.hypothesis'].apply(lambda x: rescale(x).tolist())
        df_attention.to_json(MAPS[1]['fpath'])
        
    # concatenate tokens
    if 'tokens.form' not in df_attention.columns:  
        log.debug(f'Concat tokens for eSNLI')
        # TODO: change back to tokens.form once this is fixed
        df_attention['tokens.form'] = df_attention.apply(lambda row: ['<b>Premise</b>:'] + row['tokens.norm.premise'] + ['<br/><b>Hypothesis</b>:'] + row['tokens.norm.hypothesis'], axis=1)
        df_attention = df_attention.drop(columns=['tokens.norm.premise', 'tokens.norm.hypothesis'])
        df_attention.to_json(MAPS[1]['fpath'])    
    
df_attention

Unnamed: 0,y_hat,a_hat.premise,a_hat.hypothesis,id,premise,hypothesis,label,explanation,highlight_premise,highlight_hypothesis,...,heuristic.premise,heuristic.hypothesis,tokens.ids.premise,tokens.ids.hypothesis,y_true,a_true.premise,a_true.hypothesis,label_hat,label_true,tokens.form
0,entailment,"[0.061332613200000004, 0.6565563679, 0.3939329...","[0.0, 0.0916936249, 0.2854360044, 0.0629455, 0...",4563544127.jpg#1r1e,An older man holding a sign for tattoos solici...,The man works with a tattoo business.,entailment,An older man is a man and he being holding a s...,An *older* *man* holding a sign for tattoos so...,The *man* works with a tattoo business.,...,"[-1.000000015e+30, -2.0663328171, -1.421733498...","[-1.000000015e+30, -1.0027236938, -1.072178840...","[20, 68, 7, 35, 2, 177, 39, 630, 11844, 564, 8...","[5, 7, 64, 12, 2, 630, 564, 3, 0, 0, 0, 0, 0, ...",entailment,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",entailment,entailment,"[<b>Premise</b>:, an, old, man, hold, a, sign,..."
1,neutral,"[0.0472730212, 0.0794806406, 0.0, 0.5041347146...","[0.0443991721, 0.3893270493, 0.6401420832, 1.0...",4755772625.jpg#4r1n,People looking at tall oriental art piece.,The new Chinese history exhibit is open at the...,neutral,Oriental art pieces can be found in so many pl...,People looking at tall oriental art piece.,The new Chinese history exhibit is open at the...,...,"[-1.4176145792, -1.1878677607, -1.000000015e+3...","[-1.000000015e+30, -1.7927821875, -1.954220891...","[14, 36, 17, 319, 1044, 309, 308, 3, 0, 0, 0, ...","[5, 355, 680, 3740, 1607, 4, 258, 17, 5, 856, ...",neutral,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",neutral,neutral,"[<b>Premise</b>:, people, look, at, tall, orie..."
2,entailment,"[0.4208866358, 1.0, 0.0369730406, 0.1549697667...","[0.2146061957, 0.3105322421, 1.0, 0.0, 0.94315...",4755772625.jpg#4r1e,People looking at tall oriental art piece.,People are looking at art,entailment,the art that the people are looking is a tall ...,*People* *looking* at *tall* *oriental* *art* ...,People are looking at *art*,...,"[-1.2582182884, -1.1879684925, -1.000000015e+3...","[-1.2146189213, -1.000000015e+30, -0.839180231...","[14, 36, 17, 319, 1044, 309, 308, 3, 0, 0, 0, ...","[14, 4, 36, 17, 309, 0, 0, 0, 0, 0, 0, 0, 0, 0...",entailment,"[1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0]",entailment,entailment,"[<b>Premise</b>:, people, look, at, tall, orie..."
3,contradiction,"[0.1599319279, 0.7968158722, 0.0015835182, 0.0...","[0.0609926395, 0.0, 0.4840745032, 0.4886478782...",4755772625.jpg#4r1c,People looking at tall oriental art piece.,People are running a marathon,contradiction,People cannot be running a marathon and lookin...,People *looking* at tall oriental art piece.,People are *running* a marathon,...,"[-0.9945734143, -1.4236390591, -1.000000015e+3...","[-0.35116553310000004, -1.000000015e+30, -1.56...","[14, 36, 17, 319, 1044, 309, 308, 3, 0, 0, 0, ...","[14, 4, 54, 2, 604, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",contradiction,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.0, 1.0, 0.0, 0.0]",contradiction,contradiction,"[<b>Premise</b>:, people, look, at, tall, orie..."
4,contradiction,"[0.057741958600000004, 0.7411455512, 0.1925581...","[0.052095841600000005, 0.6763522029, 0.3455749...",3005123298.jpg#0r1c,An Obama Biden supporter cheers for the Presid...,An Obama supporter is upset that the President...,contradiction,One is either an Obama supporter or one is an ...,An Obama Biden supporter *cheers* for the Pres...,An Obama supporter is *upset* that the Preside...,...,"[-1.000000015e+30, -1.000000015e+30, -1.000000...","[-1.000000015e+30, -1.000000015e+30, -1.546857...","[20, 3334, 1, 4105, 743, 39, 5, 7760, 7350, 9,...","[20, 3334, 4105, 4, 1766, 107, 5, 2991, 951, 2...",contradiction,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",contradiction,contradiction,"[<b>Premise</b>:, an, obama, biden, supporter,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9819,neutral,"[0.0412852503, 0.2441798896, 0.0, 0.0141727552...","[0.18250405790000002, 1.0, 0.0057042758, 0.0, ...",6126962700.jpg#4r1c,A rally in a different country.,A carnival in a different country.,contradiction,"Sentence 1 says the location is at a rally, wh...",A *rally* in a different country.,A *carnival* in a different country.,...,"[-1.000000015e+30, -2.4112033844000003, -1.000...","[-1.000000015e+30, -2.6194021702, -1.000000015...","[2, 1491, 6, 2, 550, 656, 3, 0, 0, 0, 0, 0, 0,...","[2, 884, 6, 2, 550, 656, 3, 0, 0, 0, 0, 0, 0, ...",contradiction,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",neutral,neutral,"[<b>Premise</b>:, a, rally, in, a, different, ..."
9820,entailment,"[0.0928477421, 0.6092538834, 0.0267547872, 0.0...","[0.062568672, 0.5261303782, 0.0, 0.0137268817,...",6126962700.jpg#4r1e,A rally in a different country.,A rally in a seperate country.,entailment,Different is a synonym for seperate.,A rally in a *different* country.,A rally in a *seperate* country.,...,"[-1.000000015e+30, -1.5680634975, -1.000000015...","[-1.000000015e+30, -1.4119501114, -1.000000015...","[2, 1491, 6, 2, 550, 656, 3, 0, 0, 0, 0, 0, 0,...","[2, 1491, 6, 2, 6485, 656, 3, 0, 0, 0, 0, 0, 0...",entailment,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]",entailment,entailment,"[<b>Premise</b>:, a, rally, in, a, different, ..."
9821,neutral,"[0.1597015411, 0.1067305058, 0.0543378256, 0.1...","[0.0122721102, 1.0, 0.0264848694, 0.0, 0.02731...",3502897880.jpg#0r1n,A man in a blue shirt is performing a skateboa...,A tall person in shirt,neutral,Some men aren’t necessarily tall.,A man in a blue shirt is performing a skateboa...,A *tall* person in shirt,...,"[-1.000000015e+30, -1.6318250895, -1.000000015...","[-1.000000015e+30, -2.1690359116, -0.360219568...","[2, 7, 6, 2, 34, 27, 4, 117, 2, 198, 263, 84, ...","[2, 319, 45, 6, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",neutral,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 0.0]",neutral,neutral,"[<b>Premise</b>:, a, man, in, a, blue, shirt, ..."
9822,entailment,"[0.4249371886, 0.3039977551, 0.102009601900000...","[0.051058452600000005, 0.1729544103, 0.0847867...",3502897880.jpg#0r1e,A man in a blue shirt is performing a skateboa...,A person in a shirt,entailment,The man is wearing a shirt in both sentences.,A *man* in a *blue* *shirt* is performing a sk...,A *person* in a *shirt*,...,"[-1.000000015e+30, -1.6260750294, -1.000000015...","[-1.000000015e+30, -0.2388564497, -1.000000015...","[2, 7, 6, 2, 34, 27, 4, 117, 2, 198, 263, 84, ...","[2, 45, 6, 2, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...",entailment,"[0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 1.0, 0.0, 0.0, 1.0]",entailment,entailment,"[<b>Premise</b>:, a, man, in, a, blue, shirt, ..."


In [8]:
# import and fusion into a single dataframe
map_data = None
for m in MAPS:
    # load data from json file
    df = pd.read_json(m['fpath'])
    df.set_index('id', inplace=True)
    
    column = m['column']
    
    # concat if this is esnli
    if DATASET == 'esnli':
        from modules.utils import rescale
        if (column != 'a_true') and (df[column+'.premise'].apply(lambda x: max(x)) != 1).any():
            df[column+'.premise'] = df[column+'.premise'].apply(lambda x: rescale(x).tolist())
            df[column+'.hypothesis'] = df[column+'.hypothesis'].apply(lambda x: rescale(x).tolist())
        df[column] = df.apply(lambda row: [0] + row[column +'.premise'] + [0] + row[column+'.hypothesis'], axis=1)
        df.drop(columns=[column +'.premise', column+'.hypothesis'], inplace=True)

    # the first dataframe will query the id and the tokens
    if map_data is None:
        map_data = df[['tokens.form', 'label_hat', 'label_true', 'y_hat', 'y_true']].copy()

    map_data = map_data.join(df[column])

map_data = map_data[(map_data['y_hat'] == map_data['y_true']) & (map_data['y_hat'] != 0)]
map_data

Unnamed: 0_level_0,tokens.form,label_hat,label_true,y_hat,y_true,a_true,a_hat,a_lime,a_grad,a_shap
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4563544127.jpg#1r1e,"[<b>Premise</b>:, an, old, man, hold, a, sign,...",entailment,entailment,entailment,entailment,"[0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0, 0.061332613200000004, 0.6565563679, 0.3939...","[0, 0.25276488065719604, 1.0, 0.0, 0.133986830...",,"[0, 0.5209540724754333, 0.7705860137939453, 0...."
4755772625.jpg#4r1n,"[<b>Premise</b>:, people, look, at, tall, orie...",neutral,neutral,neutral,neutral,"[0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0,...","[0, 0.0472730212, 0.0794806406, 0.0, 0.5041347...",,,"[0, 0.21172809600830078, 0.582747220993042, 0...."
4755772625.jpg#4r1e,"[<b>Premise</b>:, people, look, at, tall, orie...",entailment,entailment,entailment,entailment,"[0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0,...","[0, 0.4208866358, 1.0, 0.0369730406, 0.1549697...","[0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.7777280807...","[0, 0.4363981783, 1.0, 0.1477371007, 0.7395327...","[0, 0.2937150001525879, 0.2991650104522705, 0...."
4755772625.jpg#4r1c,"[<b>Premise</b>:, people, look, at, tall, orie...",contradiction,contradiction,contradiction,contradiction,"[0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0,...","[0, 0.1599319279, 0.7968158722, 0.0015835182, ...","[0, 0.5615438222885132, 1.0, 0.0, 0.0, 0.33970...","[0, 0.2257496864, 1.0, 0.035255041, 0.09796419...","[0, 0.6391984820365906, 1.0, 0.684261739253997..."
3005123298.jpg#0r1c,"[<b>Premise</b>:, an, obama, biden, supporter,...",contradiction,contradiction,contradiction,contradiction,"[0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0....","[0, 0.057741958600000004, 0.7411455512, 0.1925...","[0, 0.5260864496231079, 1.0, 0.210010454058647...","[0, 0.0874609947, 1.0, 0.31299108270000003, 0....","[0, 0.4279344379901886, 0.0, 0.433184742927551..."
...,...,...,...,...,...,...,...,...,...,...
6126962700.jpg#4r1n,"[<b>Premise</b>:, a, rally, in, a, different, ...",neutral,neutral,neutral,neutral,"[0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0.0,...","[0, 0.0681448057, 0.4375804961, 0.0, 0.0032877...",,,"[0, 0.3147812485694885, 0.6295944452285767, 0...."
6126962700.jpg#4r1e,"[<b>Premise</b>:, a, rally, in, a, different, ...",entailment,entailment,entailment,entailment,"[0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0, 0.0,...","[0, 0.0928477421, 0.6092538834, 0.0267547872, ...","[0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0, 0.0,...",,"[0, 0.0, 0.5554646849632263, 0.495340675115585..."
3502897880.jpg#0r1n,"[<b>Premise</b>:, a, man, in, a, blue, shirt, ...",neutral,neutral,neutral,neutral,"[0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0, 0.1597015411, 0.1067305058, 0.0543378256, ...",,,"[0, 0.5308428406715393, 1.0, 0.071584269404411..."
3502897880.jpg#0r1e,"[<b>Premise</b>:, a, man, in, a, blue, shirt, ...",entailment,entailment,entailment,entailment,"[0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0....","[0, 0.4249371886, 0.3039977551, 0.102009601900...","[0, 0.0, 1.0, 0.0, 0.9952212572097778, 0.0, 0....","[0, 0.1239211038, 0.24865697320000002, 0.13355...","[0, 0.18451674282550812, 0.8170640468597412, 0..."


In [15]:
def steep_sigmoid(x, s=10, p=2):
    x = np.array(x)
    x_normalized = 1 / (1 + np.exp(-s * (x - 0.5)))**p
    return x_normalized.tolist()

In [9]:
from tqdm.notebook import tqdm
from modules.utils import highlight
import shutil

# Remove the previous existing folder
html_dir = path.join(ROOT, PROJECT, '.html')
if os.path.exists(html_dir) and os.path.isdir(html_dir):
    log.info(f'Removing existing folder {html_dir}')
    shutil.rmtree(html_dir)

# Generate each comparison into a file:
for idx, row in tqdm(map_data.iterrows(), total=len(map_data)):
    
    # ignore if label is 0
    if row['y_true'] == 0: continue
    
    # ignore if row contains any NaN
    if row.isnull().sum() > 0: continue
    
    html = """
    <html>
    <head><style>
    table, th, td {
      border:solid black;
      border-collapse: collapse;
      padding: 0px 5px 0px 5px;
    }</style></head>
    <body>
    """
    html += '<table style="font-size:120%;" cellspacing=0>'
    html += f'<caption>Dataset: {DATASET} - Instance ID: {idx}</caption>'
    html += '<tr><th style="100px;">Explainer</th> <th style="500px;">Explanation</th> <th style="100px;">Predicted Label</th> <th style="100px;">True Label</th></tr>'
    
    # Display a row for each map
    for m in MAPS:
        html += '<tr>'
        
        # Display the explainer and its explanation
        c = m['column']
        map_name = m['display']
        # TODO check what if we change the value in gradient map:
        #if c == 'a_grad':
        #    row[c] = steep_sigmoid(row[c], s=5, p=2)
        map_viz = highlight(row['tokens.form'], row[c], normalize_weight=False)
        html+= f'<td style="text-align:right;"> {map_name} </td><td> {map_viz} </td>'
        
        # For the first row, display the spanning the label
        if c == 'a_true':
            row_span = len(MAPS)
            html +=f'<td rowspan="{row_span}" style="text-align:center"> {row["label_hat"]} </td>'
            html +=f'<td rowspan="{row_span}" style="text-align:center"> {row["label_true"]} </td>'
            
        html += '</tr>\n'
        
    html += '</table>'
    html += '</body></html>'

    fpath_html = path.join(html_dir, f'{idx}.html')
    os.makedirs(html_dir, exist_ok=True)
    with open(fpath_html, 'w') as f:
        f.write(html)

05-10-2023 19:09:17 | [34m    INFO[0m [1m [4m 3277692043.py:<cell line: 7>:8 [0m [34mRemoving existing folder /home/dunguyen/RUNS/qualitative_result/benchmark_explainers_esnli/.html[0m


  0%|          | 0/7970 [00:00<?, ?it/s]

# Modify dataset columns

In [7]:
import pandas as pd

# fname = './../.cache/dataset/esnli/test.pretransformed.parquet'

fname = './../../RUNS/dataset/esnli/test.pretransformed.parquet'
df = pd.read_parquet(fname)
df.head(10)

Unnamed: 0,id,premise,hypothesis,label,explanation,highlight_premise,highlight_hypothesis,tokens.norm.premise,tokens.norm.hypothesis,rationale.premise,rationale.hypothesis,heuristic.premise,heuristic.hypothesis
0,2677109430.jpg#1r1n,This church choir sings to the masses as they ...,The church has cracks in the ceiling.,neutral,Not all churches have cracks in the ceiling,This church choir sings to the masses as they ...,The church has *cracks* *in* *the* *ceiling.*,"[this, church, choir, sing, to, the, masse, as...","[the, church, have, crack, in, the, ceiling, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, True, True, True, ...","[-1.0000000150474662e+30, 3.064525842666626, 1...","[-1.0000000150474662e+30, 7.628961086273193, -..."
1,2677109430.jpg#1r1e,This church choir sings to the masses as they ...,The church is filled with song.,entailment,"""Filled with song"" is a rephrasing of the ""cho...",This church *choir* *sings* *to* *the* *masses...,The church is *filled* *with* *song.*,"[this, church, choir, sing, to, the, masse, as...","[the, church, be, fill, with, song, .]","[False, False, True, True, True, True, True, F...","[False, False, False, True, True, True, False]","[-1.0000000150474662e+30, 2.79181170463562, 2....","[-1.0000000150474662e+30, 7.628961086273193, -..."
2,2677109430.jpg#1r1c,This church choir sings to the masses as they ...,A choir singing at a baseball game.,contradiction,A choir sing some other songs other than book ...,This church choir sings to the *masses* as the...,A choir *singing* at a *baseball* *game.*,"[this, church, choir, sing, to, the, masse, as...","[a, choir, singing, at, a, baseball, game, .]","[False, False, False, False, False, False, Tru...","[False, False, True, False, False, True, True,...","[-1.0000000150474662e+30, 2.5598974227905273, ...","[-1.0000000150474662e+30, 6.388305187225342, 6..."
3,6160193920.jpg#4r1n,"A woman with a green headscarf, blue shirt and...",The woman is young.,neutral,the woman could've been old rather than young,"A woman with a green headscarf, blue shirt and...",The woman is *young.*,"[a, woman, with, a, green, headscarf, ,, blue,...","[the, woman, be, young, .]","[False, False, False, False, False, False, Fal...","[False, False, False, True, False]","[-1.0000000150474662e+30, 2.597653388977051, -...","[-1.0000000150474662e+30, 5.648240089416504, -..."
4,6160193920.jpg#4r1e,"A woman with a green headscarf, blue shirt and...",The woman is very happy.,entailment,a grin suggests hapiness.,"A woman with a green headscarf, blue shirt and...",The woman is very *happy.*,"[a, woman, with, a, green, headscarf, ,, blue,...","[the, woman, be, very, happy, .]","[False, False, False, False, False, False, Fal...","[False, False, False, False, True, False]","[-1.0000000150474662e+30, 2.784580707550049, -...","[-1.0000000150474662e+30, 5.648240089416504, -..."
5,6160193920.jpg#4r1c,"A woman with a green headscarf, blue shirt and...",The woman has been shot.,contradiction,There can be either a woman with a very big gr...,"A woman with a *green* headscarf, blue shirt a...",The woman has been *shot.*,"[a, woman, with, a, green, headscarf, ,, blue,...","[the, woman, have, be, shoot, .]","[False, False, False, False, True, False, Fals...","[False, False, False, False, True, False]","[-1.0000000150474662e+30, 2.6564526557922363, ...","[-1.0000000150474662e+30, 5.648240089416504, -..."
6,4791890474.jpg#3r1e,An old man with a package poses in front of an...,A man poses in front of an ad.,entailment,"The word "" ad "" is short for the word "" advert...",An old man with a package poses in front of an...,A man poses in front of an *ad.*,"[an, old, man, with, a, package, pose, in, fro...","[a, man, pose, in, front, of, an, ad, .]","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...","[-1.0000000150474662e+30, 2.9205048084259033, ...","[-1.0000000150474662e+30, 5.1345367431640625, ..."
7,4791890474.jpg#3r1n,An old man with a package poses in front of an...,A man poses in front of an ad for beer.,neutral,Not all advertisements are ad for beer.,An old man with a package poses in front of an...,A man poses in front of an ad for *beer.*,"[an, old, man, with, a, package, pose, in, fro...","[a, man, pose, in, front, of, an, ad, for, bee...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...","[-1.0000000150474662e+30, 3.5127861499786377, ...","[-1.0000000150474662e+30, 5.1345367431640625, ..."
8,4791890474.jpg#3r1c,An old man with a package poses in front of an...,A man walks by an ad.,contradiction,The man poses in front of the advertisement th...,An old *man* with a package *poses* *in* *fron...,A man *walks* *by* an *ad.*,"[an, old, man, with, a, package, pose, in, fro...","[a, man, walk, by, an, ad, .]","[False, False, True, False, False, False, True...","[False, False, True, True, False, True, False]","[-1.0000000150474662e+30, 2.2357261180877686, ...","[-1.0000000150474662e+30, 5.1345367431640625, ..."
9,6526219567.jpg#4r1n,A statue at a museum that no seems to be looki...,The statue is offensive and people are mad tha...,neutral,Not all statues are ignored because they are o...,A statue at a museum that no seems to be looki...,The statue is *offensive* and people are mad t...,"[a, statue, at, a, museum, that, no, seem, to,...","[the, statue, be, offensive, and, people, be, ...","[False, False, False, False, False, False, Fal...","[False, False, False, True, False, False, Fals...","[-1.0000000150474662e+30, 3.6017332077026367, ...","[-1.0000000150474662e+30, 3.75215744972229, -1..."


In [8]:
df.columns

Index(['id', 'premise', 'hypothesis', 'label', 'explanation',
       'highlight_premise', 'highlight_hypothesis', 'tokens.norm.premise',
       'tokens.norm.hypothesis', 'rationale.premise', 'rationale.hypothesis',
       'heuristic.premise', 'heuristic.hypothesis'],
      dtype='object')

In [9]:
from data.transforms import SpacyTokenizerTransform

import spacy
spacy_model = spacy.load('en_core_web_sm')
transform = SpacyTokenizerTransform(spacy_model)

df['tokens.form.premise'] = transform(df['premise'])

In [10]:
df['tokens.form.hypothesis'] = transform(df['hypothesis'])

In [11]:
df.to_parquet(fname, index=False)

In [53]:
from data.transforms import SpacyTokenizerTransform

import spacy
spacy_model = spacy.load('en_core_web_sm')
transform = SpacyTokenizerTransform(spacy_model)

In [54]:
df['tokens.form'] = transform(df['text'].tolist())

In [72]:
premise = df['premise'].tolist()
hypothesis = df['hypothesis'].tolist()
premise_toks = transform(premise)
hypothesis_toks = transform(hypothesis)

df['tokens.form.premise'] = premise_toks
df['tokens.form.hypothesis'] = hypothesis_toks

In [74]:
fname

'./../.cache/dataset/esnli/test.pretransformed.parquet'

In [7]:
import pandas as pd

# fname = './../.cache/dataset/esnli/test.pretransformed.parquet'

fname = './../../RUNS/dataset_/yelp-hat/yelp50.pretokenized_lower_lemma.parquet'
df = pd.read_parquet(fname)
df

Unnamed: 0,text,label,ham_html_0,human_label_0,ham_html_1,human_label_1,ham_html_2,human_label_2,id,ham_0,ham_1,ham_2,tokens.norm,tokens.form,ham,cam,sam,heuristic
0,Out in Twinsburg for work and wasn't expecting...,1,<span>Out</span> <span>in</span> <span>Twinsbu...,yes,<span>Out</span> <span>in</span> <span>Twinsbu...,yes,<span>Out</span> <span>in</span> <span>Twinsbu...,yes,ham_part1(50words)_1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[out, in, twinsburg, for, work, and, be, not, ...","[Out, in, Twinsburg, for, work, and, was, n't,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.00042654533821490776, 0..."
1,Very slow. Never been in the drive at any othe...,0,"<span class=""active"">Very</span> <span class=""...",no,"<span>Very</span> <span class=""active"">slow.</...",no,"<span>Very</span> <span class=""active"">slow.</...",no,ham_part1(50words)_2,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[very, slow, ., never, be, in, the, drive, at,...","[Very, slow, ., Never, been, in, the, drive, a...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0023104539153307505, 0.0, 0.0, 0.0, 0...."
2,"Food is good, but service terrible. They have ...",0,<span>Food</span> <span>is</span> <span class=...,idk,"<span>Food</span> <span>is</span> <span>good,<...",no,"<span>Food</span> <span>is</span> <span>good,<...",no,ham_part1(50words)_3,"[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[food, be, good, ,, but, service, terrible, .,...","[Food, is, good, ,, but, service, terrible, .,...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0.023922084384886078, 0.0, 0.0225002665908363..."
3,Stopped by on a Sunday around 11am after a tri...,1,<span>Stopped</span> <span>by</span> <span>on<...,yes,<span>Stopped</span> <span>by</span> <span>on<...,yes,<span>Stopped</span> <span>by</span> <span>on<...,yes,ham_part1(50words)_4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[stop, by, on, a, sunday, around, 11, am, afte...","[Stopped, by, on, a, Sunday, around, 11, am, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.00039099989336366543, 0.0, 0.0, 0.0, 0.0, 0..."
4,This place is horrible. They are very stingy w...,0,<span>This</span> <span>place</span> <span>is<...,no,<span>This</span> <span>place</span> <span>is<...,no,<span>This</span> <span>place</span> <span>is<...,no,ham_part1(50words)_5,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, ...","[this, place, be, horrible, ., they, be, very,...","[This, place, is, horrible, ., They, are, very...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, ...","[0.0, 0.012405360253083567, 0.0, 0.00312799914..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,Service and staff were very good. Topping sele...,1,"<span class=""active"">Service</span> <span>and<...",yes,<span>Service</span> <span>and</span> <span>st...,yes,<span>Service</span> <span>and</span> <span>st...,yes,ham_part1(50words)_296,"[1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, ...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[service, and, staff, be, very, good, ., toppi...","[Service, and, staff, were, very, good, ., Top...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, ...","[0.019869903671844453, 0.0, 0.0042654533821490..."
296,Love it! Eaten here over 300 times in the last...,1,"<span class=""active"">Love</span> <span>it!</sp...",yes,"<span class=""active"">Love</span> <span class=""...",yes,"<span class=""active"">Love</span> <span class=""...",yes,ham_part1(50words)_297,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, ...","[love, it, !, eat, here, over, 300, time, in, ...","[Love, it, !, Eaten, here, over, 300, times, i...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, ...","[0.008495361319446913, 0.0, 0.0, 0.00252372658..."
297,"According to my friend, this local bar type pl...",1,<span>According</span> <span>to</span> <span>m...,yes,<span>According</span> <span>to</span> <span>m...,yes,<span>According</span> <span>to</span> <span>m...,yes,ham_part1(50words)_298,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...","[accord, to, my, friend, ,, this, local, bar, ...","[According, to, my, friend, ,, this, local, ba...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, ...","[0.0, 0.0, 0.0, 0.0003554544485124231, 0.0, 0...."
298,I went here to get a snack before I went on th...,0,<span>I</span> <span>went</span> <span>here</s...,no,<span>I</span> <span>went</span> <span>here</s...,no,<span>I</span> <span>went</span> <span>here</s...,no,ham_part1(50words)_299,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[i, go, here, to, get, a, snack, before, i, go...","[I, went, here, to, get, a, snack, before, I, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.5545444851242..."


In [None]:
DATA_CACHE = '/home/dunguyen/RUNS/dataset_'

In [28]:
from data_module.yelp_hat_module import YelpHat50DM
yelphat_dm = YelpHat50DM(cache_path=DATA_CACHE, batch_size=16)
yelphat_dm.prepare_data()
yelphat_dm.setup()

04-10-2023 15:52:07 | [32;1m   DEBUG[0m [1m [4m dataset.py:download_format_dataset:82 [0m [32;1mCorrectly handle part7.csv[0m
04-10-2023 15:52:08 | [34m    INFO[0m [1m [4m dataset.py:download_format_dataset:110 [0m [34mSave yelp subset at: /home/dunguyen/RUNS/dataset_/yelp-hat/yelp200.parquet[0m
04-10-2023 15:52:08 | [34m    INFO[0m [1m [4m dataset.py:download_format_dataset:110 [0m [34mSave yelp subset at: /home/dunguyen/RUNS/dataset_/yelp-hat/yelp50.parquet[0m
04-10-2023 15:52:08 | [34m    INFO[0m [1m [4m dataset.py:download_format_dataset:110 [0m [34mSave yelp subset at: /home/dunguyen/RUNS/dataset_/yelp-hat/yelp100.parquet[0m
04-10-2023 15:52:08 | [34m    INFO[0m [1m [4m dataset.py:download_format_dataset:116 [0m [34mSave clean dataset at /home/dunguyen/RUNS/dataset_/yelp-hat/yelp.parquet[0m
04-10-2023 15:52:08 | [34m    INFO[0m [1m [4m dataset.py:download_format_dataset:123 [0m [34mSave training set at /home/dunguyen/RUNS/dataset_/yelp-hat/