# Enhancing explainability via a classifier-variant approach

Imports

In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

Load a Fine-Tuned Classifier

In [2]:
# Hugging Face Hub checkpoint id; the tokenizer and the model below are both loaded from it.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Put the model in evaluation mode. As the cell's last expression, this call
# also makes the notebook display the model's repr.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


Label Mapping

In [12]:
# Lookup taken from the loaded model's config; predict() indexes it with the
# predicted class id to obtain the label name.
label_mapping = model.config.id2label

Base Prediction Function

In [20]:
def predict(text):
    """Classify a single piece of text with the loaded sequence classifier.

    Args:
        text: The text to classify. It must tokenize to exactly one
            sequence (a plain string, or a one-element list of strings).

    Returns:
        dict with the keys:
            "label": label name of the predicted class (from ``label_mapping``).
            "confidence": softmax probability of the predicted class.
            "logits": 1-D NumPy array holding the raw logit of every class.
            "pred_id": integer index of the predicted class.

    Raises:
        ValueError: If ``text`` tokenizes to more than one sequence. The
            return value describes one prediction only, and mixing several
            sequences would normalise the softmax over the wrong axis.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )

    batch_size = inputs["input_ids"].shape[0]
    if batch_size != 1:
        raise ValueError(
            f"predict() expects a single text, got a batch of {batch_size}"
        )

    # Run on whatever device the model currently lives on, so the function
    # keeps working after `model.to("cuda")`.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Index the batch axis explicitly instead of squeeze(): the result is
    # always 1-D over classes, and softmax runs over the class axis.
    logits = outputs.logits[0]
    probs = torch.softmax(logits, dim=-1)

    pred_id = torch.argmax(probs).item()

    return {
        "label": label_mapping[pred_id],
        "confidence": probs[pred_id].item(),
        # .cpu() is required before .numpy() when the model runs on an accelerator.
        "logits": logits.detach().cpu().numpy(),
        "pred_id": pred_id
    }


Understand the Classifier-Variant Idea

In [21]:
# Original input:
# (a bare string literal; the notebook echoes it as this cell's output)

"The movie was absolutely fantastic and inspiring"

'The movie was absolutely fantastic and inspiring'

In [22]:
# Variant classifier inputs:
# NOTE: only the last bare expression of a cell is displayed, so just
# "absolutely" is echoed below.
# NOTE(review): these isolated words are illustrative only. generate_variants()
# builds each variant by removing one piece from the full sentence, not by
# feeding a single word to the classifier.
"fantastic"
"inspiring"
"absolutely"


'absolutely'

Token-Level Variant Generator

In [23]:
def generate_variants(text, merge_subwords=True):
  """Build leave-one-out variants of ``text``.

  The text is split with the notebook's ``tokenizer``; each variant is the
  text rebuilt with exactly one unit removed.

  Args:
    text: The input string to perturb.
    merge_subwords: When True (default), WordPiece continuation pieces
      (tokens starting with "##") are grouped with the piece they continue,
      so a whole word is removed at once. Removing a single piece could
      otherwise leave a dangling "##..." fragment in the variant text. Pass
      False to remove one raw tokenizer piece at a time.

  Returns:
    A list of ``(removed, modified_text)`` tuples in text order, where
    ``removed`` is the removed token (or the re-joined word when several
    pieces were grouped) and ``modified_text`` is the text without it.
    An input that yields no tokens gives an empty list.
  """
  tokens = tokenizer.tokenize(text)

  # Partition the token sequence into removable units (lists of pieces).
  units = []
  for token in tokens:
    if merge_subwords and token.startswith("##") and units:
      units[-1].append(token)
    else:
      units.append([token])

  variants = []
  for i, unit in enumerate(units):
    remaining = [
      piece
      for j, other in enumerate(units) if j != i
      for piece in other
    ]
    modified_text = tokenizer.convert_tokens_to_string(remaining)
    # Single-piece units keep the raw token as their name.
    removed = unit[0] if len(unit) == 1 else tokenizer.convert_tokens_to_string(unit)
    variants.append((removed, modified_text))

  return variants

Classifier-Variant Explainability Engine

In [24]:
def explain_by_token_removal(text):
    """Rank the pieces of ``text`` by how much removing each one lowers the
    logit of the originally predicted class.

    Args:
        text: The input string to explain.

    Returns:
        A list of dicts, sorted by "logit_drop" in descending order, each with:
            "token": the removed piece, as reported by ``generate_variants``.
            "logit_drop": original logit minus the logit after removal, for
                the originally predicted class, rounded to 4 decimals.
            "new_label": the label predicted for the text after removal.
    """
    baseline = predict(text)
    target_class = baseline["pred_id"]
    baseline_logit = baseline["logits"][target_class]

    def score_variant(token, ablated_text):
        # Re-classify the ablated text and measure the change on the
        # class that was predicted for the full text.
        outcome = predict(ablated_text)
        drop = baseline_logit - outcome["logits"][target_class]
        return {
            "token": token,
            "logit_drop": round(float(drop), 4),
            "new_label": outcome["label"]
        }

    ranked = [
        score_variant(token, ablated_text)
        for token, ablated_text in generate_variants(text)
    ]
    # Largest drop first; the sort is stable, so ties keep text order.
    ranked.sort(key=lambda entry: entry["logit_drop"], reverse=True)
    return ranked


Run the Explanation

In [25]:
text = "The movie was absolutely fantastic and inspiring"

# Baseline prediction on the unmodified sentence, kept for display below.
base_result = predict(text)
# Importance entries sorted by logit drop, largest first. Note that
# explain_by_token_removal() calls predict(text) again internally.
explanations = explain_by_token_removal(text)

print("Original prediction:", base_result)
print("\nTop Explanatory Tokens:")
# Show only the five highest-ranked entries.
for item in explanations[:5]:
  print(item)

Original prediction: {'label': 'POSITIVE', 'confidence': 0.9998890161514282, 'logits': array([-4.3716793,  4.7340064], dtype=float32), 'pred_id': 1}

Top Explanatory Tokens:
{'token': 'inspiring', 'logit_drop': 0.0533, 'new_label': 'POSITIVE'}
{'token': 'fantastic', 'logit_drop': 0.015, 'new_label': 'POSITIVE'}
{'token': 'and', 'logit_drop': 0.0146, 'new_label': 'POSITIVE'}
{'token': 'movie', 'logit_drop': 0.0114, 'new_label': 'POSITIVE'}
{'token': 'absolutely', 'logit_drop': 0.0017, 'new_label': 'POSITIVE'}
