# Explain predictions using Lime

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
!pip install lime
from lime.lime_text import LimeTextExplainer
import torch
from scipy.special import softmax
from IPython.display import display, HTML
import sqlite3


from google.colab import files
uploaded = files.upload()


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=ea9b5100f55596b51a9706c1ae774c267e4a68b86ba9409537e4b69eaf4ca777
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


Saving lime_colab_dataset.csv to lime_colab_dataset.csv


In [2]:
data_set = pd.read_csv("lime_colab_dataset.csv")
data_set

Unnamed: 0,conversational,label
0,what is the best way to encode and compress a ...,0
1,"does lzstring also work in the browser, client...",0
2,Is there a way in typescript to cast to a type...,0
3,I would like to distill a type based on an inc...,0
4,can you create Photoshop Scripts?,0
...,...,...
109,can you update directly the folder iterative c...,1
110,"super, now can you give me latex formula for t...",1
111,yes latex code please,1
112,I am working on the problem of reconstruc...,0


In [3]:
uploaded = files.upload()

Saving label2id.json to label2id.json


In [4]:
!hf auth login




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `thesis` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `thesis`


## Load Model and Tokenizer

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available:", torch.cuda.is_available())
print("Device:", device)


fine_tuned_model_path = "Mayaryin/gender-prompt-roberta-base"

tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path).to(device)

with open("label2id.json", "r") as f:
    label2id = json.load(f)

CUDA available: True
Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

## Explain

In [6]:
class_names = list(label2id.keys())
id2label = {v: k for k, v in label2id.items()}


def predict(texts):
    # Tokenize and move each tensor to the correct device
    encodings = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits.cpu().numpy() # for batch processing as expected by limes explainer, since it perturbs the text internally
        probs = softmax(logits, axis=1)
    return probs


## Set up custom explainer with roberta tokenizer

In [7]:
class SubwordLimeTextExplainer(LimeTextExplainer):
    def __init__(self, hf_tokenizer, **kwargs):
        super().__init__(**kwargs)
        self.hf_tokenizer = hf_tokenizer

    def tokenize(self, text):
        # Tokenize the text into subwords (by default returns list of strings/tokens)
        # Note: This usually includes special tokens, so we skip those
        tokens = self.hf_tokenizer.tokenize(text)
        return tokens

    def untokenize(self, tokens):
        # Convert the list of subword tokens back to a text string
        return self.hf_tokenizer.convert_tokens_to_string(tokens)

In [8]:
from collections import defaultdict
from tqdm import tqdm


explainer = SubwordLimeTextExplainer(hf_tokenizer=tokenizer, class_names=class_names)

# Sample column name is 'text'
importance_agg = defaultdict(float)  # token -> sum of scores
token_counts = defaultdict(int)      # token -> number of appearances in explanations

for sample_text in tqdm(data_set['conversational'], desc="Explaining samples"):
    explanation = explainer.explain_instance(
        sample_text,
        predict,
        num_features=100,  # adjust as needed
        labels=[1]         # or the class index you're interested in
    )

    # Get the explanation as a list of (token, weight) tuples
    token_weights = explanation.as_list(label=1)  # use correct label

    for token, weight in token_weights:
        importance_agg[token] += weight
        token_counts[token] += 1

# Now aggregate: for example, calculate average importance for each token
average_importance = {token: importance_agg[token] / token_counts[token]
                      for token in importance_agg}

# Optionally, sort tokens by their average importance (desc)
sorted_tokens = sorted(average_importance.items(), key=lambda x: abs(x[1]), reverse=True)

# Print top 20 tokens
print("Top tokens by average importance:")
for token, score in sorted_tokens[:20]:
    print(f"{token}: {score:.4f}")


Explaining samples: 100%|██████████| 114/114 [02:36<00:00,  1.38s/it]

Top tokens by average importance:
concrete: 0.4368
continue: 0.3892
every: 0.3691
bounds: -0.3427
though: 0.3343
us: 0.3323
own: 0.3292
information: 0.3190
buffer: -0.3044
away: 0.3008
customize: -0.2958
interview: 0.2756
complexity: -0.2706
better: 0.2704
results: 0.2640
PDF: 0.2604
table: 0.2564
tools: 0.2560
papers: 0.2540
approach: 0.2521





In [9]:
sorted_tokens_asc = sorted(average_importance.items(), key=lambda x: x[1])           # Lowest first
sorted_tokens_desc = sorted(average_importance.items(), key=lambda x: x[1], reverse=True)  # Highest first

top_n = 20  # Number to display for each


print("Top lowest tokens by average importance:")
for token, score in sorted_tokens_asc[:top_n]:
    print(f"{token}: {score:.4f}")

print("\nTop highest tokens by average importance:")
for token, score in sorted_tokens_desc[:top_n]:
    print(f"{token}: {score:.4f}")


Top lowest tokens by average importance:
bounds: -0.3427
buffer: -0.3044
customize: -0.2958
complexity: -0.2706
fixed: -0.2439
throw: -0.2247
type: -0.2116
Write: -0.1940
colour: -0.1904
Focus: -0.1782
funny: -0.1655
chat: -0.1606
saved: -0.1605
frames: -0.1444
lol: -0.1422
setup: -0.1378
height: -0.1321
append: -0.1287
error: -0.1268
adjustment: -0.1255

Top highest tokens by average importance:
concrete: 0.4368
continue: 0.3892
every: 0.3691
though: 0.3343
us: 0.3323
own: 0.3292
information: 0.3190
away: 0.3008
interview: 0.2756
better: 0.2704
results: 0.2640
PDF: 0.2604
table: 0.2564
tools: 0.2560
papers: 0.2540
approach: 0.2521
images: 0.2487
follows: 0.2364
dataframe: 0.2351
Please: 0.2322


In [12]:
print("id2label contents:", id2label)


id2label contents: {'Man (cisgender)': '0', 'Woman (cisgender)': '1'}


In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

print("label2id contents:", label2id)

sample_texts = data_set['conversational'].tolist()
batch_size = 32  # Adjust as needed for your hardware

results = []
all_preds = []
all_labels = []  # If you have true labels, fill this in!

for i in range(0, len(sample_texts), batch_size):
    batch_texts = sample_texts[i:i+batch_size]
    probs = predict(batch_texts)
    preds = probs.argmax(axis=1)
    for j, text in enumerate(batch_texts):
        result = {
            "text": text,
            "probabilities": probs[j],
            "predicted_label": preds[j]
        }
        results.append(result)
        all_preds.append(preds[j])
        # If your data_set has a true label column, e.g., 'label', collect it as ints:
        all_labels.append(data_set.iloc[i + j]['label'])  # adapt 'label' column name as appropriate

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# (Optional) Expand probability columns for analysis
prob_cols = [f"prob_{label}" for label in label2id.keys()]
prob_array = np.stack(results_df["probabilities"])
for idx, col in enumerate(prob_cols):
    results_df[col] = prob_array[:, idx]
results_df.drop(columns="probabilities", inplace=True)

# Preview results
display(results_df.head())

# ------ METRICS ------

target_names = list(id2label.keys())
# If your label2id maps like {"female": 0, "male": 1} and your data has numeric labels:
print("\nClassification report:")
print(classification_report(all_labels, all_preds, target_names=target_names, digits=3))

acc = accuracy_score(all_labels, all_preds)
precision, recall, f1, support = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
print(f"Accuracy: {acc:.3f}")
print(f"Weighted Precision: {precision:.3f}")
print(f"Weighted Recall:    {recall:.3f}")
print(f"Weighted F1:        {f1:.3f}")


label2id contents: {0: 'Man (cisgender)', 1: 'Woman (cisgender)'}


Unnamed: 0,text,predicted_label,prob_0,prob_1
0,what is the best way to encode and compress a ...,0,0.903502,0.096498
1,"does lzstring also work in the browser, client...",0,0.981499,0.0185
2,Is there a way in typescript to cast to a type...,0,0.988871,0.011129
3,I would like to distill a type based on an inc...,1,0.349372,0.650628
4,can you create Photoshop Scripts?,0,0.984916,0.015084



Classification report:
                   precision    recall  f1-score   support

  Man (cisgender)      0.957     0.772     0.854        57
Woman (cisgender)      0.809     0.965     0.880        57

         accuracy                          0.868       114
        macro avg      0.883     0.868     0.867       114
     weighted avg      0.883     0.868     0.867       114

Accuracy: 0.868
Weighted Precision: 0.883
Weighted Recall:    0.868
Weighted F1:        0.867
