# Explain predictions using Lime

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json
!pip install lime
from lime.lime_text import LimeTextExplainer
import torch
from scipy.special import softmax
from IPython.display import display, HTML
import sqlite3


from google.colab import files
torch.cuda.empty_cache()
uploaded = files.upload()




Saving masked_test_set_prompts.csv to masked_test_set_prompts (3).csv


In [2]:
data_set = pd.read_csv("masked_test_set_prompts.csv")
data_set = data_set[data_set["id"] != 42]
data_set = data_set[data_set["id"] != 25]
data_set = data_set[data_set["id"] != 67]
data_set

Unnamed: 0,id,prompt,gender,label,created_at,user_id,masked_prompt
0,1,I need some python code to verify the password...,Woman (cisgender),1,2025-10-04 00:19:48,1,I need some python code to verify the password...
1,2,I want you to create a plot for weather data. ...,Woman (cisgender),1,2025-10-04 00:22:51,1,I want you to create a plot for weather data. ...
2,3,Perfect. Can you add little bullet point on th...,Woman (cisgender),1,2025-10-04 00:22:51,1,Perfect. Can you add little bullet point on th...
3,4,Can you remove the x-axis description 'Days of...,Woman (cisgender),1,2025-10-04 00:22:51,1,Can you remove the x-axis description [OTHER] ...
4,5,great. Now I want to create another plot. The ...,Woman (cisgender),1,2025-10-04 00:22:51,1,great. Now I want to create another plot. The ...
...,...,...,...,...,...,...,...
74,75,Okay I need you to make some more adjustments....,Man (cisgender),0,2025-10-04 00:54:04,10,Okay I need you to make some more adjustments....
75,76,Okay almost! I still want lines for the x and ...,Man (cisgender),0,2025-10-04 00:54:04,10,Okay almost! I still want lines for the x and ...
76,77,very close. The only thing that needs to be ch...,Man (cisgender),0,2025-10-04 00:54:04,10,very close. The only thing that needs to be ch...
77,78,"Great! One more thing: Remove the ""Weekdays"" t...",Man (cisgender),0,2025-10-04 00:54:04,10,Great! One more thing: Remove the [TERM] title...


In [3]:
uploaded = files.upload()

Saving label2id.json to label2id (3).json


In [4]:
!hf auth login




    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `th

## Load Model and Tokenizer

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
#print("CUDA available:", torch.cuda.is_available())
print("Device:", device)


fine_tuned_model_path = "Mayaryin/gender-prompt_roberta_masked"

tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)
model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path).to(device)

with open("label2id.json", "r") as f:
    label2id = json.load(f)

print(tokenizer.special_tokens_map)
print(tokenizer.additional_special_tokens)


Device: cuda
{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['[URL]', '[CODE]', '[ID]', '[OTHER]', '[INFO]']}
['[URL]', '[CODE]', '[ID]', '[OTHER]', '[INFO]']


## Explain

In [6]:
class_names = list(label2id.keys())
id2label = {v: k for k, v in label2id.items()}


def predict(texts):
    # Tokenize and move each tensor to the correct device
    encodings = tokenizer(
        texts, return_tensors="pt", truncation=True, padding=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits.cpu().numpy() # for batch processing as expected by limes explainer, since it perturbs the text internally
        probs = softmax(logits, axis=1)
    return probs


## Set up custom explainer with roberta tokenizer

In [7]:
class SubwordLimeTextExplainer(LimeTextExplainer):
    def __init__(self, hf_tokenizer, **kwargs):
        super().__init__(**kwargs)
        self.hf_tokenizer = hf_tokenizer

    def tokenize(self, text):
        # Tokenize the text into subwords (by default returns list of strings/tokens)
        # Note: This usually includes special tokens, so we skip those
        tokens = self.hf_tokenizer.tokenize(text)
        return tokens

    def untokenize(self, tokens):
        # Convert the list of subword tokens back to a text string
        return self.hf_tokenizer.convert_tokens_to_string(tokens)

In [8]:
from collections import defaultdict
from tqdm import tqdm


explainer = SubwordLimeTextExplainer(hf_tokenizer=tokenizer, class_names=class_names)

# Sample column name is 'text'
importance_agg = defaultdict(float)  # token -> sum of scores
token_counts = defaultdict(int)      # token -> number of appearances in explanations

torch.cuda.empty_cache()

for sample_text in tqdm(data_set['masked_prompt'], desc="Explaining samples"):
    explanation = explainer.explain_instance(
        sample_text,
        predict,
        num_features=100,  # adjust as needed
        labels=[1]         # or the class index you're interested in
    )

    # Get the explanation as a list of (token, weight) tuples
    token_weights = explanation.as_list(label=1)  # use correct label

    for token, weight in token_weights:
        importance_agg[token] += weight
        token_counts[token] += 1

    torch.cuda.empty_cache() # to prevent out of memory errors


# Now aggregate: for example, calculate average importance for each token
average_importance = {token: importance_agg[token] / token_counts[token]
                      for token in importance_agg}

# Optionally, sort tokens by their average importance (desc)
sorted_tokens = sorted(average_importance.items(), key=lambda x: abs(x[1]), reverse=True)



# Print top 20 tokens
print("Top tokens by average importance:")
for token, score in sorted_tokens[:20]:
    print(f"{token}: {score:.4f}")


Explaining samples: 100%|██████████| 76/76 [04:02<00:00,  3.19s/it]

Top tokens by average importance:
interactively: -0.3080
working: -0.2646
horizontally: -0.2230
linux: -0.2144
hashtags: -0.1667
TERM: -0.1634
closer: -0.1519
visible: -0.1406
up: -0.1270
ID: 0.1140
mistake: -0.0893
ERROR: -0.0835
chat: -0.0815
developer: -0.0725
CODE: 0.0698
thanks: -0.0670
OTHER: 0.0652
work: -0.0632
now: -0.0627
This: -0.0617





In [16]:
special_tokens = "ID", "CODE", "OTHER", "INFO", "URL", "TERM", "ERROR"

exp_filtered = {tok: val for tok, val in average_importance.items() if tok not in special_tokens}

sorted_tokens_asc = sorted(exp_filtered.items(), key=lambda x: x[1])           # Lowest first
sorted_tokens_desc = sorted(exp_filtered.items(), key=lambda x: x[1], reverse=True)  # Highest first

top_n = 20  # Number to display for each


print("Top lowest tokens by average importance:")
for token, score in sorted_tokens_asc[:top_n]:
    print(f"{token}: {score:.4f}")

print("\nTop highest tokens by average importance:")
for token, score in sorted_tokens_desc[:top_n]:
    print(f"{token}: {score:.4f}")


Top lowest tokens by average importance:
interactively: -0.3080
working: -0.2646
horizontally: -0.2230
linux: -0.2144
hashtags: -0.1667
closer: -0.1519
visible: -0.1406
up: -0.1270
mistake: -0.0893
chat: -0.0815
developer: -0.0725
thanks: -0.0670
work: -0.0632
now: -0.0627
This: -0.0617
thank: -0.0577
conversation: -0.0520
passwords: -0.0502
decimal: -0.0476
Now: -0.0470

Top highest tokens by average importance:
Good: 0.0380
Yes: 0.0227
Password: 0.0160
Fri: 0.0154
These: 0.0152
machine: 0.0149
examples: 0.0146
Okay: 0.0127
mean: 0.0115
niceee: 0.0111
your: 0.0111
here: 0.0106
transparency: 0.0099
tuple: 0.0095
Friday: 0.0093
please: 0.0089
cases: 0.0086
Headline: 0.0083
filled: 0.0080
algorithm: 0.0080


In [14]:
print("id2label contents:", id2label)


id2label contents: {'Man (cisgender)': '0', 'Woman (cisgender)': '1'}


In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

print("label2id contents:", label2id)

sample_texts = data_set['masked_prompt'].tolist()
batch_size = 32  # Adjust as needed for your hardware

results = []
all_preds = []
all_labels = []  # If you have true labels, fill this in!

for i in range(0, len(sample_texts), batch_size):
    batch_texts = sample_texts[i:i+batch_size]
    probs = predict(batch_texts)
    preds = probs.argmax(axis=1)
    for j, text in enumerate(batch_texts):
        result = {
            "text": text,
            "probabilities": probs[j],
            "predicted_label": preds[j]
        }
        results.append(result)
        all_preds.append(preds[j])
        # If your data_set has a true label column, e.g., 'label', collect it as ints:
        all_labels.append(data_set.iloc[i + j]['label'])  # adapt 'label' column name as appropriate

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# (Optional) Expand probability columns for analysis
prob_cols = [f"prob_{label}" for label in label2id.keys()]
prob_array = np.stack(results_df["probabilities"])
for idx, col in enumerate(prob_cols):
    results_df[col] = prob_array[:, idx]
results_df.drop(columns="probabilities", inplace=True)

# Preview results
display(results_df.head())

# ------ METRICS ------

target_names = list(id2label.keys())
# If your label2id maps like {"female": 0, "male": 1} and your data has numeric labels:
print("\nClassification report:")
print(classification_report(all_labels, all_preds, target_names=target_names, digits=3))

acc = accuracy_score(all_labels, all_preds)
precision, recall, f1, support = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
print(f"Accuracy: {acc:.3f}")
print(f"Weighted Precision: {precision:.3f}")
print(f"Weighted Recall:    {recall:.3f}")
print(f"Weighted F1:        {f1:.3f}")


label2id contents: {'0': 'Man (cisgender)', '1': 'Woman (cisgender)'}


Unnamed: 0,text,predicted_label,prob_0,prob_1
0,I need some python code to verify the password...,0,0.718067,0.281933
1,I want you to create a plot for weather data. ...,0,0.627307,0.372693
2,Perfect. Can you add little bullet point on th...,1,0.474782,0.525218
3,Can you remove the x-axis description 'Days of...,1,0.478431,0.521569
4,great. Now I want to create another plot. The ...,0,0.675618,0.324382



Classification report:
                   precision    recall  f1-score   support

  Man (cisgender)      0.640     0.711     0.674        45
Woman (cisgender)      0.500     0.419     0.456        31

         accuracy                          0.592        76
        macro avg      0.570     0.565     0.565        76
     weighted avg      0.583     0.592     0.585        76

Accuracy: 0.592
Weighted Precision: 0.583
Weighted Recall:    0.592
Weighted F1:        0.585
