In [None]:
# ---
# Week 3: Explainability (AI vs Human Classifier)
# ---

import os
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from lime.lime_text import LimeTextExplainer
from captum.attr import IntegratedGradients
import joblib
import pandas as pd

# -----------------------------
# Setup
# -----------------------------
# Every figure produced below is written under this directory; create it up
# front so the later savefig calls do not fail on a missing path.
PLOTS_DIR = "../reports/plots"
os.makedirs(PLOTS_DIR, exist_ok=True)

# Checkpoint name and the token budget used whenever text is tokenized.
MODEL_NAME = "distilroberta-base"
MAX_LEN = 256

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and encoder (loaded through AutoModel), moved to the chosen
# device and switched to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
hf_model = AutoModel.from_pretrained(MODEL_NAME)
hf_model = hf_model.to(device)
hf_model.eval()

# -----------------------------
# Embedding function (same as training)
# -----------------------------
def get_embedding(text: str):
    """Embed *text* as the attention-masked mean of the last hidden states.

    The input is tokenized with padding and truncation to ``MAX_LEN`` tokens,
    run through ``hf_model`` without gradient tracking, and the per-token
    vectors are averaged with masked-out positions weighted zero. The file's
    own note says this mirrors the embedding used at training time.

    Returns:
        The pooled embedding(s) as a NumPy array on the CPU.
    """
    batch = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LEN,
    ).to(device)
    with torch.no_grad():
        hidden_states = hf_model(**batch).last_hidden_state
        # Trailing singleton axis lets the 0/1 mask broadcast over features.
        weights = batch["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        weighted_sum = (hidden_states * weights).sum(1)
        token_counts = weights.sum(1)
        pooled = weighted_sum / token_counts
    return pooled.cpu().numpy()

# -----------------------------
# Example for explanation
# -----------------------------
# Validation split; the "text" and "label" columns are the ones read below.
val_df = pd.read_csv("../data/processed/val.csv")   # adjust path if needed
val_texts = val_df["text"].tolist()

# Persisted classifier over the pooled embeddings, plus its label encoder.
model = joblib.load("../models/logreg_transformer_emb_best.joblib")
label_encoder = joblib.load("../models/label_encoder_transformer.joblib")

# Classifier parameters as float32 tensors on the active device.
# NOTE(review): presumably a scikit-learn LogisticRegression (per the file
# name). If so, coef_ is (1, dim) for a two-class problem and
# (n_classes, dim) otherwise, with intercept_ shaped to match -- confirm.
_tensor_opts = dict(dtype=torch.float32, device=device)
W_t = torch.tensor(model.coef_, **_tensor_opts)
b_t = torch.tensor(model.intercept_, **_tensor_opts)

n_val = len(val_texts)
print(f"✅ Loaded {n_val} validation samples")
print("Classes:", list(label_encoder.classes_))

# The single validation sample that the explanations below are computed for.
example_idx = 5  # choose a sample index from val set
example_row = val_df.iloc[example_idx]
text_example = val_texts[example_idx]
true_label = example_row["label"]

# Show only the first 200 characters to keep the log readable.
print("🔍 Example Text:", text_example[:200], "...")
print("True Label:", true_label)

# -----------------------------
# LIME Explainability
# -----------------------------
def predict_proba_texts(texts, batch_size=32):
    """Return class probabilities for raw texts (LIME ``classifier_fn``).

    LIME calls this with a list of perturbed strings (thousands per
    explanation by default), so the texts are embedded in mini-batches
    instead of one encoder forward pass per string.

    Args:
        texts: Sequence of strings to classify.
        batch_size: Number of texts embedded per encoder forward pass.

    Returns:
        The array produced by ``model.predict_proba`` for the stacked
        embeddings, one row per input text.
    """
    texts = list(texts)
    # get_embedding forwards its argument to the tokenizer unchanged, and the
    # Hugging Face tokenizer accepts a list of strings; with padding=True and
    # the attention-mask-weighted mean, padded positions do not contribute to
    # the pooled vectors (results match the per-text path up to float error).
    chunks = [
        get_embedding(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    embs = np.vstack(chunks)
    return model.predict_proba(embs)

# Human-readable class names, in the label encoder's own order.
class_names = [*label_encoder.classes_]
lime_explainer = LimeTextExplainer(class_names=class_names)

# Local explanation of the selected example, limited to ten features.
lime_exp = lime_explainer.explain_instance(
    text_instance=text_example,
    classifier_fn=predict_proba_texts,
    num_features=10,
)

# Render the explanation, replace its title, and write it to disk.
lime_plot_path = f"{PLOTS_DIR}/lime_example.png"
fig = lime_exp.as_pyplot_figure()
plt.title("LIME Explanation (Top Features)")
plt.tight_layout()
plt.savefig(lime_plot_path)
plt.close()
print("✅ LIME plot saved ->", lime_plot_path)

# -----------------------------
# Captum Integrated Gradients (Improved)
# -----------------------------
# Attribute the prediction of the *trained* pipeline:
#   encoder -> attention-masked mean pooling -> logistic regression.
# The encoder loaded during setup (`hf_model`) is reused on purpose. Loading
# AutoModelForSequenceClassification here would
#   (a) attach a randomly initialised classification head, so the
#       attributions would describe an untrained model rather than the
#       classifier that LIME explains above, and
#   (b) rebind `hf_model`, after which get_embedding() could no longer read
#       `.last_hidden_state` from the model output.

# Input embedding layer: Integrated Gradients interpolates in embedding
# space because token ids are discrete.
embedding_layer = hf_model.get_input_embeddings()


def forward_func(inputs_embeds, attention_mask):
    """Map input embeddings to class probabilities of the trained pipeline.

    Mirrors get_embedding() followed by the logistic regression held in
    ``W_t`` / ``b_t``, but stays in torch so gradients can flow back to
    ``inputs_embeds``.

    Args:
        inputs_embeds: Embeddings fed to the encoder in place of token ids.
        attention_mask: Mask with one row per row of ``inputs_embeds``.

    Returns:
        Tensor with one row per input and one probability column per class.
    """
    hidden = hf_model(
        inputs_embeds=inputs_embeds, attention_mask=attention_mask
    ).last_hidden_state
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(1) / mask.sum(1)
    logits = pooled @ W_t.T + b_t
    if logits.shape[-1] == 1:
        # A single coefficient row means a two-class model: the logit scores
        # the second class, so build both columns explicitly.
        p_second = torch.sigmoid(logits)
        return torch.cat([1.0 - p_second, p_second], dim=-1)
    # NOTE(review): softmax matches a multinomial logistic regression; a
    # one-vs-rest model would normalise per-class sigmoids instead -- confirm.
    return torch.softmax(logits, dim=-1)


# Tokenize the single example. No padding is requested: one sequence needs
# none, and pad tokens would only clutter the attribution plot.
enc = tokenizer(
    text_example,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_LEN,
).to(device)

input_ids = enc["input_ids"]
attention_mask = enc["attention_mask"]

# Convert token IDs -> embeddings. Detach so the tensor is a plain input
# rather than a node in the embedding layer's autograd graph.
inputs_embeds = embedding_layer(input_ids).detach()

# Run IG for the true label, against an all-zeros embedding baseline.
# NOTE(review): assumes the classifier's probability columns follow the
# label encoder's class order -- confirm against the training code.
target_idx = int(label_encoder.transform([true_label])[0])
ig = IntegratedGradients(forward_func)
attributions = ig.attribute(
    inputs=inputs_embeds,
    baselines=torch.zeros_like(inputs_embeds),
    additional_forward_args=(attention_mask,),
    target=target_idx,
    n_steps=50,
)

# One score per token: sum the attribution over the embedding dimension.
attr = attributions.squeeze(0).sum(-1).detach().cpu().numpy()
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Min-max normalize for viz (epsilon guards against a constant vector).
attr_norm = (attr - attr.min()) / (attr.max() - attr.min() + 1e-8)

# Plot token attributions
plt.figure(figsize=(12, 4))
plt.bar(range(len(tokens)), attr_norm, tick_label=tokens)
plt.xticks(rotation=90, fontsize=8)
plt.title("Integrated Gradients Token Attribution (Captum)")
plt.tight_layout()
plt.savefig(f"{PLOTS_DIR}/captum_ig_tokens.png")
plt.close()
print("✅ Captum IG token-level plot saved ->", f"{PLOTS_DIR}/captum_ig_tokens.png")

# -----------------------------
# Deliverables Summary
# -----------------------------
# Closing recap of the artefacts this cell produces, one per output line.
_deliverable_lines = (
    "🎯 Week 3 Deliverables Generated:",
    "- LIME local explanation plot",
    "- Captum Integrated Gradients attribution plot",
)
print("\n".join(_deliverable_lines))


✅ Loaded 1500 validation samples
Classes: ['AI', 'Human']
🔍 Example Text: In today's world, online classes are becoming an increasingly popular way to access education. They are a valuable tool for those who cannot attend traditional schools due to personal or financial rea ...
True Label: AI
✅ LIME plot saved -> ../reports/plots/lime_example.png


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)