# Explainable AI for Sentiment Classification

This notebook implements three XAI methods:
1. LIME (Local Interpretable Model-agnostic Explanations)
2. SHAP (SHapley Additive exPlanations)
3. Integrated Gradients

In [1]:
!pip install transformers torch pandas matplotlib numpy scikit-learn scipy -q


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\Imanuel Girsang\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import json
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import Ridge
from scipy.special import comb
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'pandas'

## Load Model

In [None]:
# Directory that holds the fine-tuned checkpoint, the tokenizer files and
# the label-mapping JSON written at training time.
model_path = './best_roberta_model'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()  # evaluation mode for inference

with open(f'{model_path}/label_mappings.json', 'r') as f:
    label_mappings = json.load(f)

label_list, label2id = label_mappings['label_list'], label_mappings['label2id']
# JSON object keys are always strings; convert them back to integer class ids.
id2label = {int(class_id): name for class_id, name in label_mappings['id2label'].items()}

def predict_sentiment(texts, batch_size=64):
    """Return softmax class probabilities for one text or a list of texts.

    Args:
        texts: A single string or a sequence of strings.
        batch_size: Maximum number of texts sent through the model in one
            forward pass. The explainers below submit hundreds of perturbed
            texts at once; chunking keeps peak memory bounded.

    Returns:
        A NumPy array with one row per input text and one column per class
        (softmax over the model logits). A single string yields one row.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # The tokenizer rejects an empty batch; return an empty result instead.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    prob_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=64)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        prob_chunks.append(F.softmax(logits, dim=-1).cpu().numpy())
    return np.concatenate(prob_chunks, axis=0)

## Test Samples

In [None]:
# Hand-written review-style sentences that every explainer below is run on.
# The plotting cell later selects entries of this list by position.
test_samples = [
    "This movie is absolutely fantastic! I loved every moment of it.",
    "Terrible experience, waste of time and money. Very disappointed.",
    "The product is okay, nothing special but does the job.",
    "I'm extremely happy with this purchase! Highly recommend!",
    "This is the worst service I've ever encountered. Absolutely horrible.",
    "Not bad, could be better but acceptable for the price.",
    "Outstanding quality! Exceeded all my expectations.",
    "Mediocre at best, wouldn't buy again.",
    "I feel neutral about this, it's just average.",
    "Brilliant work! Truly exceptional and inspiring."
]

## LIME Implementation

In [None]:
class LIMEExplainer:
    """Word-level LIME: fit a locally weighted linear surrogate to a classifier.

    Random subsets of the input words are dropped, the black-box model is
    queried on each perturbed text, and a ridge regression from the
    keep/drop masks to the target-class probability is fitted. The ridge
    coefficients are the per-word importances.

    Args:
        predict_fn: Callable mapping a list of strings to an array with one
            row of class probabilities per string.
        num_samples: Number of perturbed texts to generate per explanation.
        kernel_width: Width of the exponential locality kernel. The default
            of 25 is on the scale of cosine distance multiplied by 100,
            which is what ``_mask_distances`` produces.
        random_state: Optional integer seed for reproducible perturbations.
            When ``None`` the global NumPy random state is used, so a prior
            ``np.random.seed`` call still takes effect.
    """

    def __init__(self, predict_fn, num_samples=1000, kernel_width=25, random_state=None):
        self.predict_fn = predict_fn
        self.num_samples = num_samples
        self.kernel_width = kernel_width
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def tokenize(self, text):
        """Split ``text`` on whitespace into the words that get perturbed."""
        return text.split()

    def kernel_fn(self, distances):
        """Map distances to sample weights: sqrt(exp(-d^2 / kernel_width^2))."""
        return np.sqrt(np.exp(-(distances ** 2) / self.kernel_width ** 2))

    def _mask_distances(self, masks):
        """Cosine distance (x100) between each mask and the all-ones mask.

        For a binary mask that keeps k of n words the cosine similarity to
        the unperturbed instance is sqrt(k / n), so the scaled distance is
        100 * (1 - sqrt(k / n)): 0 for the full text, 100 for the empty one.
        Raw removed-word counts are far too small for a kernel width of 25
        and would make every sample weigh almost the same.
        """
        kept_fraction = masks.sum(axis=1) / masks.shape[1]
        return 100.0 * (1.0 - np.sqrt(kept_fraction))

    def explain(self, text, target_class=None):
        """Explain the prediction for ``text``.

        Args:
            text: The input string.
            target_class: Class index to explain. Defaults to the class the
                model predicts for the unperturbed text.

        Returns:
            ``(importance, target_class)`` where ``importance`` is a list of
            ``(word, coefficient)`` pairs sorted by decreasing absolute
            coefficient. The list is empty when ``text`` contains no words.
        """
        words = self.tokenize(text)
        n_words = len(words)

        if target_class is None:
            target_class = int(np.argmax(self.predict_fn([text])[0]))

        if n_words == 0:
            # Nothing to perturb; a regression on zero features cannot be fitted.
            return [], target_class

        perturbations = self._rng.binomial(1, 0.5, (self.num_samples, n_words))
        perturbed_texts = [
            ' '.join(w for w, m in zip(words, mask) if m == 1)
            for mask in perturbations
        ]

        predictions = self.predict_fn(perturbed_texts)
        y = predictions[:, target_class]
        weights = self.kernel_fn(self._mask_distances(perturbations))

        ridge = Ridge(alpha=1.0)
        ridge.fit(perturbations, y, sample_weight=weights)

        importance = sorted(zip(words, ridge.coef_), key=lambda item: abs(item[1]), reverse=True)
        return importance, target_class

## SHAP Implementation

In [None]:
class SHAPExplainer:
    """Word-level Shapley value estimator based on permutation sampling.

    For every sampled ordering of the words, the words are switched on one
    at a time and the change in the target-class probability is credited to
    the word just added. Averaging these marginal contributions over random
    orderings is an unbiased estimate of the Shapley values, and for each
    ordering the contributions sum exactly to
    f(all words) - f(empty text).

    Args:
        predict_fn: Callable mapping a list of strings to an array with one
            row of class probabilities per string.
        num_samples: Number of random word orderings to average over.
        random_state: Optional integer seed. When ``None`` the global NumPy
            random state is used, so ``np.random.seed`` still takes effect.
    """

    def __init__(self, predict_fn, num_samples=500, random_state=None):
        self.predict_fn = predict_fn
        self.num_samples = num_samples
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def shapley_kernel(self, n, s):
        """Kernel SHAP regression weight for a coalition of size ``s`` out of ``n``.

        The empty and the full coalition get a large constant (1e10). These
        weights are meant for a weighted linear regression over coalitions;
        they must not be multiplied into marginal contributions, which is
        why ``explain`` does not use them. Kept for backward compatibility.
        """
        if s == 0 or s == n:
            return 1e10
        return (n - 1) / (comb(n, s) * s * (n - s))

    def explain(self, text, target_class=None):
        """Estimate per-word Shapley values for ``text``.

        Args:
            text: The input string; it is split on whitespace.
            target_class: Class index to explain. Defaults to the class the
                model predicts for the full text.

        Returns:
            ``(importance, target_class)`` where ``importance`` is a list of
            ``(word, shap_value)`` pairs sorted by decreasing absolute value.
            The list is empty when ``text`` contains no words.
        """
        words = text.split()
        n_words = len(words)

        if target_class is None:
            target_class = int(np.argmax(self.predict_fn([text])[0]))

        if n_words == 0:
            return [], target_class

        # Coalition value cache keyed by the keep/drop mask. Short texts have
        # few distinct coalitions, so most lookups never reach the model.
        empty_mask = (0,) * n_words
        cache = {empty_mask: float(self.predict_fn([''])[0, target_class])}

        def coalition_values(masks):
            missing = [m for m in dict.fromkeys(masks) if m not in cache]
            if missing:
                texts = [' '.join(w for w, keep in zip(words, m) if keep) for m in missing]
                preds = self.predict_fn(texts)  # one batched call per ordering
                for m, p in zip(missing, preds[:, target_class]):
                    cache[m] = float(p)
            return [cache[m] for m in masks]

        shap_values = np.zeros(n_words)
        for _ in range(self.num_samples):
            order = self._rng.permutation(n_words)

            # Masks for the growing coalition: empty, +1st word, +2nd word, ...
            mask = [0] * n_words
            masks = [tuple(mask)]
            for i in order:
                mask[i] = 1
                masks.append(tuple(mask))

            values = coalition_values(masks)
            for step, i in enumerate(order):
                shap_values[i] += values[step + 1] - values[step]

        shap_values /= self.num_samples
        importance = sorted(zip(words, shap_values), key=lambda item: abs(item[1]), reverse=True)

        return importance, target_class

## Integrated Gradients Implementation

In [None]:
class IntegratedGradientsExplainer:
    """Integrated Gradients over the input word embeddings of a classifier.

    Attributions are computed along the straight line from an all-zero
    embedding baseline to the actual word embeddings and summed over the
    embedding dimension, giving one score per tokenizer token.

    Args:
        model: Model that exposes ``get_input_embeddings()`` and accepts
            ``inputs_embeds`` / ``attention_mask``, returning ``.logits``.
        tokenizer: Tokenizer matching ``model``.
        device: Torch device the model lives on.
        n_steps: Number of interpolation intervals; ``n_steps + 1`` points
            are evaluated.

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """

    def __init__(self, model, tokenizer, device, n_steps=50):
        if n_steps < 1:
            raise ValueError("n_steps must be >= 1")
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.n_steps = n_steps

    def explain(self, text, target_class=None):
        """Attribute the target-class logit to the tokens of ``text``.

        Args:
            text: The input string.
            target_class: Class index to explain. Defaults to the class with
                the highest logit for ``text``.

        Returns:
            ``(importance, target_class)`` where ``importance`` is a list of
            ``(token, attribution)`` pairs, special tokens removed, sorted
            by decreasing absolute attribution.
        """
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=64)
        input_ids = inputs['input_ids'].to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)

        if target_class is None:
            with torch.no_grad():
                target_class = self.model(input_ids=input_ids, attention_mask=attention_mask).logits.argmax().item()

        # Use the raw word embeddings only. The model adds position embeddings
        # and LayerNorm itself when it receives `inputs_embeds`; feeding it the
        # output of the full embedding module would apply both twice.
        with torch.no_grad():
            embeddings = self.model.get_input_embeddings()(input_ids)
        baseline = torch.zeros_like(embeddings)
        delta = embeddings - baseline

        grad_sum = torch.zeros_like(embeddings)
        for step in range(self.n_steps + 1):
            alpha = step / self.n_steps
            # Detach so the interpolated point is a leaf tensor; gradients of
            # non-leaf tensors are not retained by autograd.
            scaled_input = (baseline + alpha * delta).detach().requires_grad_(True)

            outputs = self.model(inputs_embeds=scaled_input, attention_mask=attention_mask)
            score = outputs.logits[0, target_class]
            # autograd.grad returns the gradient directly and leaves the
            # model parameters' .grad buffers untouched.
            grad_sum += torch.autograd.grad(score, scaled_input)[0]

        avg_gradients = grad_sum / (self.n_steps + 1)
        integrated_gradients = delta * avg_gradients
        attributions = integrated_gradients.sum(dim=-1).squeeze(0).detach().cpu().numpy()

        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0].cpu().tolist())
        importance = [(t, a) for t, a in zip(tokens, attributions) if t not in ['<s>', '</s>', '<pad>']]
        importance.sort(key=lambda x: abs(x[1]), reverse=True)

        return importance, target_class

## Run Explanations

In [None]:
# LIME and SHAP draw random perturbations; seed NumPy so that a
# Restart & Run All reproduces the same attributions.
SEED = 42
np.random.seed(SEED)

lime_explainer = LIMEExplainer(predict_sentiment, num_samples=1000)
shap_explainer = SHAPExplainer(predict_sentiment, num_samples=500)
ig_explainer = IntegratedGradientsExplainer(model, tokenizer, device, n_steps=50)

# One dict per sample: prediction, confidence and the top-5 features of each method.
results = []

for idx, text in enumerate(test_samples):
    print(f"\nSample {idx+1}: {text}")

    probs = predict_sentiment(text)[0]
    pred_class = int(np.argmax(probs))
    pred_label = id2label[pred_class]

    print(f"Prediction: {pred_label} ({probs[pred_class]:.3f})")

    # Pass the predicted class explicitly so all three methods explain the
    # same output and none of them has to re-run the model to find it.
    lime_features, _ = lime_explainer.explain(text, target_class=pred_class)
    shap_features, _ = shap_explainer.explain(text, target_class=pred_class)
    ig_features, _ = ig_explainer.explain(text, target_class=pred_class)

    results.append({
        'sample_id': idx + 1,
        'text': text,
        'prediction': pred_label,
        'confidence': float(probs[pred_class]),
        'lime_features': lime_features[:5],
        'shap_features': shap_features[:5],
        'ig_features': ig_features[:5]
    })

## Visualization

In [None]:
def visualize_comparison(sample_result):
    """Draw the LIME, SHAP and IG feature scores of one sample side by side.

    Args:
        sample_result: A result dict with the keys 'lime_features',
            'shap_features' and 'ig_features' (lists of (label, score)
            pairs) plus 'text', 'prediction' and 'confidence'.

    Positive scores are drawn in green, all others in red. The figure is
    shown immediately; nothing is returned.
    """
    panels = (
        ('lime_features', 'LIME'),
        ('shap_features', 'SHAP'),
        ('ig_features', 'Integrated Gradients'),
    )
    _, axes = plt.subplots(1, 3, figsize=(18, 5))

    for (feature_key, panel_title), ax in zip(panels, axes):
        labels, values = zip(*sample_result[feature_key])
        rows = range(len(labels))
        bar_colors = []
        for value in values:
            bar_colors.append('green' if value > 0 else 'red')

        ax.barh(rows, values, color=bar_colors, alpha=0.6)
        ax.set_yticks(rows)
        ax.set_yticklabels(labels)
        ax.set_title(panel_title)
        ax.axvline(x=0, color='black', linewidth=0.5)
        ax.grid(axis='x', alpha=0.3)

    # Long texts are cut to 60 characters so the title stays on one line.
    full_text = sample_result['text']
    if len(full_text) > 60:
        text_preview = full_text[:60] + '...'
    else:
        text_preview = full_text

    plt.suptitle(f'{text_preview}\nPrediction: {sample_result["prediction"]} ({sample_result["confidence"]:.3f})')
    plt.tight_layout()
    plt.show()

# Plot a hand-picked subset of the results (0-based positions in `results`).
for idx in [0, 1, 4, 9]:
    visualize_comparison(results[idx])

## Feature Agreement Analysis

In [None]:
# LIME and SHAP score whitespace-separated words, which keep attached
# punctuation ("fantastic!"), while Integrated Gradients scores RoBERTa BPE
# tokens, which carry a leading 'Ġ' when the token follows a space. Without
# normalisation the same word almost never compares equal across methods.
# Words that the tokenizer splits into several sub-word pieces can still
# fail to match.
_PUNCTUATION = '.,!?;:\'"()[]'


def normalize_feature(token):
    """Lower-case a feature label and strip the BPE space marker and punctuation."""
    return token.lstrip('Ġ').strip(_PUNCTUATION).lower()


def feature_set(features):
    """Return the set of non-empty normalised labels of (label, score) pairs."""
    normalized = (normalize_feature(label) for label, _ in features)
    return {label for label in normalized if label}


agreement_stats = []

for result in results:
    lime_words = feature_set(result['lime_features'])
    shap_words = feature_set(result['shap_features'])
    ig_words = feature_set(result['ig_features'])

    all_three = lime_words & shap_words & ig_words

    agreement_stats.append({
        'sample_id': result['sample_id'],
        'all_three': len(all_three),
        'lime_shap': len(lime_words & shap_words),
        'lime_ig': len(lime_words & ig_words),
        'shap_ig': len(shap_words & ig_words),
        # Sorted so the printed output is stable across runs.
        'agreed_features': sorted(all_three)
    })

for stat in agreement_stats:
    print(f"\nSample {stat['sample_id']}: {stat['all_three']} features agreed by all three methods")
    print(f"  Features: {stat['agreed_features']}")

avg = np.mean([s['all_three'] for s in agreement_stats])
print(f"\nAverage agreement: {avg:.2f} features")

## Summary Table

In [None]:
def _preview(text, limit=40):
    """Return ``text`` cut to ``limit`` characters, with '...' only if it was cut."""
    return text[:limit] + '...' if len(text) > limit else text


def _top_feature(features):
    """Return the label of the highest-ranked feature, or '' if there is none."""
    return features[0][0] if features else ''


# One row per sample: prediction, confidence and each method's top feature.
summary_df = pd.DataFrame([{
    'Sample': r['sample_id'],
    'Text': _preview(r['text']),
    'Prediction': r['prediction'],
    'Confidence': f"{r['confidence']:.3f}",
    'LIME Top': _top_feature(r['lime_features']),
    'SHAP Top': _top_feature(r['shap_features']),
    'IG Top': _top_feature(r['ig_features'])
} for r in results])

summary_df