# Sentiment Classification Interpretability
1. LIME (Local Interpretable Model-agnostic Explanations)
2. SHAP (SHapley Additive exPlanations)
3. Layer-wise Relevance Propagation (LRP)

In [None]:
!pip install transformers torch pandas matplotlib numpy scikit-learn scipy -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import json
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from sklearn.linear_model import Ridge
from scipy.special import comb
import warnings
warnings.filterwarnings("ignore")

## Load Model

In [None]:
# Directory that holds the fine-tuned checkpoint and its label metadata.
model_path = './best_roberta_model'
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = RobertaTokenizer.from_pretrained(model_path)

# Use the GPU when one is available; eval() puts layers such as dropout into
# inference behaviour so repeated predictions are deterministic.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

# JSON is UTF-8 by specification, so do not depend on the platform's default
# text encoding when reading the label file.
with open(f'{model_path}/label_mappings.json', 'r', encoding='utf-8') as f:
    label_mappings = json.load(f)

label_list = label_mappings['label_list']
label2id = label_mappings['label2id']
# JSON object keys are always strings; convert them back to integer class ids.
id2label = {int(k): v for k, v in label_mappings['id2label'].items()}

def predict_sentiment(texts, batch_size=32):
    """Return class probabilities for one text or a collection of texts.

    Args:
        texts: A single string or an iterable of strings. A bare string is
            treated as a one-element batch.
        batch_size: Maximum number of texts sent through the model in one
            forward pass. The explainers call this function with hundreds of
            perturbed texts at once, so the input is processed in chunks to
            keep peak memory bounded.

    Returns:
        A NumPy array with one row per input text (in input order) holding
        the softmax of the model logits.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)

    if not texts:
        # Nothing to tokenize; return a correctly shaped empty result.
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=64)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = model(**inputs).logits
        chunks.append(F.softmax(logits, dim=-1).cpu().numpy())

    return np.concatenate(chunks, axis=0)

## Test Samples

In [None]:
# Hand-written example sentences; each one is classified and then explained
# by every interpretability method in the "Run Explanations" cell.
test_samples = [
    "This movie is absolutely fantastic! I loved every moment of it.",
    "Terrible experience, waste of time and money. Very disappointed.",
    "The product is okay, nothing special but does the job.",
    "I'm extremely happy with this purchase! Highly recommend!",
    "This is the worst service I've ever encountered. Absolutely horrible.",
    "Not bad, could be better but acceptable for the price.",
    "Outstanding quality! Exceeded all my expectations.",
    "Mediocre at best, wouldn't buy again.",
    "I feel neutral about this, it's just average.",
    "Brilliant work! Truly exceptional and inspiring."
]

## LIME Implementation

In [None]:
class LIMEExplainer:
    """Word-level LIME-style explainer for a text classifier.

    Random subsets of the input words are dropped, the classifier is queried
    on every perturbed text, and a weighted ridge regression is fitted from
    the keep/drop masks to the predicted probability of the target class.
    The regression coefficients are reported as word importances.

    Args:
        predict_fn: Callable that takes a list of strings and returns an
            array-like of shape ``(len(texts), num_classes)``.
        num_samples: Number of perturbed texts generated per explanation.
        kernel_width: Width of the exponential kernel that down-weights
            perturbations with many removed words.
        random_state: Optional seed. When given, every ``explain`` call draws
            its masks from a fresh ``RandomState`` seeded with it, so results
            are reproducible. When ``None`` the global NumPy random state is
            used.
    """

    def __init__(self, predict_fn, num_samples=1000, kernel_width=25, random_state=None):
        self.predict_fn = predict_fn
        self.num_samples = num_samples
        self.kernel_width = kernel_width
        self.random_state = random_state

    def tokenize(self, text):
        """Split ``text`` on whitespace into the words that get perturbed."""
        return text.split()

    def kernel_fn(self, distances):
        """Map distances (number of removed words) to sample weights in (0, 1]."""
        return np.sqrt(np.exp(-(distances ** 2) / self.kernel_width ** 2))

    def explain(self, text, target_class=None):
        """Explain the classifier's output for ``text``.

        Args:
            text: The input string to explain.
            target_class: Index of the class to explain. Defaults to the
                class with the highest probability for ``text``.

        Returns:
            A tuple ``(importance, target_class)`` where ``importance`` is a
            list of ``(word, coefficient)`` pairs sorted by decreasing
            absolute coefficient. The list is empty when ``text`` contains
            no words.
        """
        words = self.tokenize(text)
        n_words = len(words)

        if target_class is None:
            target_class = np.argmax(np.asarray(self.predict_fn([text]))[0])

        if n_words == 0:
            # A regression over zero features cannot be fitted.
            return [], target_class

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # Each row is a keep-mask: 1 keeps the word, 0 drops it.
        perturbations = rng.binomial(1, 0.5, (self.num_samples, n_words))
        perturbed_texts = [
            ' '.join(word for word, keep in zip(words, mask) if keep)
            for mask in perturbations
        ]

        predictions = np.asarray(self.predict_fn(perturbed_texts))
        y = predictions[:, target_class]

        # Distance to the original text = how many words were removed.
        distances = np.sum(1 - perturbations, axis=1)
        weights = self.kernel_fn(distances)

        ridge = Ridge(alpha=1.0)
        ridge.fit(perturbations, y, sample_weight=weights)

        importance = sorted(zip(words, ridge.coef_), key=lambda pair: abs(pair[1]), reverse=True)
        return importance, target_class

## SHAP Implementation

In [None]:
class SHAPExplainer:
    """Word-level Shapley value estimator based on permutation sampling.

    For every sampled permutation of the words, the words are added to an
    initially empty text one at a time (keeping their original order in the
    text), and each word is credited with the change in the target-class
    score caused by adding it. Averaging these marginal contributions over
    permutations is an unbiased estimate of the Shapley values, and for every
    permutation the contributions sum to ``f(all words) - f(empty text)``.

    Args:
        predict_fn: Callable that takes a list of strings and returns an
            array-like of shape ``(len(texts), num_classes)``.
        num_samples: Number of random permutations averaged per explanation.
        random_state: Optional seed. When given, every ``explain`` call draws
            its permutations from a fresh ``RandomState`` seeded with it.
            When ``None`` the global NumPy random state is used.
    """

    def __init__(self, predict_fn, num_samples=500, random_state=None):
        self.predict_fn = predict_fn
        self.num_samples = num_samples
        self.random_state = random_state

    def shapley_kernel(self, n, s):
        """Return the Kernel SHAP regression weight for a coalition of size ``s``.

        The empty and full coalitions get a very large weight (1e10). These
        weights belong to the weighted-regression formulation of Kernel SHAP;
        multiplying sampled marginal contributions by them lets the rare
        ``s == 0`` draws dominate the result, so ``explain`` does not use
        this method. It is kept for backward compatibility.
        """
        if s == 0 or s == n:
            return 1e10
        return (n - 1) / (comb(n, s) * s * (n - s))

    def explain(self, text, target_class=None):
        """Estimate per-word Shapley values for ``text``.

        Args:
            text: The input string to explain; it is split on whitespace.
            target_class: Index of the class to explain. Defaults to the
                class with the highest score for ``text``.

        Returns:
            A tuple ``(importance, target_class)`` where ``importance`` is a
            list of ``(word, shapley_value)`` pairs sorted by decreasing
            absolute value. The list is empty when ``text`` contains no
            words.
        """
        words = text.split()
        n_words = len(words)

        if target_class is None:
            target_class = np.argmax(np.asarray(self.predict_fn([text]))[0])

        if n_words == 0:
            return [], target_class

        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)

        # The empty and the full coalition are the same for every
        # permutation, so score them once.
        endpoints = np.asarray(self.predict_fn(['', ' '.join(words)]))
        empty_pred = endpoints[0, target_class]
        full_pred = endpoints[1, target_class]

        shap_values = np.zeros(n_words)
        for _ in range(self.num_samples):
            order = rng.permutation(n_words)

            # Build the intermediate coalitions obtained by adding words in
            # permutation order; the last step (all words) is already known.
            mask = np.zeros(n_words, dtype=bool)
            coalition_texts = []
            for word_idx in order[:-1]:
                mask[word_idx] = True
                coalition_texts.append(' '.join(w for w, keep in zip(words, mask) if keep))

            scores = np.empty(n_words + 1)
            scores[0] = empty_pred
            scores[-1] = full_pred
            if coalition_texts:
                # One batched call per permutation instead of one call per text.
                batch = np.asarray(self.predict_fn(coalition_texts))
                scores[1:-1] = batch[:, target_class]

            # scores[k + 1] - scores[k] is the marginal contribution of the
            # k-th word added, i.e. of words[order[k]].
            shap_values[order] += np.diff(scores)

        shap_values /= self.num_samples
        importance = sorted(zip(words, shap_values), key=lambda pair: abs(pair[1]), reverse=True)

        return importance, target_class

## Layer-wise Relevance Propagation (LRP) Implementation

In [None]:
class LRPExplainer:
    """Token-level relevance scores from the gradient of the target logit.

    The relevance of a token is the sum over embedding dimensions of
    ``embedding * d(logit) / d(embedding)`` (Gradient x Input), normalised so
    that the largest absolute relevance in the sequence is 1. Scores are
    reported per tokenizer token, not per whitespace-separated word.

    Args:
        model: Model exposing ``get_input_embeddings()`` and accepting
            ``inputs_embeds`` / ``attention_mask`` keyword arguments, with
            the class scores available as ``.logits`` on its output.
        tokenizer: Tokenizer matching ``model``.
        device: Device on which the input tensors are placed.
        epsilon: Relevance is only normalised when its maximum absolute
            value exceeds this threshold, which avoids dividing by ~0.
    """

    # Tokens that carry no word content and are dropped from the output.
    _IGNORED_TOKENS = ('<s>', '</s>', '<pad>', '', 'Ċ')

    def __init__(self, model, tokenizer, device, epsilon=1e-10):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.epsilon = epsilon

    def clean_token(self, token):
        """Strip sub-word markers from ``token``.

        Returns:
            The cleaned token text, or ``None`` for special tokens and for
            tokens that are empty after cleaning.
        """
        token = token.replace('Ġ', '').replace('</w>', '')
        token = token.strip('▁')
        if token in self._IGNORED_TOKENS:
            return None
        return token

    def explain(self, text, target_class=None):
        """Compute normalised token relevances for ``text``.

        Args:
            text: The input string to explain (truncated to 64 tokens).
            target_class: Index of the class whose logit is explained.
                Defaults to the class with the highest logit.

        Returns:
            A tuple ``(importance, target_class)`` where ``importance`` is a
            list of ``(token, relevance)`` pairs sorted by decreasing
            absolute relevance, with special tokens removed.
        """
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=64)
        input_ids = inputs['input_ids'].to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)

        # Detach so the embeddings become a leaf tensor we can differentiate
        # with respect to, independent of the embedding weights.
        embeddings = self.model.get_input_embeddings()(input_ids).detach()
        embeddings.requires_grad_(True)

        outputs = self.model(inputs_embeds=embeddings, attention_mask=attention_mask)

        if target_class is None:
            target_class = outputs.logits[0].argmax().item()

        prediction_score = outputs.logits[0, target_class]

        # autograd.grad returns only the gradient we need and, unlike
        # backward(), does not leave .grad tensors on every model parameter.
        (embedding_grad,) = torch.autograd.grad(prediction_score, embeddings)

        relevance = (embeddings * embedding_grad).sum(dim=-1)
        relevance = relevance.squeeze(0).detach().cpu().numpy()

        max_abs = np.abs(relevance).max()
        if max_abs > self.epsilon:
            relevance = relevance / max_abs

        tokens = self.tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
        importance = []
        for token, rel_score in zip(tokens, relevance):
            cleaned = self.clean_token(token)
            if cleaned:
                importance.append((cleaned, float(rel_score)))

        importance.sort(key=lambda pair: abs(pair[1]), reverse=True)
        return importance, target_class

## Run Explanations

In [None]:
lime_explainer = LIMEExplainer(predict_sentiment, num_samples=1000)
shap_explainer = SHAPExplainer(predict_sentiment, num_samples=500)
lrp_explainer = LRPExplainer(model, tokenizer, device)

# One dict per sample: prediction, confidence and the top-5 features
# reported by each explanation method.
results = []

for idx, text in enumerate(test_samples):
    print(f"\nSample {idx+1}: {text}")

    probs = predict_sentiment(text)[0]
    pred_class = int(np.argmax(probs))
    pred_label = id2label[pred_class]

    print(f"Prediction: {pred_label} ({probs[pred_class]:.3f})")

    # Pass the predicted class explicitly so all three methods explain the
    # same class and do not each re-run the model just to find it.
    lime_features, _ = lime_explainer.explain(text, target_class=pred_class)
    shap_features, _ = shap_explainer.explain(text, target_class=pred_class)
    lrp_features, _ = lrp_explainer.explain(text, target_class=pred_class)

    results.append({
        'sample_id': idx + 1,
        'text': text,
        'prediction': pred_label,
        'confidence': float(probs[pred_class]),
        'lime_features': lime_features[:5],
        'shap_features': shap_features[:5],
        'lrp_features': lrp_features[:5]
    })

## Visualization

In [None]:
def visualize_comparison(sample_result):
    """Plot the LIME, SHAP and LRP features of one sample side by side.

    Args:
        sample_result: Dict with the keys ``'sample_id'``, ``'text'``,
            ``'prediction'``, ``'confidence'`` and, per method,
            ``'lime_features'`` / ``'shap_features'`` / ``'lrp_features'``,
            each a list of ``(word, score)`` pairs.

    Each method gets one horizontal bar chart with the highest-magnitude
    feature at the top. LIME and SHAP bars are green for positive and red
    for non-positive scores; LRP bars are drawn in a single colour. A panel
    whose feature list is empty shows a placeholder instead of raising.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    panels = [('lime_features', 'LIME'), ('shap_features', 'SHAP'), ('lrp_features', 'LRP')]

    for (method_key, method_name), ax in zip(panels, axes):
        ax.set_title(method_name, fontsize=14, fontweight='bold', pad=10)

        features = sample_result[method_key]
        if not features:
            # zip(*[]) cannot be unpacked into two names; show a notice instead.
            ax.text(0.5, 0.5, 'No features', ha='center', va='center', transform=ax.transAxes)
            ax.set_xticks([])
            ax.set_yticks([])
            continue

        # barh draws index 0 at the bottom, so reverse the descending sort
        # to put the most important feature at the top.
        ranked = sorted(features, key=lambda pair: abs(pair[1]), reverse=True)[::-1]
        words = [word for word, _ in ranked]
        scores = [score for _, score in ranked]
        positions = range(len(words))

        signed = method_key != 'lrp_features'
        if signed:
            colors = ['#2ecc71' if s > 0 else '#e74c3c' for s in scores]
        else:
            colors = ['#3498db'] * len(scores)

        ax.barh(positions, scores, color=colors, alpha=0.7, edgecolor='black', linewidth=0.5)
        ax.set_yticks(positions)
        ax.set_yticklabels(words, fontsize=12, fontweight='bold')

        if signed:
            ax.axvline(x=0, color='black', linewidth=1.5, linestyle='--')

        ax.grid(axis='x', alpha=0.3, linestyle=':')
        ax.set_xlabel('Importance Score', fontsize=11)

    full_text = sample_result['text']
    text_preview = full_text[:80] + '...' if len(full_text) > 80 else full_text
    title = f"Sample {sample_result['sample_id']}: \"{text_preview}\"\nPrediction: {sample_result['prediction']} (Confidence: {sample_result['confidence']:.1%})"
    fig.suptitle(title, fontsize=12, fontweight='bold', y=1.0)
    plt.tight_layout()
    plt.show()

# Draw one comparison figure per explained sample.
for sample_result in results:
    visualize_comparison(sample_result)

## Summary Analysis

In [None]:
def _normalize_word(word):
    """Lower-case ``word`` and strip surrounding sentence punctuation."""
    return word.lower().strip('.,!?')


def _feature_words(features):
    """Return the set of normalised, non-empty words in a feature list."""
    return {_normalize_word(word) for word, _ in features} - {''}


fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 2, hspace=0.35, wspace=0.3)

# --- Panel 1: how many top features each pair of methods shares ---------
ax1 = fig.add_subplot(gs[0, :])
agreement_matrix = []
for result in results:
    # LIME/SHAP words keep attached punctuation ("fantastic!") while LRP
    # tokens do not, so compare normalised words; the frequency panel below
    # uses the same normalisation.
    lime_words = _feature_words(result['lime_features'])
    shap_words = _feature_words(result['shap_features'])
    lrp_words = _feature_words(result['lrp_features'])

    agreement_matrix.append([
        len(lime_words & shap_words),
        len(lime_words & lrp_words),
        len(shap_words & lrp_words),
        len(lime_words & shap_words & lrp_words)
    ])

# Rows are samples, columns are the four method combinations.
agreement_matrix = np.array(agreement_matrix)
im1 = ax1.imshow(agreement_matrix.T, cmap='YlGn', aspect='auto', vmin=0, vmax=5)
ax1.set_yticks(range(4))
ax1.set_yticklabels(['L-S', 'L-LRP', 'S-LRP', 'All 3'], fontsize=11, fontweight='bold')
ax1.set_xticks(range(len(results)))
ax1.set_xticklabels([f"S{r['sample_id']}" for r in results], fontsize=10)
ax1.set_xlabel('Sample', fontsize=12, fontweight='bold')
ax1.set_title('Feature Agreement Between Methods', fontsize=14, fontweight='bold', pad=15)

# The image shows the transposed matrix: x is the sample, y the combination.
for (sample_pos, pair_pos), shared in np.ndenumerate(agreement_matrix):
    ax1.text(sample_pos, pair_pos, int(shared), ha="center", va="center", color="black", fontsize=10, fontweight='bold')

plt.colorbar(im1, ax=ax1, label='Shared Features')

# --- Panel 2: prediction confidence per sample ---------------------------
ax2 = fig.add_subplot(gs[1, 0])
confidences = [r['confidence'] for r in results]
predictions = [r['prediction'] for r in results]
colors = ['#2ecc71' if p == 'Positive' else '#e74c3c' if p == 'Negative' else '#f39c12' for p in predictions]

ax2.bar(range(len(results)), confidences, color=colors, alpha=0.7, edgecolor='black')
ax2.set_ylim(0, 1.1)
ax2.set_xlabel('Sample', fontsize=12, fontweight='bold')
ax2.set_ylabel('Confidence', fontsize=12, fontweight='bold')
ax2.set_title('Prediction Confidence', fontsize=14, fontweight='bold')
ax2.set_xticks(range(len(results)))
ax2.set_xticklabels([f"S{r['sample_id']}" for r in results])
ax2.grid(axis='y', alpha=0.3)

# --- Panel 3: agreement averaged over all samples ------------------------
ax3 = fig.add_subplot(gs[1, 1])
avg_agreements = agreement_matrix.mean(axis=0)
pairs = ['L-S', 'L-LRP', 'S-LRP', 'All 3']

ax3.bar(pairs, avg_agreements, color=['#3498db', '#9b59b6', '#e67e22', '#e74c3c'], alpha=0.7, edgecolor='black')
ax3.set_ylabel('Avg Shared Features', fontsize=12, fontweight='bold')
ax3.set_title('Average Agreement', fontsize=14, fontweight='bold')
ax3.set_ylim(0, 5)
ax3.grid(axis='y', alpha=0.3)

for i, val in enumerate(avg_agreements):
    ax3.text(i, val + 0.1, f'{val:.1f}', ha='center', fontsize=11, fontweight='bold')

# --- Panel 4: words that most often appear in any method's top 3 ---------
ax4 = fig.add_subplot(gs[2, :])
all_features = {}
for result in results:
    for word, _ in (result['lime_features'][:3] + result['shap_features'][:3] + result['lrp_features'][:3]):
        word = _normalize_word(word)
        if word:
            all_features[word] = all_features.get(word, 0) + 1

top_features = sorted(all_features.items(), key=lambda x: x[1], reverse=True)[:20]
words, counts = zip(*top_features)
colors_freq = plt.cm.viridis(np.linspace(0.3, 0.9, len(words)))

ax4.barh(range(len(words)), counts, color=colors_freq, alpha=0.8, edgecolor='black')
ax4.set_yticks(range(len(words)))
ax4.set_yticklabels(words, fontsize=12, fontweight='bold')
ax4.set_xlabel('Frequency', fontsize=12, fontweight='bold')
ax4.set_title('Most Influential Features', fontsize=14, fontweight='bold')
# Put the most frequent word at the top of the chart.
ax4.invert_yaxis()
ax4.grid(axis='x', alpha=0.3)

for i, count in enumerate(counts):
    ax4.text(count + 0.3, i, f'{int(count)}', va='center', fontsize=10, fontweight='bold')

plt.suptitle('Summary Analysis', fontsize=16, fontweight='bold', y=0.995)
plt.show()

print(f"Average confidence: {np.mean(confidences):.1%}")
print(f"Average agreement (all 3): {avg_agreements[3]:.2f} features")
print(f"Most influential word: '{top_features[0][0]}' ({top_features[0][1]} appearances)")