In [None]:
# ethics_review.ipynb

# -------------------------------
# 1. Setup
# -------------------------------
!pip install transformers datasets -q

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Choose the compute device up front: CUDA when torch reports it, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint under audit; swap in your fine-tuned model name here.
# NOTE(review): this name looks like a base (not fine-tuned) checkpoint, in
# which case the sequence-classification head would be freshly initialised and
# the scores below would not be meaningful toxicity estimates — confirm.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # switch to evaluation mode for scoring
model.to(device)

# -------------------------------
# 2. Load & Analyze Dataset
# -------------------------------
# Audit sample: the first 1000 rows of the validation split.
dataset = load_dataset(
    "civil_comments",
    split="validation[:1000]",
)
# Keep the "toxicity" column at hand alongside the dataset.
labels = dataset["toxicity"]

def classify(texts):
    """Score texts with the module-level ``tokenizer`` and ``model``.

    Args:
        texts: The texts to score; the caller in this file passes a list of
            strings.

    Returns:
        A NumPy array holding, for each input, the softmax probability of
        class index 1.
    """
    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = encoded.to(device)

    # Scoring only, so no gradients are tracked.
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1)

    index_one_probs = class_probs[:, 1]
    return index_one_probs.cpu().numpy()

# Score the split in mini-batches: `classify` already accepts a list of texts,
# so one forward pass per batch replaces one model call per row.
# NOTE(review): batch padding can shift scores at floating-point rounding
# level compared with scoring each row on its own.
dataset = dataset.map(
    lambda batch: {"model_score": classify(batch["text"])},
    batched=True,
    batch_size=32,
)
print("✅ Model scores added")

# -------------------------------
# 3. Bias Audit by Demographic
# -------------------------------
import pandas as pd

# Average model score over the rows flagged (> 0.5) for each identity column.
df = dataset.to_pandas()
sensitive_groups = ["female", "male", "black", "white", "LGBTQ", "muslim", "christian"]

# NOTE(review): the audit needs one column per identity group. The
# `civil_comments` release on the Hub appears to ship only the text and the
# toxicity sub-type labels — confirm the schema, or load a release that
# carries identity annotations.
bias_scores = {}
for group in sensitive_groups:
    if group not in df.columns:
        # Report and skip instead of failing with a bare KeyError.
        print(f"⚠️ Skipping '{group}': no such column in the dataset")
        continue
    mask = df[group] > 0.5
    if not mask.any():
        # The mean over zero rows is NaN; say so rather than leave a silent gap.
        print(f"⚠️ No rows with {group} > 0.5; average for '{group}' is NaN")
    bias_scores[group] = df.loc[mask, "model_score"].mean()

if not bias_scores:
    raise ValueError(
        "None of the sensitive-group columns "
        f"{sensitive_groups} exist in the dataset; cannot run the bias audit."
    )

bias_df = pd.DataFrame.from_dict(bias_scores, orient="index", columns=["Avg Toxicity Score"])
bias_df.sort_values("Avg Toxicity Score", ascending=False, inplace=True)

# -------------------------------
# 4. Visualize Bias Scores
# -------------------------------
# One bar per group, in the order the rows appear in `bias_df`.
ax = sns.barplot(data=bias_df, x=bias_df.index, y="Avg Toxicity Score")
ax.set_title("⚠️ Bias Detection – Avg Toxicity Score by Demographic")
plt.xticks(rotation=45)  # slant the group names so they do not overlap
plt.tight_layout()
plt.show()

# -------------------------------
# 5. Mitigation Plan (Markdown)
# -------------------------------
from IPython.display import Markdown

# Static review summary rendered as Markdown.
# NOTE(review): this text is fixed; it is not derived from the scores computed
# above, so check it against the actual audit results before sharing.
summary_markdown = """
### ✅ Ethical Risk Review Summary

**Bias Detected**: The model shows higher toxicity scores for certain groups.  
**Action**:
- Fine-tune on balanced, inclusive datasets (e.g., Jigsaw Demographic Balancer)
- Apply post-processing thresholds per demographic
- Flag high-risk responses for human review

**Fairness Goals**:
- Maintain demographic parity in false positive rates
- Use explainability (e.g., SHAP, LIME) for edge-case analysis
- Document data sources, annotation criteria

**Harm Prevention**:
- Model runs behind a safety filter
- No storage of personal data
- Explainability UI offered for human review

"""
Markdown(summary_markdown)
