### Detect Bias in Data
**Description**: Use statistical tests to detect bias in data, which can affect AI model fairness.

In [None]:
# Write your code from here
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_curve, auc

def _compute_bias_metrics(privileged_outcomes, unprivileged_outcomes):
    """Compute fairness metrics and significance tests for two outcome samples.

    Parameters:
    privileged_outcomes (pd.Series): Target values of the privileged group
    unprivileged_outcomes (pd.Series): Target values of the unprivileged group

    Returns:
    dict: Base rates, statistical parity difference, disparate impact ratio,
          the t-test / chi-square / KS p-values and the equal opportunity proxy.
    """
    privileged_rate = privileged_outcomes.mean()
    unprivileged_rate = unprivileged_outcomes.mean()

    # The ratio is undefined when the privileged group has a zero rate.
    disparate_impact = (
        unprivileged_rate / privileged_rate if privileged_rate != 0 else np.nan
    )

    # Welch's t-test (equal_var=False) does not assume equal group variances.
    _, t_p = stats.ttest_ind(
        privileged_outcomes, unprivileged_outcomes, equal_var=False
    )

    # Build the contingency table from the two compared groups only, so that
    # other categories of the protected attribute cannot drive the result.
    group_labels = np.repeat(
        ["privileged", "unprivileged"],
        [len(privileged_outcomes), len(unprivileged_outcomes)],
    )
    pooled_outcomes = np.concatenate(
        [privileged_outcomes.to_numpy(), unprivileged_outcomes.to_numpy()]
    )
    contingency_table = pd.crosstab(pooled_outcomes, group_labels)
    _, chi_p, _, _ = stats.chi2_contingency(contingency_table)

    _, ks_p = stats.ks_2samp(privileged_outcomes, unprivileged_outcomes)

    # No model predictions are available here, so the observed target is used
    # as a stand-in. NOTE: restricted to rows where the target equals 1, each
    # group's mean is 1, so this proxy is 0 for a 0/1 target (NaN when a
    # group has no positive rows).
    equal_opp_diff = (
        privileged_outcomes[privileged_outcomes == 1].mean()
        - unprivileged_outcomes[unprivileged_outcomes == 1].mean()
    )

    return {
        'privileged_rate': privileged_rate,
        'unprivileged_rate': unprivileged_rate,
        'statistical_parity': privileged_rate - unprivileged_rate,
        'disparate_impact': disparate_impact,
        't_p': t_p,
        'chi_p': chi_p,
        'ks_p': ks_p,
        'equal_opp_diff': equal_opp_diff,
    }


def _plot_bias_diagnostics(data, target_col, protected_col, privileged_group,
                           unprivileged_group):
    """Draw the 2x3 grid of diagnostic plots and show the figure."""
    compared_groups = [privileged_group, unprivileged_group]
    plt.figure(figsize=(18, 12))

    # Distribution Plot
    plt.subplot(2, 3, 1)
    sns.histplot(
        data=data,
        x=target_col,
        hue=protected_col,
        element="step",
        stat="density",
        common_norm=False,
        bins=20
    )
    plt.title('Outcome Distribution by Group')

    # Probability Calibration Plot: each row's "prediction" is the mean
    # outcome of its own group.
    plt.subplot(2, 3, 2)
    prob_true, prob_pred = calibration_curve(
        data[target_col],
        data.groupby(protected_col)[target_col].transform('mean'),
        n_bins=10
    )
    plt.plot(prob_pred, prob_true, marker='o')
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.title('Calibration Plot')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Actual Probability')

    # Count Plot
    plt.subplot(2, 3, 3)
    sns.countplot(x=protected_col, hue=target_col, data=data)
    plt.title('Count by Group and Outcome')

    # Mean Outcome Plot
    plt.subplot(2, 3, 4)
    data.groupby(protected_col)[target_col].mean().plot(kind='bar')
    plt.title('Mean Outcome by Group')
    plt.ylabel('Probability')

    # Cumulative Distribution Plot
    plt.subplot(2, 3, 5)
    for group in compared_groups:
        sns.ecdfplot(
            data=data[data[protected_col] == group],
            x=target_col,
            label=group
        )
    plt.title('Cumulative Distribution')
    plt.legend()

    # ROC Curve: the score is the group's own mean outcome, i.e. a constant
    # within each group, because no classifier scores are available.
    plt.subplot(2, 3, 6)
    for group in compared_groups:
        group_data = data[data[protected_col] == group]
        fpr, tpr, _ = roc_curve(
            group_data[target_col],
            group_data.groupby(protected_col)[target_col].transform('mean')
        )
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{group} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.title('ROC Curves by Group')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()

    plt.tight_layout()
    plt.show()


def _print_bias_report(metrics, privileged_group, unprivileged_group):
    """Print the metric report followed by the threshold-based assessment."""
    print("\n=== Bias Detection Report ===")
    print("\nBase Rates:")
    print(f"- Privileged group ({privileged_group}): {metrics['privileged_rate']:.2%}")
    print(f"- Unprivileged group ({unprivileged_group}): {metrics['unprivileged_rate']:.2%}")

    print("\nStatistical Tests:")
    print(f"1. Statistical Parity Difference: {metrics['statistical_parity']:.4f}")
    print("   (Ideal: 0, Fair if within ±0.1)")
    print(f"2. Disparate Impact Ratio: {metrics['disparate_impact']:.4f}")
    print("   (Ideal: 1, Fair if 0.8-1.25)")
    print(f"3. T-test p-value: {metrics['t_p']:.4f}")
    print("   (Significant difference if p < 0.05)")
    print(f"4. Chi-square p-value: {metrics['chi_p']:.4f}")
    print("   (Dependence if p < 0.05)")
    print(f"5. KS Test p-value: {metrics['ks_p']:.4f}")
    print("   (Different distributions if p < 0.05)")
    print(f"6. Equal Opportunity Difference: {metrics['equal_opp_diff']:.4f}")
    print("   (Ideal: 0, Fair if within ±0.1)")

    # Interpretation
    print("\n=== Bias Assessment ===")
    disparate_impact = metrics['disparate_impact']
    if np.isnan(disparate_impact):
        # NaN compares False against both thresholds, so say so explicitly
        # instead of silently treating the ratio as fair.
        print("Note: disparate impact ratio is undefined (privileged group rate is 0)")

    findings = [
        (abs(metrics['statistical_parity']) > 0.1,
         "⚠️ Significant statistical parity difference"),
        (disparate_impact < 0.8 or disparate_impact > 1.25,
         "⚠️ Disparate impact detected"),
        (metrics['t_p'] < 0.05,
         "⚠️ Significant mean difference (t-test)"),
        (metrics['chi_p'] < 0.05,
         "⚠️ Significant association between group and outcome (chi-square)"),
        (metrics['ks_p'] < 0.05,
         "⚠️ Different outcome distributions (KS test)"),
        (abs(metrics['equal_opp_diff']) > 0.1,
         "⚠️ Equal opportunity difference detected"),
    ]
    bias_detected = False
    for triggered, message in findings:
        if triggered:
            print(message)
            bias_detected = True

    if not bias_detected:
        print("✅ No significant bias detected between groups")


def detect_bias(data, target_col, protected_col, privileged_group, unprivileged_group):
    """
    Comprehensive bias detection in datasets with statistical tests and fairness metrics

    Compares the target variable between two groups of a protected attribute,
    shows six diagnostic plots and prints a report with a threshold-based
    assessment. Nothing is returned.

    Parameters:
    data (pd.DataFrame): Input dataset
    target_col (str): Name of target variable column (the base rates and the
        equal opportunity proxy assume a numeric 0/1 outcome)
    protected_col (str): Name of protected attribute column
    privileged_group: Value representing privileged group
    unprivileged_group: Value representing unprivileged group

    Raises:
    ValueError: If either group has no rows in ``data``.
    """
    # Select each group's outcomes once; every metric and test reuses them.
    privileged_outcomes = data.loc[data[protected_col] == privileged_group, target_col]
    unprivileged_outcomes = data.loc[data[protected_col] == unprivileged_group, target_col]

    # Fail early with a clear message rather than deep inside a SciPy test.
    for group, outcomes in ((privileged_group, privileged_outcomes),
                            (unprivileged_group, unprivileged_outcomes)):
        if outcomes.empty:
            raise ValueError(
                f"No rows found where {protected_col!r} == {group!r}"
            )

    metrics = _compute_bias_metrics(privileged_outcomes, unprivileged_outcomes)
    _plot_bias_diagnostics(
        data, target_col, protected_col, privileged_group, unprivileged_group
    )
    _print_bias_report(metrics, privileged_group, unprivileged_group)

# Demo: build a seeded synthetic loan dataset and run the bias report on it
np.random.seed(42)
data_size = 2000

# Draw the feature columns one by one, in this fixed order, so the seeded
# random stream is consumed the same way on every run.
age_values = np.random.normal(45, 15, data_size).clip(18, 80)
gender_values = np.random.choice(['Male', 'Female'], data_size, p=[0.6, 0.4])
race_values = np.random.choice(['A', 'B', 'C'], data_size, p=[0.5, 0.3, 0.2])
income_values = np.random.lognormal(10.5, 0.4, data_size)
credit_values = np.random.normal(650, 100, data_size).clip(300, 850)

data = pd.DataFrame({
    'age': age_values,
    'gender': gender_values,
    'race': race_values,
    'income': income_values,
    'credit_score': credit_values,
})

# Biased approval rule: women need a higher income than men to land in the
# high-approval bucket.
male_qualifies = (data['income'] > 35000) & (data['gender'] == 'Male')
female_qualifies = (data['income'] > 45000) & (data['gender'] == 'Female')
high_chance_draw = np.random.choice([0, 1], data_size, p=[0.15, 0.85])
low_chance_draw = np.random.choice([0, 1], data_size, p=[0.65, 0.35])
data['approved'] = np.where(
    male_qualifies | female_qualifies, high_chance_draw, low_chance_draw
)

# Run the report once per protected attribute (gender, then race A vs C)
analyses = (
    ("\nAnalyzing Gender Bias:", 'gender', 'Male', 'Female'),
    ("\nAnalyzing Race Bias (Group A vs C):", 'race', 'A', 'C'),
)
for heading, attribute, privileged, unprivileged in analyses:
    print(heading)
    detect_bias(data,
                target_col='approved',
                protected_col=attribute,
                privileged_group=privileged,
                unprivileged_group=unprivileged)

SyntaxError: '(' was never closed (3585584294.py, line 52)