# Counterfactual Fairness Analysis for Generative AI Outputs

This notebook implements tools for analyzing and optimizing fairness in generative AI
outputs using counterfactual analysis. It helps identify potential biases by examining
how outputs change when protected attributes are varied while holding other factors constant.

## Installation
### Prerequisites
Ensure you have Python 3.8+ installed. The project dependencies are listed in requirements.txt:
```bash
pip install -r requirements.txt
```

In [None]:
import hashlib
from dataclasses import dataclass
from typing import Callable
from typing import List, Dict, Tuple, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
@dataclass
class CounterfactualConfig:
    """Configuration for counterfactual generation and analysis.

    Attributes:
        protected_attributes: Keys of the input dictionary whose values are
            varied to build counterfactuals.
        similarity_threshold: Minimum cosine similarity (inclusive) a
            counterfactual must have with the original input to be kept.
        num_counterfactuals: Maximum number of counterfactuals returned by
            the generator; the result list is truncated to this length.
        random_seed: Seed handed to ``np.random.seed`` when a generator is
            constructed from this configuration.
    """
    protected_attributes: List[str]
    similarity_threshold: float = 0.8
    num_counterfactuals: int = 5
    random_seed: int = 42

In [None]:
class CounterfactualGenerator:
    """Generates counterfactual examples for fairness analysis.

    A counterfactual is a shallow copy of the original input in which exactly
    one protected attribute has been replaced by a *different* value.
    """

    # Candidate values per protected attribute. This is simplified example
    # data - in practice, you would want a more comprehensive approach to
    # generating meaningful variations.
    _ATTRIBUTE_VALUES = {
        "gender": ["male", "female", "non-binary"],
        "age": ["young", "middle-aged", "senior"],
        "ethnicity": ["Asian", "Black", "Hispanic", "White", "Other"],
    }

    def __init__(self, config: "CounterfactualConfig"):
        """
        Initialize the counterfactual generator.

        Args:
            config: Configuration object containing parameters
        """
        self.config = config
        # Seeds NumPy's *global* RNG. Nothing in this class draws random
        # numbers; the call is kept so existing callers see the same
        # side effect as before.
        np.random.seed(config.random_seed)

    def generate_counterfactuals(
        self,
        original_input: Dict,
        embeddings: np.ndarray
    ) -> List[Dict]:
        """
        Generate counterfactual examples by varying protected attributes.

        Attributes are processed in the order given by
        ``config.protected_attributes``; the result is truncated to
        ``config.num_counterfactuals`` entries, so earlier attributes are
        favoured when more candidates pass the similarity filter.

        Args:
            original_input: Original input features
            embeddings: Pre-computed embeddings; forwarded to the similarity
                check, which currently does not read them

        Returns:
            List of counterfactual examples, each differing from
            ``original_input`` in exactly one protected attribute

        Raises:
            KeyError: If a configured protected attribute is missing from
                ``original_input``
        """
        counterfactuals = []

        for attr in self.config.protected_attributes:
            variations = self._generate_attribute_variations(
                original_input[attr],
                attr
            )

            for variation in variations:
                counterfactual = original_input.copy()
                counterfactual[attr] = variation

                # Only keep counterfactuals that are sufficiently similar
                if self._check_similarity(
                    original_input,
                    counterfactual,
                    embeddings
                ):
                    counterfactuals.append(counterfactual)

        return counterfactuals[:self.config.num_counterfactuals]

    def _generate_attribute_variations(
        self,
        original_value: str,
        attribute: str
    ) -> List[str]:
        """
        Return the alternative values for a protected attribute.

        The original value is excluded: a "variation" equal to the original
        would yield a counterfactual identical to the input. Attributes
        without a known value set therefore produce no variations.
        """
        candidates = self._ATTRIBUTE_VALUES.get(attribute, [])
        return [value for value in candidates if value != original_value]

    def _check_similarity(
        self,
        original: Dict,
        counterfactual: Dict,
        embeddings: np.ndarray
    ) -> bool:
        """
        Check if counterfactual is sufficiently similar to original.

        Both dictionaries are converted to vectors and compared with cosine
        similarity against ``config.similarity_threshold`` (inclusive).
        ``embeddings`` is accepted for interface stability but not used.
        """
        # Convert inputs to feature vectors (simplified)
        orig_vec = self._dict_to_vector(original)
        cf_vec = self._dict_to_vector(counterfactual)

        similarity = cosine_similarity(
            orig_vec.reshape(1, -1),
            cf_vec.reshape(1, -1)
        )[0][0]

        # Cast so callers get a plain bool rather than numpy.bool_
        return bool(similarity >= self.config.similarity_threshold)

    def _dict_to_vector(self, input_dict: Dict) -> np.ndarray:
        """
        Convert input dictionary to feature vector.

        Each value is mapped to a signed 64-bit integer derived from a
        digest of its string form. Unlike the built-in ``hash()``, which is
        salted per process for strings, the digest is identical across runs,
        so the similarity filter gives reproducible results.
        """
        # This is a simplified conversion - you would need to implement
        # proper feature engineering in practice
        return np.array([self._stable_hash(v) for v in input_dict.values()])

    @staticmethod
    def _stable_hash(value) -> int:
        """Return a process-independent signed 64-bit hash of ``str(value)``."""
        digest = hashlib.blake2b(
            str(value).encode("utf-8"),
            digest_size=8
        ).digest()
        return int.from_bytes(digest, "big", signed=True)

In [None]:
class FairnessAnalyzer:
    """Analyzes fairness metrics across counterfactuals.

    All metrics are simple word-level heuristics. When no counterfactual
    outputs are supplied the metrics are undefined and reported as NaN
    instead of raising ``ZeroDivisionError``.
    """

    def __init__(self):
        """Initialize the fairness analyzer."""
        # Result of the most recent analyze_outputs() call
        self.metrics = {}

    def analyze_outputs(
        self,
        original_output: str,
        counterfactual_outputs: List[str],
        protected_attributes: List[str]
    ) -> Dict:
        """
        Calculate fairness metrics across original and counterfactual outputs.

        Args:
            original_output: Output from original input
            counterfactual_outputs: Outputs from counterfactual inputs
            protected_attributes: List of protected attributes

        Returns:
            Dictionary with the keys 'output_similarity' (float),
            'attribute_impact' (dict of attribute name to float) and
            'bias_score' (float). The same dictionary is stored on
            ``self.metrics``.
        """
        metrics = {
            'output_similarity': self._calculate_output_similarity(
                original_output,
                counterfactual_outputs
            ),
            'attribute_impact': self._calculate_attribute_impact(
                counterfactual_outputs,
                protected_attributes
            ),
            'bias_score': self._calculate_bias_score(
                original_output,
                counterfactual_outputs
            )
        }

        self.metrics = metrics
        return metrics

    def _calculate_output_similarity(
        self,
        original: str,
        counterfactuals: List[str]
    ) -> float:
        """
        Calculate average similarity between original and counterfactual outputs.

        Similarity is the Jaccard index of the whitespace-separated word sets.
        Two texts without any words count as identical (1.0). Returns NaN
        when ``counterfactuals`` is empty.
        """
        # This is a simplified similarity calculation
        # In practice, you might want to use more sophisticated NLP metrics
        if not counterfactuals:
            return float("nan")

        original_words = set(original.split())
        similarities = []
        for cf in counterfactuals:
            cf_words = set(cf.split())
            all_words = original_words | cf_words
            if not all_words:
                # Both texts are empty: identical by convention, and it
                # avoids dividing by zero
                similarities.append(1.0)
            else:
                similarities.append(
                    len(original_words & cf_words) / len(all_words)
                )
        return float(np.mean(similarities))

    def _calculate_attribute_impact(
        self,
        outputs: List[str],
        attributes: List[str]
    ) -> Dict[str, float]:
        """
        Calculate impact score for each protected attribute.

        The score is the fraction of outputs that contain the attribute
        *name* as a case-insensitive substring. Every attribute maps to NaN
        when ``outputs`` is empty.
        """
        # Simplified impact calculation
        # In practice, you would want to use more sophisticated methods
        if not outputs:
            return {attr: float("nan") for attr in attributes}

        # Lower-case each output once instead of once per attribute
        lowered_outputs = [output.lower() for output in outputs]
        return {
            attr: sum(
                attr.lower() in text for text in lowered_outputs
            ) / len(lowered_outputs)
            for attr in attributes
        }

    def _calculate_bias_score(
        self,
        original: str,
        counterfactuals: List[str]
    ) -> float:
        """
        Calculate overall bias score based on output variations.

        The score is the number of distinct words across all counterfactual
        outputs minus the number of distinct words in the original, divided
        by the number of counterfactuals. It can be negative. Returns NaN
        when ``counterfactuals`` is empty.
        """
        # Simplified bias score calculation
        # Higher score indicates more potential bias
        if not counterfactuals:
            return float("nan")

        counterfactual_vocabulary = {
            word for output in counterfactuals
            for word in output.split()
        }
        word_variations = (
            len(counterfactual_vocabulary) - len(set(original.split()))
        )
        return word_variations / len(counterfactuals)

In [None]:
class FairnessVisualizer:
    """Draws fairness metrics as a row of three bar charts."""

    def plot_metrics(self, metrics: Dict) -> None:
        """
        Render the similarity, per-attribute impact and bias panels.

        Args:
            metrics: Mapping with the keys 'output_similarity',
                'attribute_impact' (itself a mapping of label to value)
                and 'bias_score'
        """
        plt.figure(figsize=(15, 5))

        # Left panel: one bar for the similarity value, y-axis fixed to [0, 1]
        plt.subplot(1, 3, 1)
        similarity = metrics['output_similarity']
        plt.bar(['Output Similarity'], [similarity])
        plt.title('Output Similarity')
        plt.ylim(0, 1)

        # Middle panel: one bar per attribute, labels tilted for readability
        plt.subplot(1, 3, 2)
        per_attribute = metrics['attribute_impact']
        plt.bar(per_attribute.keys(), per_attribute.values())
        plt.title('Attribute Impact')
        plt.xticks(rotation=45)
        plt.ylim(0, 1)

        # Right panel: bias score; the axis reaches at least 2 and leaves
        # 20% headroom above larger scores
        plt.subplot(1, 3, 3)
        bias = metrics['bias_score']
        plt.bar(['Bias Score'], [bias])
        plt.title('Bias Score')
        upper_limit = max(2, bias * 1.2)
        plt.ylim(0, upper_limit)

        plt.tight_layout()
        plt.show()

In [None]:

def optimize_fairness(
    generator: "CounterfactualGenerator",
    analyzer: "FairnessAnalyzer",
    original_input: Dict,
    original_output: str,
    embeddings: np.ndarray,
    generate_output: Optional[Callable[[Dict], str]] = None
) -> Tuple[List[str], List[str]]:
    """
    Optimize output fairness using counterfactual analysis.

    Counterfactual inputs are produced by ``generator``, turned into outputs,
    scored by ``analyzer`` and the scores are translated into human-readable
    suggestions and warnings.

    Args:
        generator: Counterfactual generator instance
        analyzer: Fairness analyzer instance
        original_input: Original input features
        original_output: Original model output
        embeddings: Pre-computed embeddings
        generate_output: Optional callable mapping one counterfactual input
            to the model output for it. When omitted, the output is read
            from each counterfactual's 'output' key; counterfactuals without
            that key are skipped.

    Returns:
        Tuple ``(suggestions, warnings)``, both lists of strings. If no
        counterfactual outputs are available, no metrics are computed and a
        single warning says so.
    """
    counterfactuals = generator.generate_counterfactuals(original_input, embeddings)

    if generate_output is not None:
        counterfactual_outputs = [generate_output(cf) for cf in counterfactuals]
    else:
        # Counterfactuals are copies of the input features and normally carry
        # no model output, so tolerate a missing key instead of raising
        counterfactual_outputs = [
            cf['output'] for cf in counterfactuals if 'output' in cf
        ]

    suggestions: List[str] = []
    warning_messages: List[str] = []

    if not counterfactual_outputs:
        # Metrics over zero outputs are undefined; report that explicitly
        # rather than letting the analysis fail or look clean
        warning_messages.append(
            "No counterfactual outputs available; fairness metrics were not computed"
        )
        return suggestions, warning_messages

    metrics = analyzer.analyze_outputs(
        original_output,
        counterfactual_outputs,
        generator.config.protected_attributes
    )

    # Analyze metrics and generate suggestions
    if metrics['output_similarity'] < 0.7:
        warning_messages.append(
            "High output variation detected across protected attributes"
        )
        suggestions.append(
            "Consider adjusting model to maintain more consistent outputs"
        )

    for attr, impact in metrics['attribute_impact'].items():
        if impact > 0.3:
            warning_messages.append(
                f"High impact detected for attribute: {attr}"
            )
            suggestions.append(
                f"Review handling of {attr} in model logic"
            )

    if metrics['bias_score'] > 0.5:
        warning_messages.append("Significant potential bias detected")
        suggestions.append(
            "Consider retraining model with more balanced dataset"
        )

    return suggestions, warning_messages

# Code Implementation

In [None]:
# Configuration
# Vary gender, age and ethnicity; keep counterfactuals whose similarity to
# the original is at least 0.8 and return at most five of them.
# random_seed is left at its default.
config = CounterfactualConfig(
    protected_attributes=["gender", "age", "ethnicity"],
    similarity_threshold=0.8,
    num_counterfactuals=5
)

In [None]:
# Sample input and output
# Feature dictionary for the example. The first three keys are the protected
# attributes named in the configuration; the remaining keys are held constant.
original_input = {
    "gender": "female",
    "age": "young",
    "ethnicity": "Asian",
    "occupation": "engineer",
    "education": "graduate"
}

In [None]:
# Example text standing in for the output produced for original_input
original_output = "A talented young female engineer with graduate education"

In [None]:
# Placeholder: a 10 x 128 array of uniform random values
embeddings = np.random.rand(10, 128)  # In practice, use real embeddings

In [None]:
# Initialize components
# The generator takes the configuration above; the analyzer and visualizer
# are constructed without arguments.
generator = CounterfactualGenerator(config)
analyzer = FairnessAnalyzer()
visualizer = FairnessVisualizer()

In [None]:
# Generate and analyze counterfactuals
counterfactuals = generator.generate_counterfactuals(original_input, embeddings)
# NOTE: the outputs analyzed below are hard-coded sample strings; the
# `counterfactuals` list built above is not passed to this call.
metrics = analyzer.analyze_outputs(
    original_output,
    [
        "A talented engineer with graduate education",
        "A skilled professional in engineering",
        "An experienced engineer with advanced degree",
        "A qualified technical professional",
        "A dedicated engineering specialist"
    ],
    config.protected_attributes
)

In [None]:
# Visualize results
# Draws the three bar charts for the metrics computed in the previous cell
visualizer.plot_metrics(metrics)

In [None]:
# Get optimization suggestions
# NOTE(review): original_input has no 'output' key, so the generated
# counterfactuals carry no model outputs of their own - confirm that
# optimize_fairness has outputs to analyze for this call.
suggestions, warnings = optimize_fairness(
    generator,
    analyzer,
    original_input,
    original_output,
    embeddings
)
    
# Print each list as a bulleted section
print("\nOptimization Suggestions:")
for suggestion in suggestions:
    print(f"- {suggestion}")
    
print("\nWarnings:")
for warning in warnings:
    print(f"- {warning}")