In [5]:
from aif360.datasets import AdultDataset as Aif360AdultDataset

import torch.nn as nn

class AdultDataset:
    """
    Small loader that builds the AIF360 Adult dataset with the fixed
    configuration used throughout this notebook.
    """

    def __init__(self):
        super().__init__()

    def load_data(self):
        """
        Construct the AIF360 Adult dataset with 'sex' as the protected
        attribute and 'Male' as the privileged class.

        '?' is passed as the missing-value marker, and a preprocessing hook
        that drops every row containing a missing value is handed to AIF360.

        Returns:
            The object returned by the AIF360 ``AdultDataset`` constructor.
        """
        # Columns AIF360 is told to treat as categorical.
        categorical_columns = [
            'workclass', 'education', 'marital-status',
            'occupation', 'relationship', 'race', 'native-country',
        ]

        # Input columns that are retained as features.
        retained_columns = [
            'age', 'workclass', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race',
            'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
            'native-country',
        ]

        def drop_incomplete_rows(frame):
            """Return the frame without any row that has a missing value."""
            return frame.dropna()

        return Aif360AdultDataset(
            protected_attribute_names=['sex'],   # primary protected attribute
            privileged_classes=[['Male']],       # privileged group definition
            categorical_features=categorical_columns,
            features_to_keep=retained_columns,
            na_values=['?'],                     # '?' marks a missing value
            custom_preprocessing=drop_incomplete_rows,
        )

class AdultMLP(nn.Module):
    """
    Feed-forward binary classifier: a stack of Linear -> ReLU -> Dropout
    hidden blocks followed by a single-unit Linear layer and a Sigmoid.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(64, 32)`` gives the two hidden layers of 64 and 32 units.
            An empty sequence yields a single Linear + Sigmoid layer.
        dropout: Dropout probability applied after each hidden activation.
            Defaults to 0.5.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), dropout=0.5):
        super().__init__()

        layers = []
        in_features = input_dim
        # Layers are appended in a fixed order so that, with the defaults,
        # the Sequential indices (and therefore the state_dict keys
        # model.0, model.3, model.6) match a hand-written 64/32 stack.
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(p=dropout))
            in_features = width

        # Output head: one unit squashed into (0, 1) by the sigmoid.
        layers.append(nn.Linear(in_features, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """
        Run the network on ``x``.

        Args:
            x: Input tensor whose last dimension has ``input_dim`` features.

        Returns:
            Tensor of sigmoid outputs with a last dimension of size 1.
        """
        return self.model(x)

In [6]:
# Build the loader and materialise the AIF360 Adult dataset.
dataset = AdultDataset()
adult_data = dataset.load_data()

# List the dtype of every column in the dataframe view, one per line.
print("Column data types in the Adult AIF360 dataset:")
df, attributes = adult_data.convert_to_dataframe()
for column, dtype in zip(df.columns, df.dtypes):
    print(f"{column}: {dtype}")

# Row count of the dataframe, then the width of the feature matrix.
print(df.shape[0])
print("Num features:", adult_data.features.shape[1])


Column data types in the Adult AIF360 dataset:
age: float64
education-num: float64
sex: float64
capital-gain: float64
capital-loss: float64
hours-per-week: float64
workclass=Federal-gov: float64
workclass=Local-gov: float64
workclass=Private: float64
workclass=Self-emp-inc: float64
workclass=Self-emp-not-inc: float64
workclass=State-gov: float64
workclass=Without-pay: float64
education=10th: float64
education=11th: float64
education=12th: float64
education=1st-4th: float64
education=5th-6th: float64
education=7th-8th: float64
education=9th: float64
education=Assoc-acdm: float64
education=Assoc-voc: float64
education=Bachelors: float64
education=Doctorate: float64
education=HS-grad: float64
education=Masters: float64
education=Preschool: float64
education=Prof-school: float64
education=Some-college: float64
marital-status=Divorced: float64
marital-status=Married-AF-spouse: float64
marital-status=Married-civ-spouse: float64
marital-status=Married-spouse-absent: float64
marital-status=Nev

In [3]:
from aif360.metrics import BinaryLabelDatasetMetric

# Shuffled, seeded split of the Adult data: first 80% for training, rest for test.
adult_train, adult_test = adult_data.split([0.8], shuffle=True, seed=42)

# Group-fairness metrics on the untouched training split.
print("Original dataset bias analysis:")
metric_orig_train = BinaryLabelDatasetMetric(
    adult_train,
    privileged_groups=[{'sex': 1}],    # Male
    unprivileged_groups=[{'sex': 0}],  # Female
)

print(f"Training set size: {len(adult_train.features)}")
print(f"Test set size: {len(adult_test.features)}")
print(f"Difference in mean outcomes between groups: {metric_orig_train.mean_difference():.4f}")
print(f"Disparate impact: {metric_orig_train.disparate_impact():.4f}")
# Rule of thumb: a disparate impact below 0.8 or above 1.25 indicates potential bias.


pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[AdversarialDebiasing]'
pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


Original dataset bias analysis:
Training set size: 39073
Test set size: 9769
Difference in mean outcomes between groups: -0.1933
Disparate impact: 0.3584
