## 3. AIF360 적용

- 특정 feature에 대하여 라벨의 분포가 편향되어 있는지를 판단합니다.

- AIF360은 AI Fairness 360의 약자로, 성별처럼 라벨 분포가 편향되어 있으면 안 되는 feature(보호 속성)에 대하여 편향을 측정하고 교정할 수 있도록 도와주는 라이브러리입니다.

In [None]:
from sklearn.datasets import fetch_openml

import copy
from pathlib import Path

# Directory where the downloaded dataset is cached as a CSV file.
path_lecture = Path("data")
# Create the directory up front: DataFrame.to_csv does not create missing
# parent directories, so the export below would fail on a fresh environment.
path_lecture.mkdir(parents=True, exist_ok=True)

# Fetch the "adult" dataset (version 2) from OpenML as a pandas DataFrame.
adult_data = fetch_openml(name="adult", version=2, as_frame=True)
raw_df = adult_data.frame
raw_df.to_csv(path_lecture / "adult_income.csv", index=False)

# `display` is supplied by the IPython/Jupyter runtime, not imported here.
display(raw_df.head())
print(f"Data Shape: {raw_df.shape}", end="\n\n")

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


Data Shape: (48842, 15)



In [3]:
from aif360.datasets import BinaryLabelDataset
from sklearn.preprocessing import LabelEncoder

# Drop every row that has a missing value. `dropna` returns a new DataFrame,
# so `raw_df` stays untouched without an explicit deep copy beforehand.
df = raw_df.dropna(how="any")

# "education" is removed before encoding; the frame also has a numeric
# "education-num" column (presumably the same information -- TODO confirm).
drop_columns = ["education"]
# Categorical columns that are converted to integer codes.
encoding_columns = [
    "workclass",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
    "class",
]
df_filtered = df.drop(columns=drop_columns)

# Per-column {original category -> integer code} table, kept so the encoded
# values can be interpreted (and looked up by name) afterwards.
encoding_mappings = {}
for col in encoding_columns:
    le = LabelEncoder()
    df_filtered[col] = le.fit_transform(df_filtered[col].astype(str))
    # LabelEncoder gives code i to le.classes_[i], so enumerating the classes
    # yields the same mapping as transforming them again, as plain ints.
    encoding_mappings[col] = {
        category: code for code, category in enumerate(le.classes_)
    }

protected_attrs = ["sex"]

# State the favorable outcome explicitly instead of relying on the encoder's
# sort order happening to line up with the library's default label values.
dataset = BinaryLabelDataset(
    favorable_label=encoding_mappings["class"][">50K"],
    unfavorable_label=encoding_mappings["class"]["<=50K"],
    df=df_filtered,
    label_names=["class"],
    protected_attribute_names=protected_attrs,
)
print(dataset.instance_weights)


[1. 1. 1. ... 1. 1. 1.]


In [None]:
from aif360.metrics import BinaryLabelDatasetMetric

# Bias analysis for the protected attribute "sex".
# The group codes are looked up from the encoding table rather than written
# as bare 1 / 0, so they stay correct if the label encoding ever changes.
# LabelEncoder sorts categories, which is why "Male" is 1 and "Female" is 0.
privileged_groups = [{"sex": int(encoding_mappings["sex"]["Male"])}]
unprivileged_groups = [{"sex": int(encoding_mappings["sex"]["Female"])}]

metric = BinaryLabelDatasetMetric(
    dataset,
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)

print("📌 원본 데이터 편향성 점검")
# Disparate impact: the closer to 1, the fairer.
print("Disparate Impact (DI):", metric.disparate_impact())
# Statistical parity difference: the closer to 0, the fairer.
print("Statistical Parity Difference (SPD):", metric.statistical_parity_difference())

2025-03-10 08:33:50.432376: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 08:33:50.483225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741563230.520901    5300 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741563230.529935    5300 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 08:33:50.571763: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

📌 원본 데이터 편향성 점검
Disparate Impact (DI): 0.3634695423643793
Statistical Parity Difference (SPD): -0.198901432678815


In [5]:
from aif360.algorithms.preprocessing import Reweighing

# Apply Reweighing with "Male" as the privileged and "Female" as the
# unprivileged group. The codes come from the encoding table instead of bare
# 1 / 0 so they cannot drift away from the label encoding.
privileged_groups = [{"sex": int(encoding_mappings["sex"]["Male"])}]
unprivileged_groups = [{"sex": int(encoding_mappings["sex"]["Female"])}]

reweigher = Reweighing(
    privileged_groups=privileged_groups,
    unprivileged_groups=unprivileged_groups,
)

balanced_dataset = reweigher.fit_transform(dataset)

# Convert the reweighed dataset back to a DataFrame; only the first element
# of the returned pair (the frame itself) is kept.
# NOTE(review): the reweighing result is carried in `instance_weights`
# (printed below); it does not appear to be a column of `df_balanced` --
# confirm before training on `df_balanced` alone.
df_balanced = balanced_dataset.convert_to_dataframe()[0]
print(balanced_dataset.instance_weights)

[1.0940093  1.0940093  0.79315786 ... 0.84852855 1.0940093  2.18218522]
