# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from lightgbm import LGBMClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

from fairlearn.metrics import MetricFrame
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio
from fairlearn.metrics import equalized_odds_difference, equalized_odds_ratio
from fairlearn.metrics import selection_rate, false_positive_rate, true_positive_rate, count
from fairlearn.postprocessing import ThresholdOptimizer

from imblearn.over_sampling import SMOTE
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Notebook-wide settings: seaborn "whitegrid" theme, no cap on the number of
# DataFrame columns displayed, and FutureWarning/UserWarning messages silenced.
sns.set_style('whitegrid')
pd.set_option('display.max_columns', None)
for _silenced_category in (FutureWarning, UserWarning):
    warnings.filterwarnings('ignore', category=_silenced_category)

# Data loading and preprocessing
file_path = "adult.csv"
try:
    df_original = pd.read_csv(file_path, na_values='?')
except FileNotFoundError:
    # An empty frame makes every `if not df.empty` section below a no-op.
    print(f"Errore: File '{file_path}' non trovato.")
    df_original = pd.DataFrame()

df = df_original.copy()
if df.empty:
    print("Dataset vuoto, saltare le celle successive.")
else:
    # Rows with any missing value ('?' in the CSV) are discarded.
    df = df.dropna()

    # Binary target: '>50K' -> 1, '<=50K' -> 0; the raw label column is removed.
    df['high_income'] = df['income'].map({'<=50K': 0, '>50K': 1})
    df = df.drop(columns='income')

    # Keep the original gender labels next to the 0/1 encoding.
    df['gender_original'] = df['gender']
    df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})

    # Race is collapsed to White (1) versus everything else (0).
    df['race_original'] = df['race']
    df['race'] = (df['race'].str.strip() == 'White').astype('int64')

    # Every country other than the United States is bucketed as 'Other'.
    df['native-country'] = df['native-country'].str.strip().where(
        lambda stripped: stripped == 'United-States', 'Other'
    )
    df = df.drop(columns=['education', 'fnlwgt'])

    y = df['high_income']
    X_raw = df.drop(columns='high_income')

    sensitive_features_gender_series = X_raw['gender']
    sensitive_features_race_series = X_raw['race']

    # The human-readable copies of the sensitive columns are not model inputs.
    X_model_input = X_raw.drop(columns=['gender_original', 'race_original'])

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X_model_input,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # Sensitive attributes of each split, taken before any scaling is applied.
    X_train_sensitive_gender_raw = X_train_raw['gender']
    X_train_sensitive_race_raw = X_train_raw['race']
    X_test_sensitive_gender = X_test_raw['gender']
    X_test_sensitive_race = X_test_raw['race']

    categorical_features_to_encode = ['workclass', 'marital-status', 'occupation', 'relationship', 'native-country']
    numerical_features = ['age', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'gender', 'race']

    def preprocess_data(X_train_df, X_test_df, cat_feats, num_feats):
        """Ordinal-encode ``cat_feats`` and standardize ``num_feats``.

        Both transformers are fitted on the training frame only and then
        applied to the test frame. Categories unseen during fitting are
        encoded as -1. The input frames are not modified.

        Returns:
            (X_train_processed, X_test_processed, fitted_encoder, fitted_scaler)
        """
        ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        scaler_func = StandardScaler()

        X_train_p, X_test_p = X_train_df.copy(), X_test_df.copy()

        ordinal_enc.fit(X_train_df[cat_feats])
        X_train_p[cat_feats] = ordinal_enc.transform(X_train_df[cat_feats])
        X_test_p[cat_feats] = ordinal_enc.transform(X_test_df[cat_feats])

        scaler_func.fit(X_train_p[num_feats])
        X_train_p[num_feats] = scaler_func.transform(X_train_p[num_feats])
        X_test_p[num_feats] = scaler_func.transform(X_test_p[num_feats])

        return X_train_p, X_test_p, ordinal_enc, scaler_func

    X_train_processed, X_test_processed, _, _ = preprocess_data(
        X_train_raw,
        X_test_raw,
        categorical_features_to_encode,
        numerical_features,
    )

    print("Dimensioni X_train_processed:", X_train_processed.shape)
    print("Dimensioni X_test_processed:", X_test_processed.shape)


# Baseline LightGBM model training
if not df.empty:
    lgbm_model_baseline = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        verbose=-1,
    )
    lgbm_model_baseline.fit(X_train_processed, y_train)

    # Hard labels feed accuracy; positive-class probabilities feed ROC AUC.
    y_pred_baseline = lgbm_model_baseline.predict(X_test_processed)
    y_pred_proba_baseline = lgbm_model_baseline.predict_proba(X_test_processed)[:, 1]

    baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
    baseline_roc_auc = roc_auc_score(y_test, y_pred_proba_baseline)

    print("Valutazione Modello Baseline:")
    print(f"Accuracy: {baseline_accuracy:.4f}")
    print(f"ROC AUC: {baseline_roc_auc:.4f}")

# Fairness evaluation of the baseline model
if not df.empty:
    # Metrics that MetricFrame computes separately for each sensitive group.
    fairness_metrics_dict = dict(
        accuracy=accuracy_score,
        selection_rate=selection_rate,
        true_positive_rate=true_positive_rate,
        false_positive_rate=false_positive_rate,
        count=count,
    )

    def print_fairness_metrics(y_true, y_pred, sensitive_features, attribute_name):
        """Print per-group metrics and two aggregate fairness gaps.

        Prints the MetricFrame ``by_group`` table for ``fairness_metrics_dict``,
        then the demographic parity difference and the equalized odds
        difference, all computed with respect to ``sensitive_features``.

        Returns:
            (MetricFrame, demographic_parity_difference, equalized_odds_difference)
        """
        print(f"\n--- Equità rispetto a {attribute_name} ---")
        per_group = MetricFrame(
            metrics=fairness_metrics_dict,
            y_true=y_true,
            y_pred=y_pred,
            sensitive_features=sensitive_features,
        )
        print(per_group.by_group)

        parity_gap = demographic_parity_difference(
            y_true, y_pred, sensitive_features=sensitive_features
        )
        print(f"Demographic Parity Difference ({attribute_name}): {parity_gap:.4f}")

        odds_gap = equalized_odds_difference(
            y_true, y_pred, sensitive_features=sensitive_features
        )
        print(f"Equalized Odds Difference ({attribute_name}): {odds_gap:.4f}")

        return per_group, parity_gap, odds_gap

    print("\n--- Valutazione Equità Modello Baseline ---")
    for _sensitive_column, _attribute_label in (
        (X_test_sensitive_gender, "Gender (0=Male, 1=Female)"),
        (X_test_sensitive_race, "Race (0=Non-White, 1=White)"),
    ):
        print_fairness_metrics(y_test, y_pred_baseline, _sensitive_column, _attribute_label)

# Pre-processing bias mitigation: FairSMOTE
def _interpolate_minority_samples(group_df, numerical_features, n_new, k_neighbors, rng):
    """Create ``n_new`` SMOTE-style synthetic rows for a single subgroup.

    Each synthetic row starts as a copy of a randomly chosen base row of
    ``group_df``. Its numerical features are then moved a random fraction of
    the way towards one of the base row's ``k_neighbors`` nearest neighbours
    inside the same subgroup. All other columns keep the base row's values.
    """
    numeric_values = group_df[numerical_features].to_numpy(dtype=float)

    nn = NearestNeighbors(n_neighbors=k_neighbors, algorithm='auto')
    nn.fit(numeric_values)
    # Querying without X returns the neighbours of every fitted point and does
    # not count a point as its own neighbour, so all k columns are usable.
    neighbor_positions = nn.kneighbors(return_distance=False)

    base_positions = rng.integers(0, len(group_df), size=n_new)
    neighbor_slots = rng.integers(0, k_neighbors, size=n_new)
    chosen_neighbors = neighbor_positions[base_positions, neighbor_slots]

    # One interpolation factor per synthetic row, shared by all its features.
    gaps = rng.random(size=(n_new, 1))
    base_values = numeric_values[base_positions]
    synthetic_values = base_values + gaps * (numeric_values[chosen_neighbors] - base_values)

    synthetic = group_df.iloc[base_positions].copy()
    for column_pos, column in enumerate(numerical_features):
        synthetic[column] = synthetic_values[:, column_pos]
    return synthetic


def apply_fairsmote(X_df, y_series, sensitive_feature_series, numerical_features,
                    categorical_features, sensitive_attribute_name, random_state=42):
    """Oversample every (target, sensitive) subgroup up to the largest one.

    Subgroups smaller than the largest are padded with synthetic rows.
    Numerical features are interpolated between a base row and one of its
    nearest neighbours in the same subgroup; every other column (the
    categorical features included) is copied from the base row. A subgroup
    with a single row cannot be interpolated and is padded by random
    oversampling with replacement instead.

    Args:
        X_df: Feature frame.
        y_series: Target labels, aligned with ``X_df`` by index.
        sensitive_feature_series: Sensitive attribute values, aligned with
            ``X_df`` by index.
        numerical_features: Columns of ``X_df`` to interpolate; they must be
            convertible to float.
        categorical_features: Columns copied unchanged from the base row. Kept
            for interface compatibility; all non-numerical columns are copied.
        sensitive_attribute_name: Label used only in the printed messages.
        random_state: Seed for the synthetic sampling and for the oversampling
            fallback. ``None`` gives non-reproducible results.

    Returns:
        ``(X_resampled, y_resampled)`` with a fresh 0..n-1 index. If fewer
        than two subgroups exist, copies of the inputs are returned unchanged.
    """
    print(f"\nApplicazione di FairSMOTE per l'attributo: {sensitive_attribute_name}")

    df_train_temp = X_df.copy()
    df_train_temp['target'] = y_series
    df_train_temp['sensitive'] = sensitive_feature_series

    subgroup_counts = df_train_temp.groupby(['target', 'sensitive']).size()
    print("Dimensioni sottogruppi prima di FairSMOTE:")
    print(subgroup_counts)

    if len(subgroup_counts) < 2:
        print("Numero insufficiente di sottogruppi, FairSMOTE non applicato.")
        return X_df.copy(), y_series.copy()

    target_count = subgroup_counts.max()
    print(f"Dimensione target per sottogruppo: {target_count}")

    rng = np.random.default_rng(random_state)
    final_dfs = []

    for name, group_df in df_train_temp.groupby(['target', 'sensitive']):
        current_size = len(group_df)

        if current_size >= target_count:
            print(f"Sottogruppo {name} (dim={current_size}) è un gruppo di maggioranza, non viene modificato.")
            final_dfs.append(group_df)
            continue

        samples_to_generate = target_count - current_size
        print(f"Sottogruppo {name} (dim={current_size}) è un gruppo di minoranza. Genero {samples_to_generate} campioni sintetici...")

        # A point is never its own neighbour, so at most size-1 are available.
        k_neighbors = min(current_size - 1, 5)

        if k_neighbors < 1:
            print(f"   -> ATTENZIONE: Il sottogruppo {name} è troppo piccolo per SMOTE (n_samples <= 1). Eseguo Random Oversampling come fallback.")
            resampled_group = group_df.sample(n=samples_to_generate, replace=True, random_state=random_state)
            final_dfs.append(pd.concat([group_df, resampled_group]))
            continue

        synthetic_df = _interpolate_minority_samples(
            group_df, numerical_features, samples_to_generate, k_neighbors, rng
        )
        final_dfs.append(pd.concat([group_df, synthetic_df]))

    df_fairsmote = pd.concat(final_dfs, ignore_index=True)

    print("\nDimensioni sottogruppi dopo FairSMOTE:")
    print(df_fairsmote.groupby(['target', 'sensitive']).size())

    X_fairsmote = df_fairsmote.drop(columns=['target', 'sensitive'])
    y_fairsmote = df_fairsmote['target']

    return X_fairsmote, y_fairsmote


# Fallback used when the Adult dataset could not be loaded: build a small
# synthetic dataset and lightweight stand-ins for the helpers, so that the
# FairSMOTE demonstration below can still run.
if 'df' not in globals() or df.empty:
    print("DataFrame 'df' non trovato o vuoto. Esecuzione saltata.")
    from sklearn.datasets import make_classification

    # Seeded so the synthetic sensitive attributes are reproducible, like the
    # other random steps in this script (all of which use seed 42).
    _fallback_rng = np.random.RandomState(42)

    X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=0, n_classes=2, random_state=42)
    df = pd.DataFrame(X, columns=[f'num_{i}' for i in range(8)] + [f'cat_{i}' for i in range(2)])
    df['target'] = y
    df['gender'] = _fallback_rng.choice(['Male', 'Female'], size=1000, p=[0.7, 0.3])
    df['race'] = _fallback_rng.choice(['A', 'B'], size=1000, p=[0.8, 0.2])
    # Remove 80% of the positive-class 'Female' rows to create an
    # under-represented (target, gender) subgroup.
    mask = (df['target'] == 1) & (df['gender'] == 'Female')
    df = df.drop(df[mask].sample(frac=0.8, random_state=42).index)

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        df.drop('target', axis=1), df['target'], test_size=0.3, random_state=42
    )
    X_train_sensitive_gender_raw = X_train_raw['gender']
    X_train_sensitive_race_raw = X_train_raw['race']
    X_test_sensitive_gender = X_test_raw['gender']
    X_test_sensitive_race = X_test_raw['race']

    numerical_features = [col for col in X_train_raw.columns if 'num' in col]
    categorical_features_to_encode = [col for col in X_train_raw.columns if 'cat' in col or col in ['gender', 'race']]

    def preprocess_data(X_train, X_test, cat_feats, num_feats):
        """One-hot encode ``cat_feats`` and keep only columns present in both splits.

        ``num_feats`` is accepted for signature compatibility and is not used.
        The last two return values are always ``None``: this variant has no
        fitted encoder or scaler to hand back.
        """
        X_train_proc = pd.get_dummies(X_train, columns=[c for c in cat_feats if c in X_train.columns], drop_first=True)
        X_test_proc = pd.get_dummies(X_test, columns=[c for c in cat_feats if c in X_test.columns], drop_first=True)
        # Follow the training column order; a bare set intersection would make
        # the feature order depend on string hashing and vary between runs.
        test_cols = set(X_test_proc.columns)
        shared_cols = [col for col in X_train_proc.columns if col in test_cols]
        return X_train_proc[shared_cols], X_test_proc[shared_cols], None, None

    def print_fairness_metrics(y_true, y_pred, sensitive_features, label):
        """Print the mean prediction for each sensitive group.

        Returns three ``None`` values so callers can unpack the result as a
        3-tuple.
        """
        print(f"\nMetriche di Fairness per {label}:")
        df_fairness = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred, 'sensitive': sensitive_features})
        print(df_fairness.groupby('sensitive')['y_pred'].mean())
        return None, None, None

# Rebalance the training split with FairSMOTE with respect to gender, retrain
# the same LightGBM configuration on it, and evaluate on the untouched test split.
X_train_fairsmote_gender, y_train_fairsmote_gender = apply_fairsmote(
    X_train_raw,
    y_train,
    X_train_sensitive_gender_raw,
    numerical_features,
    categorical_features_to_encode,
    "Gender",
)

X_test_raw_copy_for_fairsmote_model = X_test_raw.copy()
(
    X_train_fairsmote_gender_processed_final,
    X_test_fairsmote_gender_processed_final,
    _,
    _,
) = preprocess_data(
    X_train_fairsmote_gender,
    X_test_raw_copy_for_fairsmote_model,
    categorical_features_to_encode,
    numerical_features,
)

lgbm_fairsmote_gender = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    verbose=-1,
)
lgbm_fairsmote_gender.fit(X_train_fairsmote_gender_processed_final, y_train_fairsmote_gender)

y_pred_fairsmote_gender = lgbm_fairsmote_gender.predict(X_test_fairsmote_gender_processed_final)
y_pred_proba_fairsmote_gender = lgbm_fairsmote_gender.predict_proba(
    X_test_fairsmote_gender_processed_final
)[:, 1]

fairsmote_accuracy = accuracy_score(y_test, y_pred_fairsmote_gender)
fairsmote_roc_auc = roc_auc_score(y_test, y_pred_proba_fairsmote_gender)

print("\n--- Valutazione Modello DOPO FairSMOTE (Gender) ---")
print(f"Accuracy: {fairsmote_accuracy:.4f}")
print(f"ROC AUC: {fairsmote_roc_auc:.4f}")
for _sensitive_column, _attribute_label in (
    (X_test_sensitive_gender, "Gender (Post-FairSMOTE)"),
    (X_test_sensitive_race, "Race (Post-FairSMOTE Gender)"),
):
    print_fairness_metrics(y_test, y_pred_fairsmote_gender, _sensitive_column, _attribute_label)

# Post-processing bias mitigation: ThresholdOptimizer
def _fit_predict_threshold_optimizer(estimator, X_fit, y_fit, sensitive_fit,
                                     X_eval, sensitive_eval, random_state=42):
    """Fit a demographic-parity ThresholdOptimizer and predict with it.

    ``estimator`` must already be fitted (``prefit=True``). The optimizer
    picks group-specific thresholds on the estimator's ``predict_proba``
    output, using ``sensitive_fit`` as the grouping. ``random_state`` is
    passed to ``predict`` because ThresholdOptimizer predictions are
    randomized.

    Returns:
        (fitted ThresholdOptimizer, predictions for ``X_eval``)
    """
    optimizer = ThresholdOptimizer(
        estimator=estimator,
        constraints="demographic_parity",
        objective="accuracy_score",
        prefit=True,
        predict_method='predict_proba',
    )
    optimizer.fit(X_fit, y_fit, sensitive_features=sensitive_fit)
    predictions = optimizer.predict(
        X_eval, sensitive_features=sensitive_eval, random_state=random_state
    )
    return optimizer, predictions


# When the Adult dataset could not be loaded, `df` may have been replaced by
# synthetic data further up without the baseline model ever being trained.
# Check for what this section needs instead of failing with a NameError.
_threshold_prerequisites = (
    'lgbm_model_baseline',
    'X_train_processed',
    'X_test_processed',
    'X_train_sensitive_race_raw',
)
_missing_prerequisites = [name for name in _threshold_prerequisites if name not in globals()]

if not df.empty and _missing_prerequisites:
    print(f"\nThresholdOptimizer saltato: variabili mancanti {_missing_prerequisites}.")
elif not df.empty:
    print("\n--- Mitigazione Post-processing: ThresholdOptimizer ---")
    postprocess_gender, y_pred_postprocess_gender = _fit_predict_threshold_optimizer(
        lgbm_model_baseline,
        X_train_processed,
        y_train,
        X_train_sensitive_gender_raw,
        X_test_processed,
        X_test_sensitive_gender,
    )

    print("\n--- Valutazione DOPO Mitigazione Post-processing per Gender (ThresholdOptimizer) ---")
    print(f"Accuracy (PostProc-Gender): {accuracy_score(y_test, y_pred_postprocess_gender):.4f}")
    _, _, _ = print_fairness_metrics(y_test, y_pred_postprocess_gender, X_test_sensitive_gender, "Gender (Post-ThresholdOptimizer, target: DemographicParity)")

    postprocess_race, y_pred_postprocess_race = _fit_predict_threshold_optimizer(
        lgbm_model_baseline,
        X_train_processed,
        y_train,
        X_train_sensitive_race_raw,
        X_test_processed,
        X_test_sensitive_race,
    )

    print("\n--- Valutazione DOPO Mitigazione Post-processing per Race (ThresholdOptimizer) ---")
    print(f"Accuracy (PostProc-Race): {accuracy_score(y_test, y_pred_postprocess_race):.4f}")
    _, _, _ = print_fairness_metrics(y_test, y_pred_postprocess_race, X_test_sensitive_race, "Race (Post-ThresholdOptimizer, target: DemographicParity)")