In [117]:
import pandas as pd
import numpy as np

from helper.helper_functions import load_dataset, save_dataset, load_model, encode_nominal_features

from scipy.stats import entropy

### Loading the datasets

In [118]:
data_original = load_dataset("../data/assignment2_income_cleaned.xlsx")
data_test = load_dataset("../data/assignment2_test.xlsx")

### Test Data Inspection

In [119]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       2000 non-null   int64  
 1   workclass                 2000 non-null   object 
 2   education                 2000 non-null   int64  
 3   marital status            2000 non-null   object 
 4   occupation                2000 non-null   object 
 5   workinghours              2000 non-null   int64  
 6   sex                       2000 non-null   object 
 7   ability to speak english  105 non-null    float64
 8   gave birth this year      562 non-null    object 
dtypes: float64(1), int64(3), object(5)
memory usage: 140.8+ KB


In [120]:
# check the amount of missing values in the 'gave birth this year' column for the age bins (17-28, 28-38, 38-49, 49-65, 65-93) for females
female_data = data_test[data_test['sex'] == 'Female']
female_data['gave birth this year'][female_data['gave birth this year'].isnull()].groupby(
    pd.cut(female_data['age'], bins=[0, 28, 38, 49, 65, 93]), observed=False).size()

age
(0, 28]       0
(28, 38]      0
(38, 49]      0
(49, 65]    230
(65, 93]     55
Name: gave birth this year, dtype: int64

In [121]:
data_test['ability to speak english'] = data_test['ability to speak english'].fillna(0)
data_test['gave birth this year'] = data_test['gave birth this year'].fillna('No')

### Test Data Distribution Discrepancy

Using the KL divergence to measure the distribution discrepancy between the training and test sets, we want to identify the features that have the highest discrepancy and calculate the mean discrepancy between the two datasets.

In [122]:
def compute_kl_divergence(train_set: pd.DataFrame, test_set: pd.DataFrame, numerical_features: list[str]):
    """
    Compute KL divergence between the distributions of features in the training and test sets.
    :param train_set: DataFrame containing features of the training set.
    :param test_set: DataFrame containing features of the test set.
    :param numerical_features: List of numerical features.
    :return: Array of KL divergence values for each feature in Series format.
    """

    kl_divergences = []
    for feature in test_set.columns:  # Use test set columns since it has no target label
        if feature not in numerical_features:  # Handle categorical features
            train_dist = train_set[feature].value_counts(normalize=True)
            test_dist = test_set[feature].value_counts(normalize=True)

            # Ensure all possible values are represented in both distributions
            all_values = set(train_dist.index) | set(test_dist.index)
            for value in all_values:
                if value not in train_dist.index:
                    train_dist[value] = 0.000001  # Add a small non-zero count for missing value
                if value not in test_dist.index:
                    test_dist[value] = 0.000001  # Add a small non-zero count for missing value
        else:  # Handle numerical features
            train_dist, _ = np.histogram(train_set[feature], bins=10, density=True)
            test_dist, _ = np.histogram(test_set[feature], bins=10, density=True)

        assert train_dist.shape == test_dist.shape

        # Normalizing
        train_dist /= np.sum(train_dist)
        test_dist /= np.sum(test_dist)

        # KL divergence
        kl_divergence = entropy(train_dist, test_dist)
        kl_divergences.append(kl_divergence)

    return pd.Series(kl_divergences, index=test_set.columns)


def aggregate_kl_divergence(kl_divergences: np.ndarray | pd.Series):
    """
    Aggregate KL divergence values across all features.
    :param kl_divergences: Array of KL divergence values for each feature.
    :return: Overall measure of distribution discrepancy (in this case the mean value).
    """
    if isinstance(kl_divergences, np.ndarray):
        return np.mean(kl_divergences)
    elif isinstance(kl_divergences, pd.Series):
        return kl_divergences.mean()

In [123]:
kl_divergences = compute_kl_divergence(data_original, data_test, ['age', 'workinghours'])
kl_divergences_df = pd.DataFrame({'KL Divergence': kl_divergences})
kl_divergences_df = kl_divergences_df.sort_values(by='KL Divergence', ascending=False)
kl_divergences_df

Unnamed: 0,KL Divergence
workinghours,0.29094
education,0.06856
age,0.047542
occupation,0.028383
sex,0.017073
workclass,0.008632
ability to speak english,0.002802
marital status,0.001831
gave birth this year,1.9e-05


In [124]:
aggregate_kl_divergence(kl_divergences)

0.051753484383575445

### Test Data Feature Encoding

In [125]:
# drop columns that our model does not use
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
data_test_sexexcl = data_test.drop(columns=columns_to_exclude)

In [126]:
nominal_features_lc = list(
    {'workclass', 'marital status', 'gave birth this year', 'sex', 'occupation'} - set(
        columns_to_exclude))  # low cardinality features
nominal_features_hc = []

# Encoded test set
data_test_encoded = encode_nominal_features(data_test_sexexcl, nominal_features_lc, nominal_features_hc)

### Generate predictions for test data using the best model

In [127]:
# load the best model
model = load_model('../output/saved_models/dt_model_sexexcl_ffs_tuned_fair.joblib')

In [128]:
# Get the feature names used during training
train_features = model.feature_names_in_
data_test_reordered = data_test_encoded[train_features]
data_test_encoded = data_test_reordered
# predict the test data
y_pred = model.predict(data_test_encoded)
# save the predictions
y_pred = pd.DataFrame(y_pred, columns=['income'])
save_dataset(y_pred, '../output/test_predictions/best_model_predictions.xlsx', index=True)

In [129]:
p = load_dataset('../output/test_predictions/best_model_predictions.xlsx')
print(len(p), len(y_pred)) # the lengths should be the same
# see if the predictions were saved correctly
differing_values = (y_pred['income'] != p['income']).sum()

2000 2000


In [130]:
differing_values

0

### Inspection of the predictions

In [131]:
# Count amount of low (0) and high (1) income predictions
predicted_1 = (y_pred == 1).sum()
predicted_0 = (y_pred == 0).sum()

# Create a DataFrame
predictions_df = pd.DataFrame({
    'Prediction': [1, 0],
    'Count': [predicted_1, predicted_0]
})

In [132]:
y_pred.value_counts()

income
1         1051
0          949
Name: count, dtype: int64

In [133]:
# Filter predictions for males
male_predictions = y_pred[data_test['sex'] == 'Male']

male_predicted_1 = (male_predictions == 1).sum()
male_predicted_0 = (male_predictions == 0).sum()

# Filter predictions for females
female_predictions = y_pred[data_test['sex'] == 'Female']

female_predicted_1 = (female_predictions == 1).sum()
female_predicted_0 = (female_predictions == 0).sum()

predictions_df_mf = pd.DataFrame({
    'Sex': ['Male', 'Female'],
    'Predicted_1': [male_predicted_1, female_predicted_1],
    'Predicted_0': [male_predicted_0, female_predicted_0]
})

In [134]:
predictions_df_mf

Unnamed: 0,Sex,Predicted_1,Predicted_0
0,Male,income 666 dtype: int64,income 487 dtype: int64
1,Female,income 385 dtype: int64,income 462 dtype: int64
