In [354]:
import pandas as pd
import numpy as np

from helper.helper_functions import load_dataset, save_dataset, load_model, encode_nominal_features

from scipy.stats import entropy

### Loading the datasets

In [355]:
data_original = load_dataset("../data/assignment2_income_cleaned.xlsx")
data_test = load_dataset("../data/assignment2_test.xlsx")

### Test Data Inspection

In [356]:
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       2000 non-null   int64  
 1   workclass                 2000 non-null   object 
 2   education                 2000 non-null   int64  
 3   marital status            2000 non-null   object 
 4   occupation                2000 non-null   object 
 5   workinghours              2000 non-null   int64  
 6   sex                       2000 non-null   object 
 7   ability to speak english  105 non-null    float64
 8   gave birth this year      562 non-null    object 
dtypes: float64(1), int64(3), object(5)
memory usage: 140.8+ KB


In [357]:
# check the amount of missing values in the 'gave birth this year' column for the age bins (17-28, 28-38, 38-49, 49-65, 65-93) for females
female_data = data_test[data_test['sex'] == 'Female']
female_data['gave birth this year'][female_data['gave birth this year'].isnull()].groupby(
    pd.cut(female_data['age'], bins=[0, 28, 38, 49, 65, 93]), observed=False).size()

age
(0, 28]       0
(28, 38]      0
(38, 49]      0
(49, 65]    230
(65, 93]     55
Name: gave birth this year, dtype: int64

We impute the missing values in the 'ability to speak english' and 'gave birth this year' columns in the same manner as before.

In [358]:
data_test['ability to speak english'] = data_test['ability to speak english'].fillna(0)
data_test['gave birth this year'] = data_test['gave birth this year'].fillna('No')

### Test Data Distribution Discrepancy

Using the KL divergence to measure the distribution discrepancy between the training and test sets, we want to identify the features that have the highest discrepancy and calculate the mean discrepancy between the two datasets.

In [359]:
def compute_kl_divergence(train_set: pd.DataFrame, test_set: pd.DataFrame, numerical_features: list[str]):
    """
    Compute KL divergence between the distributions of features in the training and test sets.
    :param train_set: DataFrame containing features of the training set.
    :param test_set: DataFrame containing features of the test set.
    :param numerical_features: List of numerical features.
    :return: Array of KL divergence values for each feature in Series format.
    """

    kl_divergences = []
    for feature in test_set.columns:  # Use test set columns since it has no target label
        if feature not in numerical_features:  # Handle categorical features
            train_dist = train_set[feature].value_counts(normalize=True)
            test_dist = test_set[feature].value_counts(normalize=True)

            # Ensure all possible values are represented in both distributions
            all_values = set(train_dist.index) | set(test_dist.index)
            for value in all_values:
                if value not in train_dist.index:
                    train_dist[value] = 0.000001  # Add a small non-zero count for missing value
                if value not in test_dist.index:
                    test_dist[value] = 0.000001  # Add a small non-zero count for missing value
        else:  # Handle numerical features
            train_dist, _ = np.histogram(train_set[feature], bins=10, density=True)
            test_dist, _ = np.histogram(test_set[feature], bins=10, density=True)

        assert train_dist.shape == test_dist.shape

        # Normalizing
        train_dist /= np.sum(train_dist)
        test_dist /= np.sum(test_dist)

        # KL divergence
        kl_divergence = entropy(train_dist, test_dist)
        kl_divergences.append(kl_divergence)

    return pd.Series(kl_divergences, index=test_set.columns)


def aggregate_kl_divergence(kl_divergences: np.ndarray | pd.Series):
    """
    Aggregate KL divergence values across all features.
    :param kl_divergences: Array of KL divergence values for each feature.
    :return: Overall measure of distribution discrepancy (in this case the mean value).
    """
    if isinstance(kl_divergences, np.ndarray):
        return np.mean(kl_divergences)
    elif isinstance(kl_divergences, pd.Series):
        return kl_divergences.mean()

In [360]:
kl_divergences = compute_kl_divergence(data_original, data_test, ['age', 'workinghours'])
kl_divergences_df = pd.DataFrame({'KL Divergence': kl_divergences})
kl_divergences_df = kl_divergences_df.sort_values(by='KL Divergence', ascending=False)
kl_divergences_df

Unnamed: 0,KL Divergence
workinghours,0.29094
education,0.06856
age,0.047542
occupation,0.028383
sex,0.017073
workclass,0.008632
ability to speak english,0.002802
marital status,0.001831
gave birth this year,1.9e-05


In [361]:
aggregate_kl_divergence(kl_divergences)

0.051753484383575445

### Test Data Feature Encoding

Here, we encode the features of the test data using the same encoding as the training data. We exclude the columns that were not used in the model.

In [362]:
# drop columns that our model does not use
columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
data_test_sexexcl = data_test.drop(columns=columns_to_exclude)

In [363]:
nominal_features_lc = list(
    {'workclass', 'marital status', 'gave birth this year', 'sex', 'occupation'} - set(
        columns_to_exclude))  # low cardinality features
nominal_features_hc = []

# Encoded test set
data_test_encoded = encode_nominal_features(data_test_sexexcl, nominal_features_lc, nominal_features_hc)

### Generate predictions for test data using the best model

In [364]:
# load the best model
model = load_model('../output/saved_models/dt_model_sexexcl_ffs_tuned_fair.joblib')

In [365]:
# Get the feature names used during training
train_features = model.feature_names_in_
data_test_reordered = data_test_encoded[train_features]
data_test_encoded = data_test_reordered
# predict the test data
y_pred = model.predict(data_test_encoded)
# save the predictions
y_pred = pd.DataFrame(y_pred, columns=['income'])
save_dataset(y_pred, '../output/test_predictions/best_model_predictions.xlsx', index=True)

In [366]:
p = load_dataset('../output/test_predictions/best_model_predictions.xlsx')
print(len(p), len(y_pred)) # the lengths should be the same
# see if the predictions were saved correctly
differing_values = (y_pred['income'] != p['income']).sum()

2000 2000


In [367]:
differing_values

0

### Inspection of the predictions

In [368]:
# Count amount of low (0) and high (1) income predictions
predicted_1 = (y_pred == 1).sum()
predicted_0 = (y_pred == 0).sum()

# Create a DataFrame
predictions_df = pd.DataFrame({
    'Prediction': [1, 0],
    'Count': [predicted_1, predicted_0]
})

In [369]:
y_pred.value_counts()

income
1         1051
0          949
Name: count, dtype: int64

In [370]:
# Filter predictions for males
male_predictions = y_pred[data_test['sex'] == 'Male']

male_predicted_1 = (male_predictions == 1).sum()
male_predicted_0 = (male_predictions == 0).sum()

# Filter predictions for females
female_predictions = y_pred[data_test['sex'] == 'Female']

female_predicted_1 = (female_predictions == 1).sum()
female_predicted_0 = (female_predictions == 0).sum()

predictions_df_mf = pd.DataFrame({
    'Sex': ['Male', 'Female'],
    'Predicted_1': [male_predicted_1, female_predicted_1],
    'Predicted_0': [male_predicted_0, female_predicted_0]
})

In [371]:
predictions_df_mf

Unnamed: 0,Sex,Predicted_1,Predicted_0
0,Male,income 666 dtype: int64,income 487 dtype: int64
1,Female,income 385 dtype: int64,income 462 dtype: int64


In [372]:
from sklearn.model_selection import train_test_split
from helper.helper_functions import encode_all_features
from helper.helper_functions import get_features_and_target

columns_to_exclude = ['sex', 'gave birth this year', 'marital status']
X_original, y_original = get_features_and_target(data_original, 'income')
X_original = X_original.drop(columns=columns_to_exclude)
X_original_encoded, y_original_encoded = encode_all_features(X_original, y_original, columns_to_exclude)
X_train, X_test, y_train, y_test = train_test_split(X_original_encoded, y_original_encoded, test_size=0.2, random_state=42)

### Estimating the accuracy

In [373]:
from sklearn.metrics import accuracy_score
from itertools import product

def compute_accuracy(train_set, test_set, model, features):
    """
    Compute the accuracy of the model on the test set using feature value distributions.
    
    Args:
        train_set (pd.DataFrame): DataFrame containing features of the original dataset.
        test_set (pd.DataFrame): DataFrame containing features of the test set.
        model: Trained classification model.
        features (list): List of feature names to consider.
    
    Returns:
        accuracy (float): Estimated accuracy of the model on the test set.
    """
    # Get unique values for each feature from the original dataset
    feature_values = {}
    for feature in features:
        feature_values[feature] = train_set[feature].unique()
    
    # Compute accuracy for each combination of feature values
    total_weight = 0
    total_accuracy = 0
    for values in product(*feature_values.values()):
        # Filter rows with the specific combination of feature values
        train_subset = train_set.copy()
        test_subset = test_set.copy()
        for feature, value in zip(features, values):
            train_subset = train_subset[train_subset[feature] == value]
            test_subset = test_subset[test_subset[feature] == value]
        
        # Calculate accuracy on the original dataset
        original_accuracy = accuracy_score(train_subset['income'], model.predict(train_subset.drop(columns=['income'])))
        
        # # Calculate accuracy on the test set
        # test_accuracy = model.evaluate(test_subset.drop(columns=['label']), test_subset['label'])
        dist_1 = len(train_subset)
        dist_2 = len(test_subset)
        
        # Compute weight based on KL divergence between feature value distributions
        weight = compute_kl_divergence(train_subset, test_subset)
        
        # Update total accuracy and weight
        total_weight += weight
        total_accuracy += weight * original_accuracy
    
    # Normalize total accuracy by total weight
    accuracy = total_accuracy / total_weight
    return accuracy

# estimated_accuracy = compute_accuracy(data_original, data_test, model, ['workinghours', 'education', 'age'])

In [374]:
def compute_accuracy_by_feature_value(model, feature_name, X_test, y_test):
    """
    Compute the accuracy of the model on the test set for each unique value of a feature.
    :param model: 
    :param feature_name: 
    :param feature_values: 
    :param X_test: 
    :param y_test: 
    :return: 
    """
    feature_values = X_test[feature_name].unique()
    accuracies = {}
    for value in feature_values:
        # Filter the data for the current feature value
        subset_data = X_test[X_test[feature_name] == value]
        # Get the probability of the current feature value
        prob = len(subset_data) / len(X_test)
        y_subset = y_test.loc[subset_data.index]
        
        # Get the feature names used during training
        train_features = model.feature_names_in_
        data_test_reordered = subset_data[train_features]
        subset_data = data_test_reordered
    
        y_pred = model.predict(subset_data)
        accuracy = accuracy_score(y_subset, y_pred)
        
        accuracies[value] = prob, accuracy
        
    assert 0.99 < sum([prob for prob, _ in accuracies.values()]) <= 1
    return accuracies

In [375]:
feature_name = 'workinghours'
feature_values = X_test[feature_name].unique()
accuracy_by_value = compute_accuracy_by_feature_value(model, feature_name, X_test, y_test)
sum_of_prob = sum([accuracy_by_value[key][0] for key in accuracy_by_value])
# get the weighted accuracy by summing the product of the probability and accuracy
weighted_accuracy = sum([accuracy_by_value[key][0] * accuracy_by_value[key][1] for key in accuracy_by_value]) / sum_of_prob
print("Weighted Accuracy:", weighted_accuracy)
# print(accuracy_by_value)

Weighted Accuracy: 0.7616666666666664


In [376]:
# Next, calculate the probability distribution of feature values in X_test
feature_distribution = data_test[feature_name].value_counts(normalize=True)
# print(feature_distribution)

weighted_accuracy = 0
for value in feature_values:
    # Get the probability and accuracy of the current feature value in X_test
    prob = feature_distribution.get(value, 0.0000001)
    accuracy = accuracy_by_value.get(value, (prob, weighted_accuracy))[1]  # 0 as default accuracy if value not found
    # Update the weighted accuracy by adding the product of probability and accuracy
    weighted_accuracy += prob * accuracy

# Finally, print or return the weighted accuracy
print("Weighted Accuracy:", weighted_accuracy)

Weighted Accuracy: 0.7048779149273225
