In [133]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector

In [134]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html#:~:text=This%20Sequential%20Feature%20Selector%20adds,validation%20score%20of%20an%20estimator.
def seq_feat_selection(model, X_train: pd.DataFrame, y_train: pd.Series, direction: str = "backwards", scoring: str = "accuracy"):
    """
    Run scikit-learn's SequentialFeatureSelector on the training data and report the selected features.
    :param model: unfitted estimator the selector evaluates with cross-validation
    :param X_train: training features; the column names are used in the report
    :param y_train: training target
    :param direction: "forward" or "backward"; the plural spellings "forwards"/"backwards" are accepted too
    :param scoring: scoring name passed to the selector
    :return: list with the names of the selected feature columns
    """
    # SequentialFeatureSelector only accepts "forward"/"backward"; map the plural
    # spellings (including this function's historical default) onto valid values.
    direction_aliases = {"forwards": "forward", "backwards": "backward"}
    direction = direction_aliases.get(direction, direction)

    # Create Sequential Feature Selector
    sfs = SequentialFeatureSelector(model,
                                    direction=direction,
                                    scoring=scoring,
                                    cv=StratifiedKFold())

    # Fit the feature selector to training data
    sfs.fit(X_train, y_train)

    # get_support() is a boolean mask over the columns of X_train
    selected_features = list(X_train.columns[sfs.get_support()])

    print("Selected features:")
    for i, feature in enumerate(selected_features):
        print(f"Feature {i+1}: {feature}")

    # Returned so callers can reuse the selection instead of parsing the printout
    return selected_features

In [135]:
# https://medium.com/@agrawalsam1997/hyperparameter-tuning-of-knn-classifier-a32f31af25c7
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

def tune_knn(model, X_train: pd.DataFrame, y_train: pd.Series, X_test, y_test, param_grid) -> Tuple[dict, KNeighborsClassifier, float]:
    """
    Grid-search the hyperparameters of `model` and score the winning estimator on held-out data.
    :param model: estimator handed to GridSearchCV
    :param X_train: features the grid search is fitted on
    :param y_train: target the grid search is fitted on
    :param X_test: features used to score the best estimator
    :param y_test: target used to score the best estimator
    :param param_grid: parameter grid explored by GridSearchCV
    :return: (best parameters, best estimator, score of the best estimator on X_test / y_test)
    """
    # 5-fold cross-validated search over every combination in the grid
    search = GridSearchCV(model, param_grid=param_grid, cv=5, verbose=1)
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    return search.best_params_, winner, winner.score(X_test, y_test)

In [136]:
def forward_feat_selection_hypertuning(X_encoded: pd.DataFrame, y_encoded: pd.Series) -> Tuple[List[str], dict, float]:
    """
    Greedy forward feature selection with hyperparameter tuning for K-Nearest Neighbors.

    In every round each remaining feature group is added in turn to the current subset, a KNN is
    tuned on a training split of that candidate subset and scored on the matching validation split.
    The group with the highest validation score is kept. The search stops when no remaining group
    improves on the best score so far, or when all groups have been added.

    :param X_encoded: features dataset, with features encoded
    :param y_encoded: target dataset, with target encoded
    :return: (best subset of features, best hyperparameters, best model accuracy)
    """
    best_subset: List[str] = []
    best_params: dict = None
    best_score: float = 0.0

    # Candidate groups: the encoded columns of one categorical feature are added or left out together
    remaining_features = [['age'],
                          ['education'],
                          ['workinghours'],
                          [col for col in X_encoded.columns if col.startswith('workclass')],
                          [col for col in X_encoded.columns if col.startswith('marital status')],
                          [col for col in X_encoded.columns if col.startswith('occupation')]]
    # A prefix that matches no column gives an empty group, which cannot change a subset
    remaining_features = [group for group in remaining_features if group]

    param_grid = {
                'n_neighbors': np.arange(2, 30, 1),
                'weights': ['uniform', 'distance'],
                'p': [1, 2]
    }

    while remaining_features:
        subset_scores = []
        subset_params = []

        for feature_cat in remaining_features:
            # Combine the current best subset with the new feature group (always a new list)
            current_subset = best_subset + feature_cat

            X_subset = X_encoded[current_subset]
            X_train, X_val, y_train, y_val = train_test_split(X_subset, y_encoded, test_size=0.2, random_state=42)

            # Tune on the training split only: fitting on the full subset would put the
            # validation rows into the training data and inflate the validation score.
            candidate_params, _, score = tune_knn(KNeighborsClassifier(), X_train, y_train, X_val, y_val, param_grid)

            subset_scores.append(score)
            subset_params.append(candidate_params)

        # Select the feature group that performs best in this round
        best_index = subset_scores.index(max(subset_scores))

        # Always accept the first round; afterwards stop once nothing improves the score,
        # so the returned subset/params/score all describe the best model found.
        if best_subset and subset_scores[best_index] <= best_score:
            break

        best_score = subset_scores[best_index]
        best_params = subset_params[best_index]
        best_feature = remaining_features[best_index]

        # Update the best subset and remaining features
        best_subset = best_subset + best_feature
        del remaining_features[best_index]

        print("Best subset:", best_subset)
        print("Remaining features:", remaining_features)

    return best_subset, best_params, best_score

In [137]:
"""
Both functions below are heavily based on the following example from the scikit-learn documentation:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py
"""

def multi_metric_cv(model, scoring, X_train, y_train, param_grid, refit="AUC"):
    """
    Run a GridSearchCV that evaluates several scorers at once and return its result table.
    :param model: estimator to tune
    :param scoring: scorers passed to GridSearchCV (e.g. a dict mapping a display name to a scorer)
    :param X_train: features the search is fitted on
    :param y_train: target the search is fitted on
    :param param_grid: parameter grid explored by the search
    :param refit: name of the scorer used to refit the best estimator
    :return: the `cv_results_` mapping of the fitted search (train scores included)
    """
    search_options = dict(
        param_grid=param_grid,
        scoring=scoring,
        refit=refit,
        n_jobs=2,
        return_train_score=True,
    )
    search = GridSearchCV(model, **search_options)

    # fit() returns the fitted search object itself
    return search.fit(X_train, y_train).cv_results_
    
def plot_multi_score_cv_results(x_label, x_min_val, x_max_val, y_min_val, y_max_val, results, param_name, param_type, scoring):
    """
    Draw train/test score curves for every scorer of a multi-metric GridSearchCV run.
    :param x_label: label of the x axis
    :param x_min_val: lower x-axis limit
    :param x_max_val: the upper x-axis limit is set to this value minus one
    :param y_min_val: lower y-axis limit
    :param y_max_val: upper y-axis limit
    :param results: `cv_results_` mapping with mean/std train and test scores and test ranks per scorer
    :param param_name: key in `results` holding the varied parameter values (a masked array)
    :param param_type: dtype the parameter values are converted to for plotting
    :param scoring: the scorers used in the search; its sorted keys select the result columns
    :return: None, the figure is shown
    """
    plt.figure(figsize=(13, 13))
    plt.title("GridSearchCV evaluating using multiple scorers simultaneously", fontsize=16)
    plt.xlabel(x_label)
    plt.ylabel("Score")

    axes = plt.gca()
    axes.set_xlim(x_min_val, x_max_val-1)
    axes.set_ylim(y_min_val, y_max_val)

    # Plain numpy array taken from the MaskedArray stored in the results
    param_values = np.array(results[param_name].data, dtype=param_type)
    palette = ["g", "b", "r", "y", "m", "k"]
    # (sample, line style, alpha of the +-std band, alpha of the mean line)
    sample_styles = (("train", "--", 0, 0.7), ("test", "-", 0.1, 1))

    # zip stops at the shorter input, so at most len(palette) scorers are drawn
    for scorer, color in zip(sorted(scoring), palette):
        for sample, style, band_alpha, line_alpha in sample_styles:
            mean_scores = results["mean_%s_%s" % (sample, scorer)]
            std_scores = results["std_%s_%s" % (sample, scorer)]
            axes.fill_between(
                param_values,
                mean_scores - std_scores,
                mean_scores + std_scores,
                alpha=band_alpha,
                color=color,
            )
            axes.plot(
                param_values,
                mean_scores,
                style,
                color=color,
                alpha=line_alpha,
                label="%s (%s)" % (scorer, sample),
            )

        # First parameter setting ranked 1 on the test score of this scorer
        top_index = np.nonzero(results["rank_test_%s" % scorer] == 1)[0][0]
        top_score = results["mean_test_%s" % scorer][top_index]
        top_x = param_values[top_index]

        # Dotted vertical line up to the best score, marked by x
        axes.plot(
            [top_x, top_x],
            [0, top_score],
            linestyle="-.",
            color=color,
            marker="x",
            markeredgewidth=3,
            ms=8,
        )

        # Write the best score just above its marker
        axes.annotate("%0.2f" % top_score, (top_x, top_score + 0.005))

    plt.legend(loc="best")
    plt.grid(False)
    plt.show()

### Loading the cleaned dataset

In [138]:
# Load the cleaned income dataset; the path is relative, so it depends on the notebook's working directory
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [139]:
# Splitting the data into features (X) and target (y); 'income' is passed as the target column name
X, y = get_features_and_target(data, 'income')
# Column names handed to encode_all_features as its exclusion list
columns_to_exclude = ['ability to speak english', 'gave birth this year']
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)
# Splitting the dataset into training (80%) and testing (20%) sets; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

### Model

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [140]:
# K-Nearest Neighbors model
# Baseline configuration: 5 neighbours, Minkowski metric with p=2 (i.e. Euclidean distance)
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn_model.fit(X_train, y_train)
# Predictions
knn_preds = knn_model.predict(X_test)
# Accuracy evaluation
knn_accuracy = accuracy_score(y_test, knn_preds)

# Per-class precision / recall / F1 report, followed by the overall test accuracy
print(classification_report(y_test, knn_preds))
print("K-Nearest Neighbors Accuracy:", knn_accuracy)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1175
           1       0.62      0.62      0.62       625

    accuracy                           0.74      1800
   macro avg       0.71      0.71      0.71      1800
weighted avg       0.74      0.74      0.74      1800

K-Nearest Neighbors Accuracy: 0.7388888888888889


In [141]:
# Sequential feature selection is skipped unless this flag is set to True
seq_feat_sel = False
if seq_feat_sel:
    # SequentialFeatureSelector only accepts direction "forward" or "backward",
    # so a valid value is passed explicitly (backward elimination).
    seq_feat_selection(KNeighborsClassifier(), X_train, y_train, direction="backward")

### Hyperparameter tuning

In [142]:
# The hyperparameter search is skipped unless this flag is set to True
tune = False
if tune:
    # Grid: k = 2..29, uniform vs. distance-weighted votes, Minkowski power p = 1 or 2
    param_grid = {
                'n_neighbors': np.arange(2, 30, 1),
                'weights': ['uniform', 'distance'],
                'p': [1, 2]
    }
    
    # The search is fitted on the training split; the returned accuracy is measured on the test split
    best_params, best_model, best_accuracy = tune_knn(KNeighborsClassifier(), X_train, y_train, X_test, y_test, param_grid)
    
    print("Best Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

Best Hyperparameters: {'n_neighbors': 28, 'p': 1, 'weights': 'uniform'}, Best Model Accuracy: 0.7777777777777778

https://www.geeksforgeeks.org/feature-selection-techniques-in-machine-learning/


In [143]:
# The switch is called `run_multi_metric_cv` because a flag named `multi_metric_cv` would overwrite
# the function of the same name, and the call below would then try to call a bool.
run_multi_metric_cv = False
if run_multi_metric_cv:
    # Define the scorers
    scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

    # n_neighbors values 2..30 (range excludes the upper bound)
    n_neighbours_param_min = 2
    n_neighbours_param_max = 31
    param_grid = {"n_neighbors": range(n_neighbours_param_min, n_neighbours_param_max, 1)}

    results = multi_metric_cv(KNeighborsClassifier(), scoring, X_train, y_train, param_grid)
    plot_multi_score_cv_results("n_neighbors", n_neighbours_param_min, n_neighbours_param_max, 0.7, 1, results, "param_n_neighbors", int, scoring)

In [144]:
# Forward feature selection with hyperparameter tuning
# Skipped unless this flag is set to True (named switch, like the other optional cells)
forward_select = False
if forward_select:
    # Selection and tuning only see the training split; the test split is kept for the final check
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(X_train, y_train)

    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)

    # Use the best subset and best hyperparameters for final model
    final_model = KNeighborsClassifier(**best_params)
    final_model.fit(X_train[best_subset], y_train)
    final_model_preds = final_model.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)

    print(classification_report(y_test, final_model_preds))
    # Report the test accuracy that was computed above but previously never shown
    print("Final model accuracy:", final_model_accuracy)

Feature Selection: Ensure that sensitive attributes such as sex are not included as features in the model. This helps prevent the model from directly learning discriminatory patterns based on sensitive attributes.

Fair Representation: Balance the representation of different groups within the dataset. Ensure that the dataset is representative of the population with respect to sex and other sensitive attributes. This can help mitigate bias in the model's predictions.

Fair Preprocessing: Apply preprocessing techniques that aim to mitigate bias in the dataset. For example, use techniques such as reweighting or resampling to balance the dataset with respect to sensitive attributes.

Reweighing:
Adjust the weights of instances in your dataset to balance the impact of different groups.
Assign higher weights to underrepresented groups (e.g., females) and lower weights to overrepresented groups (e.g., males).
This helps reduce bias during model training.
Suppression:
Remove or suppress the sensitive attribute (‘sex’) from the dataset.
Train your model without using this attribute.
Note that this approach may lead to information loss, so use it cautiously.
Massaging the Dataset:
Modify class labels to achieve fairness.
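For instance, relabel a small number of instances close to the decision boundary: change the income label of selected instances of the disadvantaged group (e.g., females) from low to high, and of selected instances of the advantaged group (e.g., males) from high to low. The sensitive attribute (‘sex’) itself is left unchanged.
This weakens the association between the sensitive attribute and the target, so the model is less likely to learn discriminatory patterns from it.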
Resampling Techniques:
Oversample the underrepresented group (e.g., females) or undersample the overrepresented group (e.g., males).
This balances the class distribution and reduces bias.


Model Evaluation: Evaluate the model's performance and fairness across different subgroups defined by sensitive attributes such as sex. Use metrics such as disparate impact ratio, equal opportunity difference, or predictive parity to assess fairness.

In [145]:
from sklearn.metrics import confusion_matrix


def _collapse_sex_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Replace the 'sex_Male' / 'sex_Female' column pair by a single 'sex' column equal to sex_Male * 1."""
    if 'sex' in frame.columns:
        # Already collapsed by an earlier run of this cell: re-running must not raise a KeyError
        return frame
    # drop() + assign() build a new frame instead of assigning into the object returned by
    # train_test_split, which can trigger pandas' SettingWithCopyWarning.
    # assumes 'sex_Male' is a boolean / 0-1 indicator, so 1 = male and 0 = female - TODO confirm
    return frame.drop(columns=['sex_Male', 'sex_Female']).assign(sex=frame['sex_Male'] * 1)


X_test = _collapse_sex_columns(X_test)
X_train = _collapse_sex_columns(X_train)

# Train your KNN model
# NOTE(review): 'sex' is still one of the model's input features here
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Get predicted labels from the KNN model
y_pred = knn_model.predict(X_test)

# calculate DI:
# DI = (num_positives(privileged=False) / num_instances(privileged=False)) / (num_positives(privileged=True) / num_instances(privileged=True))
is_unprivileged = (X_test['sex'] == 0).to_numpy()
is_privileged = (X_test['sex'] == 1).to_numpy()
num_instances_priv_false = int(is_unprivileged.sum())
num_instances_priv_true = int(is_privileged.sum())

# Calculate the number of positive outcomes for each group
num_positives_priv_false = int(np.sum((y_pred == 1) & is_unprivileged))
num_positives_priv_true = int(np.sum((y_pred == 1) & is_privileged))

# Positive-prediction rate per group; NaN when a group is absent from the test set
rate_priv_false = num_positives_priv_false / num_instances_priv_false if num_instances_priv_false else float('nan')
rate_priv_true = num_positives_priv_true / num_instances_priv_true if num_instances_priv_true else float('nan')

# Calculate the Disparate Impact (DI) criterion; undefined (NaN) when the privileged group has no positive predictions
DI = rate_priv_false / rate_priv_true if rate_priv_true else float('nan')

print(f"Disparate Impact (DI): {DI:.2f}")

# # Get predicted labels for females and males
# y_pred_female = y_pred[X_test['sex'] == 0]
# y_pred_male = y_pred[X_test['sex'] == 1]
# 
# # Calculate the probability of predicted positive outcomes for females and males
# prob_positive_female = np.mean(y_pred_female)
# prob_positive_male = np.mean(y_pred_male)
# 
# # Calculate the Disparate Impact (DI) criterion
# DI = prob_positive_female / prob_positive_male
# 
# print(f"Disparate Impact (DI): {DI:.2f}")

# # Make predictions on the test set
# y_pred = knn_model.predict(X_test)
# 
# # Create a confusion matrix
# conf_matrix = confusion_matrix(y_test, y_pred)
# 
# # Extract counts for each group (male and female)
# male_count = np.sum(X_test['sex_Male'])
# female_count = np.sum(X_test['sex_Female'])
# 
# # Calculate favorable outcomes (e.g., 'income' = 1) for each group
# male_favorable = conf_matrix[1, 1]
# female_favorable = conf_matrix[0, 1]
# 
# # Calculate disparate impact
# disparate_impact = (female_favorable / female_count) / (male_favorable / male_count)
# 
# # Print the result
# print(f"Disparate Impact: {disparate_impact:.2f}")

# Interpretation:
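# - If disparate impact = 1, both groups receive the favorable prediction at the same rate.
# - If disparate impact > 1, the unprivileged group (sex == 0) receives the favorable prediction more often than the privileged group (sex == 1).
# - If disparate impact < 1, the privileged group (sex == 1) receives the favorable prediction more often.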

# You can set a threshold (e.g., 0.8) to determine fairness based on your context.

Disparate Impact (DI): 0.49


In [146]:
# Compute the matrix in this cell: its only other assignment in the notebook is commented out,
# so on a fresh kernel `conf_matrix` would otherwise be undefined.
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix # Thus in binary classification, the count of true negatives is C0,0, false negatives is C1,0, true positives is C1,1 and false positives is C0,1

array([[940, 235],
       [235, 390]], dtype=int64)

In [147]:
# Disparate Impact Ratio
from aif360.sklearn.metrics import disparate_impact_ratio
disparate_impact = disparate_impact_ratio(y_true=y_test, y_pred=y_pred, prot_attr=X_test[['sex_Male', 'sex_Female']].to_numpy())
print("Disparate Impact Ratio:", disparate_impact)


KeyError: "None of [Index(['sex_Male', 'sex_Female'], dtype='object')] are in the [columns]"

In [None]:
# from aif360.sklearn.metrics import statistical_parity_difference
# 
# # encode the 'sex' feature
# X['sex'] = X['sex'].map({'Male': 0, 'Female': 1})
# y = y.map({'low': 0, 'high': 1})
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 
# # Train your KNN model
# knn_model = KNeighborsClassifier()
# knn_model.fit(X_train, y_train)
# 
# # Predictions
# knn_preds = knn_model.predict(X_test)
# 
# # Calculate fairness metrics using model predictions and specify 'sex' as the sensitive attribute
# spd_sex = statistical_parity_difference(y_true=y_test, y_pred=knn_preds, sensitive_features=X_test['sex'])
# eo = equal_opportunity_difference(y_true=y_test, y_pred=knn_preds, sensitive_features=X_test['sex'])
# aod = average_odds_difference(y_true=y_test, y_pred=knn_preds, sensitive_features=X_test['sex'])
# apv = average_predictive_value_difference(y_true=y_test, y_pred=knn_preds, sensitive_features=X_test['sex'])
# 
# print("Statistical Parity Difference:", spd_sex)
# print("Equal Opportunity Difference:", eo)
# print("Average Odds Difference:", aod)
# print("Average Predictive Value Difference:", apv)

In [None]:
# Saving the model
# `final_model` only exists when the forward-selection cell has been enabled and run;
# otherwise fall back to the most recently fitted `knn_model` instead of raising a NameError.
model_to_save = globals().get('final_model', knn_model)
print("Saving model:", model_to_save)
save_model(model_to_save, '../output/saved_models/knn_model.joblib')