In [None]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.neighbors import KNeighborsClassifier

### Loading the cleaned dataset

In [69]:
# Relative path of the cleaned income spreadsheet used throughout the notebook
cleaned_dataset_path = '../data/assignment2_income_cleaned.xlsx'
data = load_dataset(cleaned_dataset_path)

In [70]:
# Columns handed to encode_all_features as "to exclude" (the exact handling
# lives in the helper module)
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']

# Separate the 'income' target (y) from the remaining feature columns (X)
X, y = get_features_and_target(data, 'income')

# Encode the features and the target in one pass
X_encoded, y_encoded = encode_all_features(X, y, columns_to_exclude)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

### Model

In [71]:
# Baseline K-Nearest Neighbors classifier: 5 neighbours, Minkowski metric with
# p=2 (i.e. Euclidean distance). fit() returns the estimator itself, so the
# model can be built and trained in a single expression.
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
knn_model

In [72]:
# Score the fitted baseline model on the held-out test split
knn_preds = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_preds)

# Per-class precision / recall / F1 first, then the overall accuracy
print(classification_report(y_true=y_test, y_pred=knn_preds))
print("K-Nearest Neighbors Accuracy:", knn_accuracy)

              precision    recall  f1-score   support

           0       0.80      0.79      0.80      1175
           1       0.62      0.63      0.62       625

    accuracy                           0.74      1800
   macro avg       0.71      0.71      0.71      1800
weighted avg       0.74      0.74      0.74      1800

K-Nearest Neighbors Accuracy: 0.7366666666666667


### Hyperparameter tuning

In [73]:
from sklearn.model_selection import GridSearchCV
import numpy as np
# https://medium.com/@agrawalsam1997/hyperparameter-tuning-of-knn-classifier-a32f31af25c7

# The grid search below only runs when `tune` is True; it is switched off here
# (presumably because the search is slow - 112 parameter combinations x 5 folds).
tune = False
if tune:
    # Candidate values for each KNN hyperparameter
    param_grid = {
        'n_neighbors': np.arange(2, 30, 1),
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    }

    # Exhaustive search over the grid with 5-fold cross-validation on the training split
    knn = KNeighborsClassifier()
    knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, verbose=1).fit(X_train, y_train)

    # Winning parameter combination and the estimator refit with it
    best_params, best_model = knn_cv.best_params_, knn_cv.best_estimator_

    # Accuracy of the refit estimator on the held-out test split
    accuracy = best_model.score(X_test, y_test)

    for label, value in (
        ("Best Hyperparameters:", best_params),
        ("Best Model:", best_model),
        ("Best Model Accuracy:", accuracy),
    ):
        print(label, value)

Result recorded from an earlier run with `tune = True` — Best Hyperparameters: {'n_neighbors': 28, 'p': 1, 'weights': 'uniform'}, Best Model Accuracy: 0.7605555555555555

In [74]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SequentialFeatureSelector

# Backward sequential feature selection wrapped around a default KNN
# classifier, scored by accuracy on stratified k-fold splits
knn = KNeighborsClassifier()
sfs = SequentialFeatureSelector(
    knn,
    scoring='accuracy',
    direction="backward",
    cv=StratifiedKFold(),
)

# Run the selection on the training split only
sfs.fit(X_train, y_train)

# Mask with one entry per column of X_train; truthy entries mark kept features
selected_features = sfs.get_support()
# Positions of the kept features within that mask
selected_feature_indices = [
    idx for idx in range(len(selected_features)) if selected_features[idx]
]

In [75]:
# List the column names kept by the feature selector, numbered from 1
print("Selected features:")
for number, feature in enumerate(X_train.columns[selected_feature_indices], start=1):
    print(f"Feature {number}: {feature}")

Selected features:
Feature 1: education
Feature 2: workinghours
Feature 3: marital status_Husband
Feature 4: marital status_Never married
Feature 5: marital status_Widowed
Feature 6: occupation_Construction/Extraction
Feature 7: occupation_Education
Feature 8: occupation_Finance/Accounting
Feature 9: occupation_Healthcare/Medical Services
Feature 10: occupation_Management/Business
Feature 11: occupation_Office/Administrative Support
Feature 12: occupation_Protective Services
Feature 13: occupation_Repair/Maintenance
Feature 14: occupation_Sales
Feature 15: occupation_Science, Engineering, Technology
Feature 16: workclass_governmental


In [76]:
from itertools import chain, combinations

# Joint search over feature subsets and KNN hyperparameters.
# For every candidate subset of encoded columns, a 5-fold grid search is run on
# the training part, and the refit best estimator is scored on a held-out
# validation part. The best-scoring subset and its hyperparameters are kept in
# `best_subset`, `best_params` and `best_score`.

# Hyperparameter grid evaluated for every candidate feature subset
param_grid = {
        'n_neighbors': np.arange(2, 30, 1),
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
}

# Largest subset size to try. None means every size up to the full column
# count, i.e. 2**n - 1 subsets, each with its own grid search - only practical
# for a handful of columns. Set a small integer (e.g. 2 or 3) to bound the run.
max_subset_size = None

best_score = 0
best_subset = None
best_params = None

# Split once, outside the loop. With a fixed random_state the row partition
# depends only on the number of rows, so this matches splitting each column
# subset separately. Dedicated names are used so the notebook-wide
# X_train / y_train from the original train/test split are not overwritten.
X_search_train, X_search_val, y_search_train, y_search_val = train_test_split(
    X_encoded, y_encoded, test_size=0.2, random_state=42
)

feature_names = list(X_encoded.columns)
if max_subset_size is None:
    largest_size = len(feature_names)
else:
    largest_size = min(max_subset_size, len(feature_names))

# Generate the subsets lazily instead of building the full list up front,
# which would hold an exponential number of tuples in memory.
candidate_subsets = chain.from_iterable(
    combinations(feature_names, k) for k in range(1, largest_size + 1)
)

for subset in candidate_subsets:
    subset_columns = list(subset)

    # Tune the hyperparameters for this subset with 5-fold cross-validation
    knn = KNeighborsClassifier()
    grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
    grid_search.fit(X_search_train[subset_columns], y_search_train)

    # Score the refit best estimator on the validation part
    score = grid_search.score(X_search_val[subset_columns], y_search_val)

    # Keep the subset only if it strictly improves on the best so far
    if score > best_score:
        best_score = score
        best_subset = subset
        best_params = grid_search.best_params_

KeyError: "None of [Index(['workclass'], dtype='object')] are in the [columns]"

In [ ]:
# Report the outcome of the feature-subset / hyperparameter search
for label, value in (
    ("Best subset of features:", best_subset),
    ("Best hyperparameters:", best_params),
    ("Best model accuracy:", best_score),
):
    print(label, value)

In [ ]:
# Use the best subset and best hyperparameters for the final model.
# Fail early with a clear message if the search cell has not produced a result.
if best_subset is None or best_params is None:
    raise RuntimeError(
        "No best feature subset / hyperparameters available; "
        "run the feature-subset search cell first."
    )

# Instantiate the model with the winning hyperparameters (the original cell
# left an Ellipsis placeholder here, which has no .fit method)
final_model = KNeighborsClassifier(**best_params)

# best_subset holds column names of the encoded frame, so it must index
# X_encoded (not the raw X); the encoded target keeps the labels consistent
# with the other models in this notebook.
final_X = X_encoded[list(best_subset)]
final_model.fit(final_X, y_encoded)

In [None]:
# Persist the fitted model to disk.
# NOTE(review): this saves `knn_model` (the untuned baseline), not the tuned
# final model - confirm that is intended.
model_output_path = '../output/saved_models/knn_model.joblib'
save_model(knn_model, model_output_path)