In [7]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestClassifier

### Loading the cleaned dataset

In [8]:
data = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [9]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, [])
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

### Model

In [11]:
# Random Forest Classifier model
dt_model = RandomForestClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_preds)

print(classification_report(y_test, dt_preds))
print("Decision Tree Accuracy:", dt_accuracy)

              precision    recall  f1-score   support

           0       0.81      0.83      0.82      1175
           1       0.66      0.62      0.64       625

    accuracy                           0.76      1800
   macro avg       0.73      0.73      0.73      1800
weighted avg       0.76      0.76      0.76      1800

Decision Tree Accuracy: 0.7577777777777778


### Hyperparameter tuning

In [12]:
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'random_state': [42]
}

best_params, best_model, best_accuracy = tune_model(RandomForestClassifier(), X_train, y_train, X_test, y_test, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 1350 candidates, totalling 6750 fits

Best Hyperparameters: {'bootstrap': True, 'criterion': 'log_loss', 'max_depth': 40, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200, 'random_state': 42}
Best Model: RandomForestClassifier(criterion='log_loss', max_depth=40, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=200, random_state=42)
Best Model Accuracy: 0.7888888888888889


In [ ]:
# Forward feature selection with hyperparameter tuning
if False:
    param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'random_state': [42]
}
    
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(RandomForestClassifier(), param_grid, X_train, y_train, X_test, y_test)
    
    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)
    
    # Use the best subset and best hyperparameters for final model
    final_model = RandomForestClassifier(**best_params)
    final_model.fit(X_train[best_subset], y_train)
    final_model_preds = final_model.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)
    
    print(classification_report(y_test, final_model_preds))

In [13]:
# save model
save_model(dt_model, '../output/saved_models/random_forest_model.joblib')