In [30]:
from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.tree import DecisionTreeClassifier

### Loading the cleaned dataset

In [31]:
# Read the cleaned income dataset from the relative ../data folder.
dataset_path = '../data/assignment2_income_cleaned.xlsx'
data = load_dataset(dataset_path)

In [32]:
# Separate the predictors (X) from the 'income' target column (y).
X, y = get_features_and_target(data, 'income')

# NOTE(review): this list is defined but not used in this cell — the
# encode_all_features call below receives an empty list instead. Confirm
# whether it was meant to be passed as the third argument.
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']

# Encode the features and the target (third argument is an empty list).
X_encoded, y_encoded = encode_all_features(X, y, [])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Preview the first five rows of the encoded training features.
X_train.head(5)

Unnamed: 0,age,education,workinghours,ability to speak english,occupation_Construction/Extraction,occupation_Counseling/Mental Health Services,occupation_Education,occupation_Entertainment,"occupation_Farming, Fishing, Forestry",occupation_Finance/Accounting,...,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,gave birth this year_No,gave birth this year_Yes,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed
6317,22,16,36,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
740,61,22,40,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3781,48,16,40,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
7850,62,18,65,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2963,53,19,44,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


### Baseline model

In [34]:
# Baseline decision tree with otherwise default hyperparameters.
# random_state is pinned (same seed as the train/test split) because the tree
# randomly permutes candidate features at each split; without a seed the
# fitted tree and the metrics printed below can differ from run to run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Score the baseline on the held-out test split.
dt_preds = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_preds)

# Per-class precision / recall / F1, followed by the overall accuracy.
print(classification_report(y_test, dt_preds))
print("Decision Tree Accuracy:", dt_accuracy)

              precision    recall  f1-score   support

           0       0.78      0.79      0.78      1175
           1       0.59      0.58      0.58       625

    accuracy                           0.71      1800
   macro avg       0.68      0.68      0.68      1800
weighted avg       0.71      0.71      0.71      1800

Decision Tree Accuracy: 0.715


In [35]:
# Toggle for the sequential feature selection run; set to False to skip it.
seq_feat_sel = True
if seq_feat_sel:
    # Requests backward-direction selection on the training split.
    # The estimator is seeded (same seed as the rest of the notebook) so that
    # tree construction is not an extra source of run-to-run variation in the
    # reported feature list.
    seq_feat_selection(
        DecisionTreeClassifier(random_state=42),
        X_train,
        y_train,
        direction='backward',
    )

Selected features:
Feature 1: age
Feature 2: education
Feature 3: workinghours
Feature 4: occupation_Construction/Extraction
Feature 5: occupation_Counseling/Mental Health Services
Feature 6: occupation_Education
Feature 7: occupation_Entertainment
Feature 8: occupation_Farming, Fishing, Forestry
Feature 9: occupation_Finance/Accounting
Feature 10: occupation_Healthcare/Medical Services
Feature 11: occupation_Legal Services
Feature 12: occupation_Management/Business
Feature 13: occupation_Military Services
Feature 14: occupation_Office/Administrative Support
Feature 15: occupation_Production/Assembly
Feature 16: occupation_Protective Services
Feature 17: occupation_Repair/Maintenance
Feature 18: occupation_Sales
Feature 19: occupation_Science, Engineering, Technology
Feature 20: occupation_Service/Hospitality
Feature 21: occupation_Transport
Feature 22: sex_Female
Feature 23: sex_Male
Feature 24: marital status_Divorced
Feature 25: marital status_Husband
Feature 26: marital status_Neve

### Hyperparameter tuning

In [36]:
# Candidate hyperparameter values explored for the decision tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [42],  # single value: every candidate tree uses this seed
}

# Run the search; the helper is given both the training and the test split
# and hands back the chosen parameters, the model and its accuracy.
best_params, best_model, best_accuracy = tune_model(
    DecisionTreeClassifier(),
    X_train,
    y_train,
    X_test,
    y_test,
    param_grid,
)

# Report the outcome of the search.
print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'random_state': 42, 'splitter': 'random'}
Best Model: DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, random_state=42,
                       splitter='random')
Best Model Accuracy: 0.7827777777777778


In [37]:
# Forward feature selection with hyperparameter tuning.
# A named toggle replaces the bare `if True:` so this expensive search (a full
# grid search per selection step) can be switched off explicitly, mirroring
# the seq_feat_sel flag used for sequential feature selection.
run_forward_selection = True
if run_forward_selection:
    # Same candidate values as the plain hyperparameter-tuning grid.
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
        'random_state': [42]
    }

    # The helper is given both the training and the test split.
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(
        DecisionTreeClassifier(), param_grid, X_train, y_train, X_test, y_test
    )

    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)

    # Refit a tree with the selected hyperparameters on the selected feature
    # subset only, then score it on the same columns of the test split.
    final_model = DecisionTreeClassifier(**best_params)
    final_model.fit(X_train[best_subset], y_train)
    final_model_preds = final_model.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)

    print(classification_report(y_test, final_model_preds))
    # The accuracy was previously computed but never shown; report it the same
    # way the baseline cell reports its accuracy.
    print("Final Model Accuracy:", final_model_accuracy)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Best subset: ['education']
Remaining features: [['age'], ['workinghours'], ['ability to speak english'], ['workclass_governmental', 'workclass_no paid work', 'workclass_private', 'workclass_self employed'], ['marital status_Divorced', 'marital status_Husband', 'marital status_Never married', 'marital status_Separated', 'marital status_Widowed', 'marital status_Wife'], ['occupation_Construction/Extraction', 'occupation_Co

### Saving the model

In [38]:
# Persist the fitted decision tree to disk.
# NOTE(review): dt_model is the untuned baseline tree, not the tuned
# best_model / final_model produced later in the notebook — confirm this is
# the model that is meant to be saved.
model_output_path = '../output/saved_models/decision_tree_model.joblib'
save_model(dt_model, model_output_path)