In [40]:
import warnings
warnings.filterwarnings("ignore")

from typing import List, Tuple
from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.inspection import permutation_importance

### Loading the cleaned dataset

In [41]:
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

### Feature Engineering (encoding) & Train-Test Split

In [42]:
# Splitting the data into features (X) and target (y)
X, y = get_features_and_target(data, 'income')
# Encoding the features and target, and excluding some columns
X_encoded, y_encoded = encode_all_features(X, y, [])
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [43]:
X_train.head()

Unnamed: 0,age,education,workinghours,ability to speak english,workclass_governmental,workclass_no paid work,workclass_private,workclass_self employed,occupation_Construction/Extraction,occupation_Counseling/Mental Health Services,...,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,sex_Female,sex_Male,gave birth this year_No,gave birth this year_Yes
6317,22,16,36,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,1,0
740,61,22,40,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,1,0
3781,48,16,40,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,1,0
7850,62,18,65,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,1,0
2963,53,19,44,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0


### Model

Here, we quickly train and evaluate a Logistic Regression model with random parameters for demonstration.

In [44]:
# Logistic Regression model
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_preds)

print(classification_report(y_test, lr_preds))
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.87      0.84      1175
           1       0.72      0.64      0.68       625

    accuracy                           0.79      1800
   macro avg       0.77      0.76      0.76      1800
weighted avg       0.79      0.79      0.79      1800

Logistic Regression Accuracy: 0.7911111111111111


### Feature Importance using the model itself

In [45]:
# Get the coefficients of the logistic regression model
feature_importance_scores = lr_model.coef_[0]

# Create a DataFrame to store the feature importance scores
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': feature_importance_scores
})

# Sort the DataFrame by the absolute value of the coefficients
feature_importance_df['Absolute_Coefficient'] = feature_importance_df['Coefficient'].abs()
feature_importance_df = feature_importance_df.sort_values(by='Absolute_Coefficient', ascending=False).drop(columns='Absolute_Coefficient')

In [46]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
24,occupation_Service/Hospitality,-1.547451
23,"occupation_Science, Engineering, Technology",1.22106
13,occupation_Finance/Accounting,0.955415
15,occupation_Legal Services,0.817418
16,occupation_Management/Business,0.786187
28,marital status_Never married,-0.627166
12,"occupation_Farming, Fishing, Forestry",-0.626381
5,workclass_no paid work,-0.614467
17,occupation_Military Services,0.592107
18,occupation_Office/Administrative Support,-0.568162


In [47]:
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
# https://scikit-learn.org/stable/modules/permutation_importance.html
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model_fi['importances_mean']
})
feature_importance_df = feature_importance_df.sort_values(by='Coefficient', ascending=False)

In [48]:
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.036422
2,workinghours,0.035289
0,age,0.028207
24,occupation_Service/Hospitality,0.010622
27,marital status_Husband,0.007419
23,"occupation_Science, Engineering, Technology",0.007252
16,occupation_Management/Business,0.007178
28,marital status_Never married,0.005022
4,workclass_governmental,0.003133
25,occupation_Transport,0.002104


### Hyperparameter Tuning

Here, we define the base parameter grid for our hyperparameter tuning function.

In [49]:
param_grid = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
        'max_iter': [5000],
        'random_state': [42]
    }

Below, we tune the hyperparameters of the Logistic Regression model using the defined parameter grid and using all features.

In [50]:
if False:
    best_params, best_model, best_accuracy = tune_model(LogisticRegression(), X_train, y_train, X_test, y_test, param_grid)
    
    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

Below, we tune the hyperparameters of the Logistic Regression model using the defined parameter grid and using a **subset of features**. We exclude the columns 'age', 'ability to speak english', and 'workclass'.

In [51]:
if True:
    # Splitting the data into features (X) and target (y)
    X_, y_ = get_features_and_target(data, 'income')
    columns_to_exclude = ['age', 'ability to speak english', 'workclass']
    X_ = X_.drop(columns=columns_to_exclude)
    # Encoding the features and target, and excluding some columns
    X_encoded_, y_encoded_ = encode_all_features(X_, y_, columns_to_exclude)
    X_train_, X_test_, y_train_, y_test_ = train_test_split(X_encoded_, y_encoded_, test_size=0.2, random_state=42)
    
    best_params, best_model, best_accuracy = tune_model(LogisticRegression(), X_train_, y_train_, X_test_, y_test_, param_grid)
    
    print("\nBest Hyperparameters:", best_params)
    print("Best Model:", best_model)
    print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Hyperparameters: {'C': 0.1, 'max_iter': 5000, 'penalty': 'l2', 'random_state': 42, 'solver': 'newton-cholesky'}
Best Model: LogisticRegression(C=0.1, max_iter=5000, random_state=42,
                   solver='newton-cholesky')
Best Model Accuracy: 0.7727777777777778


In [52]:
# Forward feature selection with hyperparameter tuning
if True:
    param_grid = {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
        'max_iter': [5000],
        'random_state': [42]
    }
    
    best_subset, best_params, best_score = forward_feat_selection_hypertuning(LogisticRegression(), param_grid, X_train, y_train, X_test, y_test)
    
    print("Best subset of features:", best_subset)
    print("Best hyperparameters:", best_params)
    print("Best model accuracy:", best_score)
    
    # Use the best subset and best hyperparameters for final model
    final_model = LogisticRegression(**best_params)
    final_model.fit(X_train[best_subset], y_train)
    final_model_preds = final_model.predict(X_test[best_subset])
    final_model_accuracy = accuracy_score(y_test, final_model_preds)
    
    print(classification_report(y_test, final_model_preds))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best subset: ['occupation_Construction/Extraction', 'occupation_Counseling/Mental Health Services', 'occupation_Education', 'occupation_Entertainment', 'occupation_Farming, Fishing, Forestry', 'occupation_Finance/Accounting', 'occupation_Healthcare/Medical Services', 'occupation_Legal Services', 'occupation_Management/Business', 'occupation_Military Services', 'occupation_Office/Administrative Support', 'occupation_Production/Assembly', 'occupatio

In [58]:
# Define the parameter grid for AdaBoost
param_grid = {
    'algorithm': ['SAMME', 'SAMME.R'],
    'n_estimators': [50, 75, 100, 150],
    'learning_rate': [0.1, 0.5, 1.0],
    'estimator': [final_model, LogisticRegression(random_state=42)],
    'random_state': [42]
}

best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train, y_train, X_test, y_test, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best Hyperparameters: {'algorithm': 'SAMME.R', 'estimator': LogisticRegression(random_state=42), 'learning_rate': 1.0, 'n_estimators': 150, 'random_state': 42}
Best Model: AdaBoostClassifier(estimator=LogisticRegression(random_state=42),
                   n_estimators=150, random_state=42)
Best Model Accuracy: 0.7833333333333333


In [54]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [55]:
# test_predictions = lr_model.predict(test_data)

In [56]:
# test_predictions

### Saving the model

In [57]:
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')