In [31]:
from typing import List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from helper.helper_functions import load_dataset, save_model, get_features_and_target, encode_all_features
from helper.clfmodel_functions import tune_model, seq_feat_selection, multi_metric_cv, plot_multi_score_cv_results, forward_feat_selection_hypertuning

### Loading the cleaned dataset

In [32]:
# Load the cleaned income dataset from the Excel workbook; the modelling cells below start from `data`
data: pd.DataFrame = load_dataset('../data/assignment2_income_cleaned.xlsx')

In [33]:
# Splitting the data into features (X) and target (y); 'income' is the target column
X, y = get_features_and_target(data, 'income')
# NOTE(review): this list is defined but never used in this cell -- an empty list is
# passed to encode_all_features below. Confirm whether these columns were meant to be
# supplied as the third argument.
columns_to_exclude = ['sex', 'ability to speak english', 'gave birth this year']
# Encoding the features and target. The third argument is an empty list, so presumably
# no columns are excluded here -- verify against encode_all_features.
X_encoded, y_encoded = encode_all_features(X, y, [])
# Splitting the dataset into training and testing sets (80/20 split, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2, random_state=42)

In [34]:
# Preview the first rows of the encoded training features as a sanity check on the encoding
X_train.head()

Unnamed: 0,age,education,workinghours,ability to speak english,occupation_Construction/Extraction,occupation_Counseling/Mental Health Services,occupation_Education,occupation_Entertainment,"occupation_Farming, Fishing, Forestry",occupation_Finance/Accounting,...,gave birth this year_No,gave birth this year_Yes,marital status_Divorced,marital status_Husband,marital status_Never married,marital status_Separated,marital status_Widowed,marital status_Wife,sex_Female,sex_Male
6317,22,16,36,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
740,61,22,40,1,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3781,48,16,40,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7850,62,18,65,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2963,53,19,44,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,22,19,25,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5191,24,16,28,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5390,35,16,40,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
860,23,20,40,0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


### Model

It should be noted, for the whole file, that 

In [35]:
# Logistic Regression model
# Baseline fit on the training split. max_iter is raised from scikit-learn's default
# of 100 to 2000 -- presumably so the solver converges on these features.
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(X_train, y_train)

In [36]:
# Score the baseline model on the held-out test split
lr_preds = lr_model.predict(X_test)

# Per-class precision / recall / F1 report first ...
print(classification_report(y_test, lr_preds))

# ... followed by the overall accuracy on the same predictions
lr_accuracy = accuracy_score(y_test, lr_preds)
print("Logistic Regression Accuracy:", lr_accuracy)

              precision    recall  f1-score   support

           0       0.82      0.87      0.85      1175
           1       0.73      0.64      0.68       625

    accuracy                           0.79      1800
   macro avg       0.77      0.76      0.76      1800
weighted avg       0.79      0.79      0.79      1800

Logistic Regression Accuracy: 0.7916666666666666


### Feature Importance using the model itself

In [37]:
# First (and, for this two-class target, only) row of the fitted coefficient matrix
feature_importance_scores = lr_model.coef_[0]

# One row per encoded feature, ordered by coefficient magnitude (largest first).
# The sort key takes the absolute value, so no temporary helper column is needed.
feature_importance_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Coefficient': feature_importance_scores}
).sort_values(by='Coefficient', key=lambda col: col.abs(), ascending=False)

In [38]:
# Show the coefficient table, largest absolute coefficient first
feature_importance_df

Unnamed: 0,Feature,Coefficient
20,occupation_Service/Hospitality,-1.555471
19,"occupation_Science, Engineering, Technology",1.213175
9,occupation_Finance/Accounting,0.955709
11,occupation_Legal Services,0.822549
12,occupation_Management/Business,0.783448
8,"occupation_Farming, Fishing, Forestry",-0.632913
30,marital status_Never married,-0.630774
13,occupation_Military Services,0.593828
23,workclass_no paid work,-0.581434
14,occupation_Office/Administrative Support,-0.572085


In [39]:
# Model-agnostic feature importance: each feature column is shuffled n_repeats times
# and the resulting change in the model's score is recorded.
# https://stackoverflow.com/questions/34052115/how-to-find-the-importance-of-the-features-for-a-logistic-regression-model
# https://scikit-learn.org/stable/modules/permutation_importance.html
# NOTE(review): importances are computed on the full encoded dataset (training and test
# rows together) rather than on the held-out split alone -- confirm this is intended.
# random_state is fixed so the shuffles are reproducible.
model_fi = permutation_importance(lr_model, X_encoded, y_encoded, n_repeats=30, random_state=0)

In [40]:
# Tabulate the mean permutation importance of every encoded feature, most important first.
# These values are mean score changes under shuffling, not model coefficients, so the
# value column is labelled 'Importance'.
# NOTE: this rebinds feature_importance_df, replacing the coefficient table built earlier.
feature_importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model_fi['importances_mean']
}).sort_values(by='Importance', ascending=False)

In [41]:
# Show the permutation-importance table, highest mean importance first
feature_importance_df

Unnamed: 0,Feature,Coefficient
1,education,0.036322
2,workinghours,0.035319
0,age,0.028037
20,occupation_Service/Hospitality,0.010693
19,"occupation_Science, Engineering, Technology",0.007107
12,occupation_Management/Business,0.006978
29,marital status_Husband,0.006844
30,marital status_Never married,0.005
22,workclass_governmental,0.00283
21,occupation_Transport,0.002081


### Hyperparameter Tuning

In [42]:
# Search space for the logistic-regression grid search: 4 values of C x 4 solvers,
# with the penalty, iteration cap and seed held fixed (16 candidates in total)
param_grid = {
    'penalty': ['l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
    'max_iter': [5000],
    'random_state': [42],
}

# tune_model is given both splits; presumably it searches on the training data and
# scores the winning estimator on the test data -- verify against the helper
best_params, best_model, best_accuracy = tune_model(
    LogisticRegression(),
    X_train, y_train,
    X_test, y_test,
    param_grid,
)

# Report the outcome of the search
print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best Hyperparameters: {'C': 1, 'max_iter': 5000, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}
Best Model: LogisticRegression(C=1, max_iter=5000, random_state=42)
Best Model Accuracy: 0.7916666666666666


In [43]:
# Define the parameter grid for AdaBoost
# 3 ensemble sizes x 3 learning rates x 2 base estimators (a newton-cg logistic
# regression and a default-configured one) = 18 candidates.
# NOTE: this cell rebinds param_grid, best_params, best_model and best_accuracy,
# replacing the logistic-regression tuning results from the previous cell.
param_grid = {
    'algorithm': ['SAMME'],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.5, 1.0],
    'estimator': [LogisticRegression(C=1, max_iter=5000, penalty='l2', random_state=42, solver='newton-cg'), LogisticRegression(random_state=42)],
    'random_state': [42]
}

best_params, best_model, best_accuracy = tune_model(AdaBoostClassifier(), X_train, y_train, X_test, y_test, param_grid)

print("\nBest Hyperparameters:", best_params)
print("Best Model:", best_model)
print("Best Model Accuracy:", best_accuracy)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best Hyperparameters: {'algorithm': 'SAMME', 'estimator': LogisticRegression(C=1, max_iter=5000, random_state=42, solver='newton-cg'), 'learning_rate': 0.5, 'n_estimators': 150}
Best Model: AdaBoostClassifier(algorithm='SAMME',
                   estimator=LogisticRegression(C=1, max_iter=5000,
                                                random_state=42,
                                                solver='newton-cg'),
                   learning_rate=0.5, n_estimators=150)
Best Model Accuracy: 0.7605555555555555


In [44]:
# load the test dataset
# test_data = load_dataset('../data/assignment2_test.xlsx')

In [45]:
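# NOTE(review): lr_model was fitted on the encoded feature matrix, so test_data would
# presumably need the same encoding applied before it can be passed to predict.
# test_predictions = lr_model.predict(test_data)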

In [46]:
# test_predictions

### Saving the model

In [47]:
# Save the logistic-regression model to the given .joblib path.
# NOTE(review): this passes `lr_model` (the max_iter=2000 baseline fitted above), not the
# `best_model` returned by the hyperparameter searches -- confirm that is the intended artifact.
save_model(lr_model, '../output/saved_models/logistic_regression_model.joblib')