In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import os
from pprint import pprint

# Importing the models to be tested
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

In [3]:
# --- Experiment configuration -------------------------------------------------
# SEED is declared before its first use so the hold-out split and every model
# grid further down draw from one single source of randomness.
SEED = 42
TARGET_METRIC = 'f1_macro'
target = 'income_50k'

# --- Data loading and hold-out split -------------------------------------------
df = pd.read_csv('Census-Income-KDD.csv')
# 70/30 train/test split; the tuning cells in this notebook only receive df_tr.
# NOTE(review): the split is not stratified on `target`; consider
# stratify=df[target] if both parts should keep the class ratio — confirm intent.
df_tr, df_te = train_test_split(df, test_size=0.3, random_state=SEED)

# Preprocessing options explored by search_best_combination together with each
# model grid. Every key maps to the list of candidate values to try; the meaning
# of each option is defined by the helper modules imported at the top.
prep_params_grid = {
    'scaling': [None, 'minmax', 'standard'],
    'imputation': ['mode'],
    'cat_age': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random', 'NearMiss'],
    'target_freq': [0.75, 0.8, 0.85],
    'generate_dummies': [True]
}

def n_comb(grid: dict, print_=True):
    """Count how many parameter combinations a grid describes.

    The count is the product of the number of candidate values stored under
    every key of ``grid``; a grid without keys therefore counts as 1.

    Parameters
    ----------
    grid : dict
        Maps each parameter name to a sized collection of candidate values.
    print_ : bool, default True
        When truthy, the count is printed and nothing is returned.
        When falsy, the count is returned and nothing is printed.

    Returns
    -------
    int or None
        The number of combinations if ``print_`` is falsy, otherwise ``None``.
    """
    total = 1
    for candidates in grid.values():
        total *= len(candidates)

    # Silent mode: hand the count back to the caller.
    if not print_:
        return total

    # Report mode: print only (implicit ``None`` return).
    print(f'Number of model combinations to be tested: {total}')
    
# Report the size of the preprocessing grid (default print mode: prints the count, returns None).
n_comb(prep_params_grid)

Number of model combinations to be tested: 72


### Tuning parameters for LDA

In [3]:
# Candidate hyperparameters for Linear Discriminant Analysis.
# NOTE(review): per the scikit-learn docs, `shrinkage` is only supported by the
# 'lsqr' and 'eigen' solvers, while `tol` (and the effect of `store_covariance`)
# is specific to 'svd'. Part of this grid is therefore invalid or redundant —
# confirm how search_best_combination treats those combinations.
mod_par_grid = {
    'solver': ['svd', 'lsqr', 'eigen'],
    'shrinkage': [None, 'auto'],
    'store_covariance': [True, False],
    'tol': [1e-4, 1e-3, 1e-2],
}

n_comb(mod_par_grid)

lda = LDA()

# The search is skipped when its result table is already cached on disk.
LDA_RESULTS_CSV = 'results_lda.csv'
if os.path.exists(LDA_RESULTS_CSV):
    # Reload the cache so `results` always belongs to this model instead of
    # being undefined (fresh kernel) or left over from another cell.
    # NOTE: values round-tripped through CSV may not keep their original Python types.
    results = pd.read_csv(LDA_RESULTS_CSV)
else:
    results = search_best_combination(lda, mod_par_grid, prep_params_grid, df_tr, target_metric=TARGET_METRIC)
    results.to_csv(LDA_RESULTS_CSV, index=False)

Number of model combinations to be tested: 36
===Iteration 1===
Searching preprocessing parameters...
it: 36/36
Searching model parameters...
it: 36/36
Best metric: 0.757974545624374
Best preprocessing parameters: [{'scaling': 'minmax', 'imputation': 'mode', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'nancat', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'nancat', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'nancat', 'cat_age': True, 'target_freq': 0.85, 'generate_dummies': T

### Tuning parameters for logistic regression

In [None]:
# Candidate hyperparameters for logistic regression (solver fixed to 'saga').
# NOTE(review): per the scikit-learn docs, penalty='elasticnet' needs an
# `l1_ratio`, which this grid never sets, and `intercept_scaling` is only used
# by the 'liblinear' solver, so with 'saga' it multiplies the search without
# changing the fitted model. `multi_class` is deprecated in recent releases.
# Confirm against the installed scikit-learn version before trimming the grid.
mod_par_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.1, 1, 10],
    'class_weight': [None, 'balanced'],
    'fit_intercept': [True, False],
    'intercept_scaling': [0.1, 1, 10],
    'max_iter': [1000],
    'multi_class': ['auto'],
    'random_state': [SEED],
    'solver': ['saga']
}

n_comb(mod_par_grid)

logreg = LogisticRegression()

# The search is skipped when its result table is already cached on disk.
LOGREG_RESULTS_CSV = 'results_log_regression.csv'
if os.path.exists(LOGREG_RESULTS_CSV):
    # Reload the cache so `results` always belongs to this model instead of
    # being undefined (fresh kernel) or left over from another cell.
    # NOTE: values round-tripped through CSV may not keep their original Python types.
    results = pd.read_csv(LOGREG_RESULTS_CSV)
else:
    # Every warning raised while searching is silenced inside this context.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = search_best_combination(logreg, mod_par_grid, prep_params_grid, df_tr, target_metric=TARGET_METRIC)
    results.to_csv(LOGREG_RESULTS_CSV, index=False)

### Tuning parameters for SVM

In [None]:
# Candidate hyperparameters for the linear SVM (primal formulation only).
# NOTE(review): per the scikit-learn docs, LinearSVC does not support
# penalty='l1' with loss='hinge', nor penalty='l2' with loss='hinge' when
# dual=False. With dual fixed to False every 'hinge' combination is therefore
# expected to be rejected — confirm how search_best_combination handles that.
mod_par_grid = {
    'penalty': ['l1', 'l2'],
    'loss': ['squared_hinge','hinge'],
    'dual': [False],
    'C': [0.1, 1, 10],
    'fit_intercept': [True, False],
    'intercept_scaling': [0.1, 1],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000],
    'random_state': [SEED]
}

n_comb(mod_par_grid)

svm = LinearSVC()

# The search is skipped when its result table is already cached on disk.
SVM_RESULTS_CSV = 'results_svm.csv'
if os.path.exists(SVM_RESULTS_CSV):
    # Reload the cache so `results` always belongs to this model instead of
    # being undefined (fresh kernel) or left over from another cell.
    # NOTE: values round-tripped through CSV may not keep their original Python types.
    results = pd.read_csv(SVM_RESULTS_CSV)
else:
    # Every warning raised while searching is silenced inside this context.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        results = search_best_combination(svm, mod_par_grid, prep_params_grid, df_tr, target_metric=TARGET_METRIC, verbose=2)
    results.to_csv(SVM_RESULTS_CSV, index=False)

Number of model combinations to be tested: 96
===Iteration 1===
Searching preprocessing parameters...
it: 36/36
Searching model parameters...
it: 96/96
Best metric: 0.6911005968409094
Best preprocessing parameters: [{'scaling': None, 'imputation': 'nancat', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'nancat', 'cat_age': False, 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}]
Best model parameters: [{'penalty': 'l1', 'loss': 'squared_hinge', 'dual': False, 'C': 0.1, 'fit_intercept': Tru

### Tuning parameters for random forest

In [None]:
# Candidate hyperparameters for the random forest. Single-element lists pin a
# value (seed, verbosity, warm start) rather than adding a search dimension.
mod_par_grid = {
    'n_estimators': [50, 75, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 20, 25, 30],
    'max_features': ['sqrt', 'log2'],
    'random_state': [SEED],
    'verbose': [0],
    'warm_start': [False],
    'class_weight': [None, 'balanced', 'balanced_subsample'],
}

n_comb(mod_par_grid)

rf = RandomForestClassifier()

# The search is skipped when its result table is already cached on disk.
RF_RESULTS_CSV = 'results_rf.csv'
if os.path.exists(RF_RESULTS_CSV):
    # Reload the cache so `results` always belongs to this model instead of
    # being undefined (fresh kernel) or left over from another cell.
    # NOTE: values round-tripped through CSV may not keep their original Python types.
    results = pd.read_csv(RF_RESULTS_CSV)
else:
    results = search_best_combination(rf, mod_par_grid, prep_params_grid, df_tr, target_metric=TARGET_METRIC)
    results.to_csv(RF_RESULTS_CSV, index=False)

Number of model combinations to be tested: 144
===Iteration 1===
Searching preprocessing parameters...
it: 36/36
Searching model parameters...
it: 144/144
Best metric: 0.7074323534081814
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.75, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.75, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}]
Best model parameters: [{'n_estimators': 50, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqr

### Tuning parameters for catboost

In [None]:
# Reduced preprocessing grid for CatBoost: no scaling, no age binning and no
# dummy encoding, so categorical columns reach the model as categories.
prep_params_grid2 = {
    'scaling': [None],
    'imputation': ['mode'],
    'cat_age': [False],
    'target_freq': [0.75, 0.8, 0.85],
    'generate_dummies': [False]
}

# a first preprocess to get the categorical features
df_tr_pre = preprocessing(df_tr, imputation='mode', cat_age=False, generate_dummies=False)
X_train, y_train = df_tr_pre.drop(target, axis=1), df_tr_pre[target]
# Columns that carry the pandas 'category' dtype after preprocessing.
cat_features = list(X_train.select_dtypes(include=['category']).columns)

# Candidate hyperparameters for CatBoost.
# NOTE(review): 'iterations': [5] is a very small number of boosting rounds and
# looks like a smoke-test value — confirm it is intended for the real search.
mod_par_grid = {
    'iterations': [5],
    'depth': [6, 8, 10],
    'border_count': [32, 64, 128],
    'random_seed': [SEED],
    'verbose': [0],
    'loss_function': ['Logloss'],
    'eval_metric': ['F1', 'AUC'],
    'class_weights': [[1, 1], [1, 2], [1, 3]],
    'cat_features': [cat_features],
}

n_comb(mod_par_grid)

cat_model = CatBoostClassifier()

# The search is skipped when its result table is already cached on disk.
CATBOOST_RESULTS_CSV = 'results_catboost.csv'
if os.path.exists(CATBOOST_RESULTS_CSV):
    # Reload the cache so `results` always belongs to this model instead of
    # being undefined (fresh kernel) or left over from another cell.
    # NOTE: values round-tripped through CSV may not keep their original Python types.
    results = pd.read_csv(CATBOOST_RESULTS_CSV)
else:
    results = search_best_combination(cat_model, mod_par_grid, prep_params_grid2, df_tr, target_metric=TARGET_METRIC)
    results.to_csv(CATBOOST_RESULTS_CSV, index=False)

Number of model combinations to be tested: 54
===Iteration 1===
Searching preprocessing parameters...
it: 3/3
Searching model parameters...
it: 54/54
Best metric: 0.6690903906697929
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.8, 'generate_dummies': False, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.75, 'generate_dummies': False, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'target_freq': 0.85, 'generate_dummies': False, 'remove_duplicates': True}]
Best model parameters: [{'iterations': 5, 'depth': 6, 'border_count': 128, 'random_seed': 42, 'verbose': 0, 'loss_function': 'Logloss', 'eval_metric': 'AUC', 'class_weights': [1, 1], 'cat_features': ['class_worker', 'det_ind_code', 'det_occ_code', 'education', 'hs_college', 'marital_stat', 'major_ind_code', 'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member', 'unemp_reaso