In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
import itertools
import os
from pprint import pprint

# Importing the models to be tested
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

## Parameter tunning

In [2]:
target = 'income_50k'               # target variable
df_tr = pd.read_csv('../train.csv') # training set

TARGET_METRIC = 'f1_macro'          # metric to be used in the grid search
SEED = 42                           # seed for reproducibility

# Grid of preprocessing hyperparameters for each model
prep_params_grid = {
    'scaling': [None, 'minmax', 'standard'],
    'imputation': ['mode'],
    'cat_age': [False, True],
    'remove_outliers': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random'],
    'target_freq': [0.75, 0.8, 0.85],
    'generate_dummies': [True]
}

def n_comb(grid: dict, print_=True):
    """Returns the number of combinations to be tested given a grid of parameters"""
    n = 1
    for k in grid.keys():
        n *= len(grid[k])
    if print_:
        print(f'Number of combinations to be tested: {n}')
    else:
        return n

def test_model(mod, prep_grid, mod_grid, name, rewrite=False, **kwargs):
    """Tests a model with all the possible combinations of preprocessing parameters and hyperparameters"""
    if rewrite or not os.path.exists(f'./results/results_{name}.csv'):
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore") # to avoid convergence warnings
                results = search_best_combination(mod, mod_grid, prep_grid, df_tr,
                target_metric=TARGET_METRIC, cv=5, N=15, random_state=SEED, **kwargs)
                results.to_csv(f'./results/results_{name}.csv', index=False)
        except Exception as e:
            print(e)
            return None

n_comb(prep_params_grid)

Number of combinations to be tested: 72


### Naive Bayes

In [3]:
prep_params_grid_discriminant = {
    'scaling': [None],
    'imputation': ['mode'],
    'cat_age': [False, True],
    'remove_outliers': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random'],
    'target_freq': [0.75, 0.8, 0.85, 0.9],
    'generate_dummies': [True]
}

mod_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
}

n_comb(mod_grid)

mod = GaussianNB()
test_model(mod, prep_params_grid_discriminant, mod_grid, 'nb')

Number of combinations to be tested: 5


### Linear Discriminant Analysis (LDA)

In [4]:
mod_par_grid = {
    'solver': ['svd'], # 'svd' is faster and recommended for large datasets
    'priors': [None], # By default, the class proportions are inferred from the training data.
    'tol': [1e-4, 1e-3, 1e-2], # Absolute threshold for a singular value of X to be considered significant, used to estimate the rank of X.
}

n_comb(mod_par_grid) 

lda = LDA()
test_model(lda, prep_params_grid_discriminant, mod_par_grid, 'lda')

Number of combinations to be tested: 3


### Quadratic Discriminant Analysis (QDA)

In [5]:
mod_par_grid = {
    'priors': [None], # By default, the class proportions are inferred from the training data.
    'reg_param': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5], # Regularizes the per-class covariance estimates: S = (1 - reg_param) * S + reg_param * np.eye(n_features),
    'store_covariance': [True, False], # If True, the covariance matrices are computed and stored in the self.covariance_ attribute.
    'tol': [1e-4, 1e-3, 1e-2], # Absolute threshold for a singular value of X to be considered significant, used to estimate the rank of X. Does not affect the predictions.
}

n_comb(mod_par_grid)

qda = QDA()
test_model(qda, prep_params_grid_discriminant, mod_par_grid, 'qda')

Number of combinations to be tested: 36
===Iteration 1===
Searching preprocessing parameters...
it: 32/32
Searching model parameters...
it: 36/36
Best metric: 0.7195462916743848
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.75, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_fr

### K-Nearest Neighbors (KNN)

In [6]:
mod_par_grid = {
    'n_neighbors': [7, 9, 11, 13, 15, 17, 19], # Number of neighbors to use by default for kneighbors queries.
    'weights': ['uniform', 'distance'], # weight function used in prediciton
}

n_comb(mod_par_grid)

knn = KNN()
test_model(knn, prep_params_grid, mod_par_grid, 'knn')

Number of combinations to be tested: 14
===Iteration 1===
Searching preprocessing parameters...
it: 72/72
Searching model parameters...
it: 14/14
Best metric: 0.7589379479060314
Best preprocessing parameters: [{'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_

### Logistic regression

In [7]:
prep_params_grid_discriminant = {
    'scaling': ['minmax', 'standard'],
    'imputation': ['mode'],
    'cat_age': [False, True],
    'remove_outliers': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random'],
    'target_freq': [0.75, 0.8, 0.85],
    'generate_dummies': [True]
}

mod_par_grid = {
    'penalty': ['l1', 'l2'], # Used to specify the norm used in the penalization.
    'C': [0.1, 1, 10], # Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization.
    'class_weight': [None, 'balanced'], 
    'intercept_scaling': [0.1, 1, 10],  
    'max_iter': [1000], # Maximum number of iterations taken for the solvers to converge.
    'random_state': [SEED],
    'solver': ['saga'] # Algorithm to use in the optimization problem if the penalty is 'l1'.
}
n_comb(mod_par_grid)

aux = list(itertools.product(*mod_par_grid.values()))
mod_par_grid = [{k: v for k, v in zip(mod_par_grid.keys(), combination)} for combination in aux]
# for the combinations that use l2 penalty, we can use the 'newton-cholesky' solver, which is faster
for d in mod_par_grid:
    if d['penalty'] == 'l2':
        d['solver'] = 'newton-cg'

logreg = LogisticRegression()
test_model(logreg, prep_params_grid, mod_par_grid, 'logreg')

Number of combinations to be tested: 36
===Iteration 1===
Searching preprocessing parameters...
it: 72/72
Searching model parameters...
it: 35/35
Best metric: 0.7748222081609301
Best preprocessing parameters: [{'scaling': 'standard', 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_me

### Support Vector Machines (SVM)

In [8]:
mod_par_grid = {
    'penalty': ['l1', 'l2'], # Used to specify the norm used in the penalization.
    'dual': [False], # Dual or primal formulation. 
    'C': [0.1, 1, 10], # Inverse of regularization strength; must be a positive float.
    'intercept_scaling': [0.1, 1, 10], 
    'class_weight': [None, 'balanced'], 
    'max_iter': [1000], # Maximum number of iterations taken for the solvers to converge.
    'random_state': [SEED]
}

n_comb(mod_par_grid)

svm = LinearSVC()
test_model(svm, prep_params_grid, mod_par_grid, 'svm')

Number of combinations to be tested: 36
===Iteration 1===
Searching preprocessing parameters...
it: 72/72
Searching model parameters...
it: 36/36
Best metric: 0.7747738144743455
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': True, 'remove_outliers': True, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'standard', 'imputation': 'mode', 'cat_age': True, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': False, 'downsampling_method': 'random', 'ta

### Random forest

In [9]:
mod_par_grid = {
    'n_estimators': [50, 75, 100, 125], # The number of trees in the forest.
    'criterion': ['gini'],  # The function to measure the quality of a split.
    'max_depth': [None, 25, 30, 35, 40], # The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
    'max_features': ['sqrt', 'log2'], # The number of features to consider when looking for the best split.
    'random_state': [SEED], 
    'verbose': [0],
    'class_weight': [None, 'balanced'] # Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one.
}

n_comb(mod_par_grid)

rf = RandomForestClassifier()
test_model(rf, prep_params_grid, mod_par_grid, 'rf')

Number of combinations to be tested: 80
===Iteration 1===
Searching preprocessing parameters...
it: 72/72
Searching model parameters...
it: 80/80
Best metric: 0.7810307742825338
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': 'minmax', 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'rand

### Gradient Boosting (XGBoost)

In [10]:
prep_params_grid_xgb = {
    'scaling': [None],
    'imputation': ['mode'],
    'cat_age': [False],
    'remove_outliers': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random'],
    'target_freq': [0.8, 0.85, 0.9],
    'generate_dummies': [True]
}

mod_par_grid = {
    'n_estimators': [65, 75],
    'max_depth': [None],
    'learning_rate': [0.2, 0.3, 0.4],
    'booster': ['gbtree', 'dart'],
    'reg_lambda': [0.1, 0.2, 0.3],
    'reg_alpha': [0.1, 0.2, 0.3],
    'random_state': [SEED],
    'verbosity': [0]
}

n_comb(mod_par_grid)

# we need to remove special characters from the features and category names for xgboost to work
df_tr = df_tr.applymap(lambda x: x.replace('[', '').replace(']', '').replace('<', '') if isinstance(x, str) else x)

xgb = XGBClassifier()
test_model(xgb, prep_params_grid, mod_par_grid, 'xgb')

Number of combinations to be tested: 108
===Iteration 1===
Searching preprocessing parameters...
it: 72/72
Searching model parameters...
it: 108/108
Best metric: 0.7943150967522675
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': True, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': True, 'downsampling_method': 'random', 't

### Gradient Boosting (CatBoost)

In [11]:
prep_params_grid_catboost = {
    'scaling': [None],
    'imputation': ['mode'],
    'cat_age': [False],
    'remove_outliers': [False, True],
    'merge_capital': [False, True],
    'downsampling_method': ['random'],
    'target_freq': [0.8, 0.85, 0.9],
    'generate_dummies': [False]
}

# a first preprocess to get the categorical features
df_tr_pre = preprocessing(df_tr, imputation='mode', cat_age=False, generate_dummies=False)
X_train, y_train = df_tr_pre.drop(target, axis=1), df_tr_pre[target]
cat_features = list(X_train.select_dtypes(include=['category']).columns)

mod_par_grid = {
    'iterations': [500, 750],
    'depth': [1, 2, 4, 6],
    'border_count': [32, 64, 96],
    'random_seed': [SEED],
    'verbose': [0],
    'loss_function': ['Logloss'],
    'eval_metric': ['F1', 'AUC'],
    'cat_features': [cat_features],
}

n_comb(mod_par_grid)

cat_model = CatBoostClassifier()
test_model(cat_model, prep_params_grid_catboost, mod_par_grid, 'catboost')

Number of combinations to be tested: 48
===Iteration 1===
Searching preprocessing parameters...
it: 12/12
Searching model parameters...
it: 48/48
Best metric: 0.7928322198458531
Best preprocessing parameters: [{'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.8, 'generate_dummies': False, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': False, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': False, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': True, 'merge_capital': True, 'downsampling_method': 'random', 'target_freq': 0.85, 'generate_dummies': False, 'remove_duplicates': True}, {'scaling': None, 'imputation': 'mode', 'cat_age': False, 'remove_outliers': False, 'merge_capital': True, 'downsampling_method': 'random', 'ta