In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint

from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from auxiliar_func import *
from plot_func import *

In [2]:
# Name of the label column; later cells separate it from the features.
target = 'income_50k'

# Load the dataset and hold out 30% of the rows as a test set. The fixed
# random_state keeps the split reproducible between runs.
df = pd.read_csv('Census-Income-KDD.csv')
df_tr, df_te = train_test_split(df, random_state=42, test_size=0.3)

## Finding the best preprocessing parameters

In [7]:
# Grid of preprocessing options to evaluate. Only `target_freq` takes several
# values; every other option is pinned to a single choice.
params = dict(
    scaling=[None],
    imputation=['mode'],
    cat_age=[False],
    target_freq=[0.7, 0.8, 0.9, 1.0],
    generate_dummies=[False],
)

# Preprocess the training split once up front, solely to find out which
# feature columns end up with the `category` dtype (CatBoost needs their names).
baseline_prep = dict(
    scaling=None,
    imputation='mode',
    cat_age=False,
    target_freq=0.8,
    generate_dummies=False,
)
df_tr_pre = preprocessing(df_tr, **baseline_prep)
y_train = df_tr_pre[target]
X_train = df_tr_pre.drop(target, axis=1)
cat_features = X_train.select_dtypes(include=['category']).columns.tolist()

# Fixed CatBoost configuration shared by every preprocessing combination.
catboost_settings = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'logging_level': 'Silent',
    'cat_features': cat_features,
}
cat_model = CatBoostClassifier(**catboost_settings)

# NOTE(review): `test_preprocess_params` comes from a star import; judging by
# the logged output it evaluates each parameter combination with
# cross-validation and returns a DataFrame of results -- confirm in its module.
results = test_preprocess_params(df_tr, cat_model, params, cv=5)

# Persist the results table so it can be inspected without re-running the search.
results.to_csv('results_catboost.csv', index=False)

Adjusting for (None, 'mode', False, 0.7, False)
Adjusting for (None, 'mode', False, 0.8, False)


  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],


Adjusting for (None, 'mode', False, 0.9, False)


  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],


Adjusting for (None, 'mode', False, 1.0, False)


  results = pd.concat([results, pd.DataFrame([list(combination) + list(cross_val_results.values())],


## Metrics of the final CatBoost model

In [9]:
# Preprocessing configuration used for the final model.
prep_params = dict(
    scaling=None,
    imputation='mode',
    cat_age=False,
    target_freq=0.8,
    generate_dummies=False,
)

# Preprocess the training split once with that same configuration, solely to
# find out which feature columns end up with the `category` dtype.
df_tr_pre = preprocessing(df_tr, **prep_params)
y_train = df_tr_pre[target]
X_train = df_tr_pre.drop(target, axis=1)
cat_features = X_train.select_dtypes(include=['category']).columns.tolist()

# CatBoost configuration for the final model.
catboost_settings = {
    'iterations': 500,
    'learning_rate': 0.1,
    'depth': 6,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'random_seed': 42,
    'logging_level': 'Silent',
    'cat_features': cat_features,
}
cat_model = CatBoostClassifier(**catboost_settings)

# NOTE(review): `cross_validation` comes from a star import; it is handed the
# raw training split together with `prep_params`, so it presumably applies the
# preprocessing itself -- confirm in its module.
cv_metrics = cross_validation(cat_model, df_tr, prep_params, cv=5)
pprint(cv_metrics)

{'accuracy': 0.9469233803228757,
 'f1_macro': 0.7935422430562745,
 'precision_macro': 0.7720670224462554,
 'recall_macro': 0.8199608440229795}
