In [1]:
import pandas as pd

from hyperopt.pyll import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score, 
                             precision_score, recall_score, roc_auc_score)

import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Attrition")

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1703084781950, experiment_id='1', last_update_time=1703084781950, lifecycle_stage='active', name='Attrition', tags={}>

In [5]:
train_data = pd.read_csv('../processed_data/attrition.csv')

numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
numerical_col.remove('id')
numerical_col.remove('attrition')

categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()

In [6]:
train_df, test_df = train_test_split(train_data, test_size = 0.25, random_state=0)
train_y, test_y = train_df.pop('attrition'), test_df.pop("attrition")

X_train = train_df[categorical_col + numerical_col]
X_test = test_df[categorical_col + numerical_col]
categorical_feature_indices = [X_train.columns.get_loc(col) for col in categorical_col]

In [9]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "catboost")
        mlflow.log_params(params)

        booster = CatBoostClassifier(**params)
        booster.fit(X_train, train_y, cat_features=categorical_feature_indices)

        prediction0 = booster.predict(X_test)
        prediction = (prediction0 >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, prediction0)
        f1 = f1_score(test_y, prediction)
        
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1, 
                    "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction), "area_roc":aroc}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [10]:
search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 16, 1)),
    'iterations': 20,
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_samples': scope.int(hp.quniform('max_depth', 4, 30, 1)),
    'loss_function': 'Logloss', 
    'random_seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]0:	learn: 0.5973420	total: 11.8ms	remaining: 224ms
1:	learn: 0.5315728	total: 18.6ms	remaining: 168ms
2:	learn: 0.4753103	total: 26.7ms	remaining: 151ms
3:	learn: 0.3993877	total: 38.7ms	remaining: 155ms
4:	learn: 0.3398840	total: 45.4ms	remaining: 136ms
5:	learn: 0.3083388	total: 51.8ms	remaining: 121ms
6:	learn: 0.2808747	total: 56.7ms	remaining: 105ms
7:	learn: 0.2635590	total: 62.9ms	remaining: 94.4ms
8:	learn: 0.2439714	total: 68.2ms	remaining: 83.4ms
9:	learn: 0.2346724	total: 73.6ms	remaining: 73.6ms
10:	learn: 0.2211841	total: 79.1ms	remaining: 64.7ms
11:	learn: 0.2007767	total: 84.6ms	remaining: 56.4ms
12:	learn: 0.1948375	total: 89.5ms	remaining: 48.2ms
13:	learn: 0.1823458	total: 94.6ms	remaining: 40.5ms
14:	learn: 0.1767944	total: 99.3ms	remaining: 33.1ms
15:	learn: 0.1716820	total: 105ms	remaining: 26.3ms
16:	learn: 0.1605046	total: 110ms	remaining: 19.4ms
17:	learn: 0.1559735	total: 115ms	remaining: 12.8ms
18:	learn: 0

In [11]:
from imblearn.under_sampling import RandomUnderSampler

In [13]:
new_train_data = train_data.copy()

undersample = RandomUnderSampler(sampling_strategy=0.4, random_state = 0)
y = new_train_data.pop('attrition')
X_train_new, y = undersample.fit_resample(new_train_data, y)
train_x,  test_x,train_y, test_y = train_test_split(X_train_new, y, test_size = 0.25, random_state=0)

X_train = train_x[categorical_col + numerical_col]
X_test = test_x[categorical_col + numerical_col]
categorical_feature_indices = [X_train.columns.get_loc(col) for col in categorical_col]

In [14]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "catboost")
        mlflow.log_params(params)

        booster = CatBoostClassifier(**params)
        booster.fit(X_train, train_y, cat_features=categorical_feature_indices)

        prediction0 = booster.predict(X_test)
        prediction = (prediction0 >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, prediction0)
        f1 = f1_score(test_y, prediction)
        
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1, 
                    "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction),'area_roc':aroc}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [15]:
search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 16, 1)),
    'iterations': 100,
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_samples': scope.int(hp.quniform('max_depth', 4, 30, 1)),
    'loss_function': 'Logloss', 
    'random_seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]0:	learn: 0.5744351	total: 7.64ms	remaining: 757ms
1:	learn: 0.5155982	total: 12.1ms	remaining: 594ms
2:	learn: 0.4235666	total: 16.5ms	remaining: 534ms
3:	learn: 0.3666045	total: 21.1ms	remaining: 506ms
4:	learn: 0.3318936	total: 25.6ms	remaining: 486ms
5:	learn: 0.3091433	total: 29.7ms	remaining: 466ms
6:	learn: 0.2904436	total: 33.5ms	remaining: 446ms
7:	learn: 0.2733797	total: 39.1ms	remaining: 450ms
8:	learn: 0.2528660	total: 44.9ms	remaining: 454ms
9:	learn: 0.2420899	total: 49.1ms	remaining: 442ms
10:	learn: 0.2315137	total: 53.6ms	remaining: 434ms
11:	learn: 0.2236307	total: 59.5ms	remaining: 437ms
12:	learn: 0.2208999	total: 61.8ms	remaining: 414ms
13:	learn: 0.1998913	total: 65.8ms	remaining: 404ms
14:	learn: 0.1902553	total: 70.6ms	remaining: 400ms
15:	learn: 0.1796994	total: 76.1ms	remaining: 400ms
16:	learn: 0.1652893	total: 80.1ms	remaining: 391ms
17:	learn: 0.1545221	total: 86.6ms	remaining: 394ms
18:	learn: 0.1416244