In [27]:
import pandas as pd
import numpy as np

from hyperopt.pyll import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, f1_score, 
                             precision_score, recall_score, roc_auc_score)

import warnings
warnings.filterwarnings('ignore')

In [29]:
import mlflow

# Point MLflow at the SQLite file named in the URI and select the experiment
# that subsequent runs are recorded under. The set_experiment call stays last
# so the cell still displays the returned Experiment object.
tracking_uri = "sqlite:///mlflow.db"
experiment_name = "Attrition"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1701070848011, experiment_id='1', last_update_time=1701070848011, lifecycle_stage='active', name='Attrition', tags={}>

In [38]:
# Read the training CSV and the separate test CSV.
train_data = pd.read_csv('../data/newtrain1.csv')
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

# Numeric feature columns: every non-object column except the ones listed in
# the loop. list.remove raises ValueError when a name is absent, so the loop
# also verifies that each of them is present among the non-object columns.
numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
for excluded in ('id', 'Attrition', 'EmployeeCount', 'StandardHours'):
    numerical_col.remove(excluded)

# Categorical feature columns: every object-dtype column except 'Over18'.
categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()
categorical_col.remove('Over18')

In [39]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(train_data, test_size=0.25, random_state=0)

# Detach the target from each partition (pop also removes it from the frame).
train_y = train_df.pop('Attrition')
test_y = test_df.pop("Attrition")

# Model inputs: categorical columns first, followed by the numeric ones.
feature_order = categorical_col + numerical_col
X_train = train_df[feature_order]
X_test = test_df[feature_order]

# Positions of the categorical columns inside X_train, passed to CatBoost later.
categorical_feature_indices = list(map(X_train.columns.get_loc, categorical_col))

In [40]:
def objective(params):
    """Train one CatBoost model with ``params`` and return its Hyperopt loss.

    The model is fitted on the notebook-level ``X_train`` / ``train_y`` and
    evaluated on ``X_test`` / ``test_y``. The parameters and the evaluation
    metrics are logged to a new MLflow run tagged ``model=catboost``.

    Args:
        params: Keyword arguments forwarded unchanged to ``CatBoostClassifier``.

    Returns:
        dict: ``{'loss': -auc, 'status': STATUS_OK}``. The ROC AUC is negated
        because ``fmin`` minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "catboost")
        mlflow.log_params(params)

        booster = CatBoostClassifier(**params)
        booster.fit(X_train, train_y, cat_features=categorical_feature_indices)

        # predict() yields hard class labels, which reduces the ROC curve to a
        # single operating point and makes a 0.5 threshold meaningless. Score
        # with the probability of the second class instead and derive the
        # labels from it.
        # NOTE(review): assumes 'Attrition' is encoded 0/1 with 1 as the
        # positive class -- confirm against the data.
        positive_proba = booster.predict_proba(X_test)[:, 1]
        prediction = (positive_proba >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, positive_proba)
        f1 = f1_score(test_y, prediction)

        output = {
            "acc": accuracy_score(test_y, prediction),
            "f1_score": f1,
            "precision": precision_score(test_y, prediction),
            "recall": recall_score(test_y, prediction),
            "area_roc": aroc,
        }
        mlflow.log_metrics(output)

    return {'loss': -aroc, 'status': STATUS_OK}

In [41]:
# Hyperopt search space for the CatBoost parameters. Entries that are plain
# values (iterations, loss_function, random_seed) are held fixed; the hp.*
# entries are sampled on every trial.
search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 16, 1)),
    'iterations': 20,
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # The Hyperopt label now matches the parameter name. It was 'max_depth',
    # which made best_result report this dimension under a misleading key
    # (max_depth is a CatBoost alias of depth).
    'min_child_samples': scope.int(hp.quniform('min_child_samples', 4, 30, 1)),
    'loss_function': 'Logloss',
    'random_seed': 42
}

# Keep a handle on the Trials object so the per-trial history can be
# inspected after the search instead of being discarded.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]0:	learn: 0.3468405	total: 4.66ms	remaining: 88.5ms
1:	learn: 0.2760630	total: 8.56ms	remaining: 77ms
2:	learn: 0.2600642	total: 12ms	remaining: 67.7ms
3:	learn: 0.2451196	total: 19.9ms	remaining: 79.7ms
4:	learn: 0.2340620	total: 25.3ms	remaining: 75.9ms
5:	learn: 0.2163390	total: 33.5ms	remaining: 78.2ms
6:	learn: 0.2006830	total: 37.5ms	remaining: 69.6ms
7:	learn: 0.1855461	total: 43.1ms	remaining: 64.6ms
8:	learn: 0.1772502	total: 47.3ms	remaining: 57.8ms
9:	learn: 0.1694786	total: 52.8ms	remaining: 52.8ms
10:	learn: 0.1598955	total: 57.8ms	remaining: 47.3ms
11:	learn: 0.1515453	total: 63.3ms	remaining: 42.2ms
12:	learn: 0.1336965	total: 68.6ms	remaining: 37ms
13:	learn: 0.1278088	total: 73.8ms	remaining: 31.6ms
14:	learn: 0.1170880	total: 78.6ms	remaining: 26.2ms
15:	learn: 0.1114006	total: 86.4ms	remaining: 21.6ms
16:	learn: 0.1029879	total: 89.9ms	remaining: 15.9ms
17:	learn: 0.0949135	total: 93.3ms	remaining: 10.4ms
18:	lear

In [42]:
from imblearn.under_sampling import RandomUnderSampler

In [43]:
# Work on a copy so popping the target leaves train_data untouched.
new_train_data = train_data.copy()
y = new_train_data.pop('Attrition')

# Split BEFORE resampling: the hold-out rows must keep the original class
# balance, otherwise accuracy / precision / F1 are measured on an artificially
# balanced set and overstate real-world performance.
train_x, test_x, train_y, test_y = train_test_split(
    new_train_data, y, test_size=0.25, random_state=0
)

# Undersample the majority class in the training partition only, down to a
# minority/majority ratio of 0.4.
undersample = RandomUnderSampler(sampling_strategy=0.4, random_state=0)
train_x, train_y = undersample.fit_resample(train_x, train_y)

# Model inputs: categorical columns first, followed by the numeric ones.
feature_order = categorical_col + numerical_col
X_train = train_x[feature_order]
X_test = test_x[feature_order]

# Positions of the categorical columns inside X_train, passed to CatBoost later.
categorical_feature_indices = [X_train.columns.get_loc(col) for col in categorical_col]

In [45]:
def objective(params):
    """Train one CatBoost model with ``params`` and return its Hyperopt loss.

    The model is fitted on whatever the notebook-level ``X_train`` /
    ``train_y`` currently hold and evaluated on ``X_test`` / ``test_y``. The
    parameters and the evaluation metrics are logged to a new MLflow run
    tagged ``model=catboost``.

    Args:
        params: Keyword arguments forwarded unchanged to ``CatBoostClassifier``.

    Returns:
        dict: ``{'loss': -auc, 'status': STATUS_OK}``. The ROC AUC is negated
        because ``fmin`` minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "catboost")
        mlflow.log_params(params)

        booster = CatBoostClassifier(**params)
        booster.fit(X_train, train_y, cat_features=categorical_feature_indices)

        # predict() yields hard class labels, which reduces the ROC curve to a
        # single operating point and makes a 0.5 threshold meaningless. Score
        # with the probability of the second class instead and derive the
        # labels from it.
        # NOTE(review): assumes 'Attrition' is encoded 0/1 with 1 as the
        # positive class -- confirm against the data.
        positive_proba = booster.predict_proba(X_test)[:, 1]
        prediction = (positive_proba >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, positive_proba)
        f1 = f1_score(test_y, prediction)

        output = {
            "acc": accuracy_score(test_y, prediction),
            "f1_score": f1,
            "precision": precision_score(test_y, prediction),
            "recall": recall_score(test_y, prediction),
            'area_roc': aroc,
        }
        mlflow.log_metrics(output)

    return {'loss': -aroc, 'status': STATUS_OK}

In [47]:
# Hyperopt search space for the CatBoost parameters. Entries that are plain
# values (iterations, loss_function, random_seed) are held fixed; the hp.*
# entries are sampled on every trial.
search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 16, 1)),
    'iterations': 100,
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # The Hyperopt label now matches the parameter name. It was 'max_depth',
    # which made best_result report this dimension under a misleading key
    # (max_depth is a CatBoost alias of depth).
    'min_child_samples': scope.int(hp.quniform('min_child_samples', 4, 30, 1)),
    'loss_function': 'Logloss',
    'random_seed': 42
}

# Keep a handle on the Trials object so the per-trial history can be
# inspected after the search instead of being discarded.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]0:	learn: 0.5107803	total: 2.55ms	remaining: 253ms
1:	learn: 0.4487390	total: 5.19ms	remaining: 254ms
2:	learn: 0.4249826	total: 7.66ms	remaining: 248ms
3:	learn: 0.4071376	total: 11ms	remaining: 265ms
4:	learn: 0.3981780	total: 13.5ms	remaining: 256ms
5:	learn: 0.3867258	total: 15.8ms	remaining: 248ms
6:	learn: 0.3608529	total: 18.2ms	remaining: 242ms
7:	learn: 0.3415989	total: 21.4ms	remaining: 246ms
8:	learn: 0.3332519	total: 26.6ms	remaining: 269ms
9:	learn: 0.3239992	total: 29.4ms	remaining: 265ms
10:	learn: 0.3171594	total: 36.2ms	remaining: 293ms
11:	learn: 0.2974152	total: 42.7ms	remaining: 313ms
12:	learn: 0.2719971	total: 54.9ms	remaining: 368ms
13:	learn: 0.2569091	total: 59.9ms	remaining: 368ms
14:	learn: 0.2457718	total: 62.8ms	remaining: 356ms
15:	learn: 0.2394449	total: 65.1ms	remaining: 342ms
16:	learn: 0.2200066	total: 67.5ms	remaining: 330ms
17:	learn: 0.2076081	total: 71.2ms	remaining: 324ms
18:	learn: 0.1902907	t