In [18]:
import json
import pandas as pd

from hyperopt.pyll import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

import lightgbm as lgb
from sklearn.metrics import (accuracy_score, f1_score, 
                             precision_score, recall_score, roc_auc_score)

import warnings
warnings.filterwarnings('ignore')

In [2]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Attrition")

2023/12/20 16:06:20 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/12/20 16:06:20 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1703084781950, experiment_id='1', last_update_time=1703084781950, lifecycle_stage='active', name='Attrition', tags={}>

In [3]:
train_data = pd.read_csv('../processed_data/attrition.csv')

numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
numerical_col.remove('id')
numerical_col.remove('attrition')

categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()

In [4]:
# categorical_col = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus', 'OverTime','newage', 'masterylevel', 'loyaltylevel', 'oldyoung', 'loyal']
# numerical_col = ['DailyRate', 'DistanceFromHome',  'Education',  'EnvironmentSatisfaction',
#             'HourlyRate', 'JobInvolvement', 'JobSatisfaction',  'MonthlyIncome',  'NumCompaniesWorked', 'PerformanceRating',
#             'RelationshipSatisfaction',  'StockOptionLevel',  'TrainingTimesLastYear',  'WorkLifeBalance',]

In [5]:
train_df, test_df = train_test_split(train_data, test_size = 0.25, random_state=0)
train_y, test_y = train_df.pop('attrition'), test_df.pop("attrition")

In [6]:
vectorizer = DictVectorizer()

train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

In [7]:
dtrain = lgb.Dataset(X_train, label = train_y, free_raw_data=False)
dtest = lgb.Dataset(X_val, label = test_y, reference=dtrain,
                    free_raw_data=False)

In [8]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.set_tag("Features", [categorical_col + numerical_col])
        mlflow.log_params(params)
        booster = lgb.train(
            params=params,
            train_set=dtrain, 
            num_boost_round=1000,
            valid_sets=[dtest],
            )
        prediction0 = booster.predict(X_val)
        prediction = (prediction0 >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, prediction0)
        f1 = f1_score(test_y, prediction)
        output = {"acc": accuracy_score(test_y, prediction), 
                  "f1_score": f1, 
                  "precision": precision_score(test_y, prediction), 
                  "recall": recall_score(test_y, prediction),
                  "area_roc":aroc}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [9]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'boosting': 'gbdt',
    'num_iterations': 120,
    'num_leaves': scope.int(hp.quniform('num_leaves', 4, 100, 1)),
    'seed': 42
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1178                     
[LightGBM] [Info] Number of data points in the train set: 1257, number of used features: 61
[LightGBM] [Info] Start training from score 0.119332  
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001505 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1178                                                
[LightGBM] [Info] Number of data points in the train set: 1257, number of used features: 61
[LightGBM] [Info] Start training from score 0.119332                             
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000220 seconds.
You can set `force_row_wise=true` to remove the ov

In [10]:
best_result

{'learning_rate': 0.20538572544887793, 'max_depth': 64.0, 'num_leaves': 22.0}

In [11]:
from imblearn.under_sampling import RandomUnderSampler

In [12]:
new_train_data = train_data.copy()

undersample = RandomUnderSampler(sampling_strategy=0.4, random_state = 0)
y = new_train_data.pop('attrition')
X_train_new, y = undersample.fit_resample(new_train_data, y)
train_x, test_x,train_y, test_y = train_test_split(X_train_new, y, test_size = 0.25, random_state=0)

In [13]:
vectorizer = DictVectorizer()

train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

vectorizer.fit(train_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

X_train = vectorizer.transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

dtrain = lgb.Dataset(X_train, label = train_y)
dtest = lgb.Dataset(X_val, label = test_y)

In [14]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "lgb")
        mlflow.set_tag("sampling", 'undersampling')
        mlflow.set_tag('Features', [categorical_col + numerical_col])
        mlflow.log_params(params)
        booster = lgb.train(
                params=params,
                train_set=dtrain, 
                num_boost_round=1000,
                valid_sets=[dtest],
                )
        prediction = booster.predict(X_val)
        prediction_label = (prediction >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, prediction)
        f1 = f1_score(test_y, prediction_label)
        output = {"acc": accuracy_score(test_y, prediction_label), "f1_score": f1, 
                  "precision": precision_score(test_y, prediction_label), 
                  "recall": recall_score(test_y, prediction_label), "area_roc":aroc}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001875 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 906                      
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 58
[LightGBM] [Info] Start training from score 0.289524  
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 906                                                 
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 58
[LightGBM] [Info] Start training from score 0.289524                             
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.0006

In [15]:
best_result

{'learning_rate': 0.22107113972653164, 'max_depth': 69.0, 'num_leaves': 4.0}

In [16]:
train = lgb.Dataset(X_train, label=train_y)
valid = lgb.Dataset(X_val, label=test_y)

best_params = {"boosting":"gbdt",
            "learning_rate":best_result['learning_rate'],
            "max_depth":int(best_result['max_depth']),
            "num_iterations":120,
            "num_leaves":int(best_result['num_leaves']),
            "seed":42}

booster = lgb.train(
    params=best_params,
    train_set=train,
    num_boost_round=1000,
    valid_sets=[valid],
)

prediction0 = booster.predict(X_val)
prediction = (prediction0 >=0.5).astype('int')
f1 = f1_score(test_y, prediction)
output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1, 
          "precision": precision_score(test_y, prediction), 
          "recall": recall_score(test_y, prediction), 
          "area_roc":roc_auc_score(test_y, prediction0)}

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 906
[LightGBM] [Info] Number of data points in the train set: 525, number of used features: 58
[LightGBM] [Info] Start training from score 0.289524


In [19]:
with open('../parameters.json', 'w') as json_file:
    json.dump(best_result, json_file, indent=2)

In [51]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

test_data['newage'] = pd.cut(x = test_data['Age'], 
                             bins = [17, 30, 42, 61 ], 
                             labels = ['18 - 30', '31 - 42', '43 - 60'])
test_data['oldyoung'] = pd.cut(x = test_data['Age'], 
                               bins = [17, 30, 61], labels = ['young', 'old'])
test_data['loyal'] = pd.cut(x = test_data['YearsAtCompany'], 
                            bins = [-1, 3, 42], labels = ['fairly', 'loyal'])
test_data['masterylevel'] = pd.cut(x = test_data['TotalWorkingYears'],
                                   bins = [-1, 3, 10, 421], labels = ['entry', 'intermediate', 'master'])
test_data['loyaltylevel'] = pd.cut(x = test_data['YearsAtCompany'], 
                                   bins = [-1, 3, 10, 42], labels = ['fairly', 'loyal', 'very-loyal'])
test_data['dueforprom'] = pd.cut(x = test_data['YearsSinceLastPromotion'], 
                                 bins = [-1, 5,  16], labels = ['due', 'overdue'])

In [52]:
test_dat  = test_data[categorical_col + numerical_col].to_dict(orient='record')
X_test = vectorizer.transform(test_dat)

prediction = booster.predict(X_test).round(2)
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)

(output_frame['Attrition'] >=0.5).astype('int').sum()

186

In [53]:
output_frame.to_csv('../submissions/lgb002.csv', index = False)