In [19]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    mean_squared_error,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [20]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [21]:
import mlflow

# Record runs in the SQLite database next to this notebook and group them
# under the "Attrition" experiment (set_experiment stays last so the cell
# still displays the experiment object).
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="Attrition")

<Experiment: artifact_location='/home/godwin/Documents/Workflow/Attriton/notebooks/mlruns/1', creation_time=1701070848011, experiment_id='1', last_update_time=1701070848011, lifecycle_stage='active', name='Attrition', tags={}>

In [22]:
# Load the training set and the competition test set.
train_data = pd.read_csv('../data/newtrain1.csv')
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

# Numeric candidates: every non-object column except the identifier, the
# target and two columns excluded by name.
# ('PerformanceRating' was once excluded here as well; it is currently kept.)
numerical_col = train_data.select_dtypes(exclude=['object']).columns.tolist()
for _excluded in ('id', 'Attrition', 'EmployeeCount', 'StandardHours'):
    # list.remove raises ValueError if the column is missing, same as before.
    numerical_col.remove(_excluded)

# Keep only rows inside the accepted ranges of these columns.
_keep_rows = (
    (train_data['TrainingTimesLastYear'] <= 4)
    & (train_data['TrainingTimesLastYear'] > 0)
    & (train_data['YearsSinceLastPromotion'] <= 5)
    & (train_data['YearsWithCurrManager'] <= 13)
)
train_data = train_data[_keep_rows]

# Categorical candidates: every object column except 'Over18'.
categorical_col = train_data.select_dtypes(include=['object']).columns.tolist()
categorical_col.remove('Over18')

In [23]:
# Hand-picked feature lists; these replace any automatically derived lists.
categorical_col = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'JobRole',
    'MaritalStatus',
    'OverTime',
    'newage',
    'masterylevel',
    'loyaltylevel',
    'oldyoung',
    'loyal',
]
numerical_col = [
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobSatisfaction',
    'MonthlyIncome',
    'NumCompaniesWorked',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
]

In [None]:
# Hold out 25% of the rows for validation, then detach the target column
# from both frames (pop removes 'Attrition' from the frame in place).
train_df, test_df = train_test_split(train_data, test_size=0.25, random_state=0)
train_y = train_df.pop('Attrition')
test_y = test_df.pop("Attrition")

In [None]:
# One dict per row for the selected feature columns, as DictVectorizer expects.
train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

# Fit the vocabulary on the training rows only and encode them in one step;
# the validation rows are encoded with that same fitted vocabulary.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

feature_names = vectorizer.get_feature_names_out().tolist()

In [None]:
# Wrap both splits as XGBoost DMatrix objects, carrying the encoded feature names.
dtrain = xgb.DMatrix(data=X_train, label=train_y, feature_names=feature_names)
dtest = xgb.DMatrix(data=X_val, label=test_y, feature_names=feature_names)

In [None]:
def objective(params):
    """Hyperopt objective: train XGBoost with ``params`` and score it by validation RMSE.

    Each call opens its own MLflow run, logs the sampled parameters, trains a
    booster on the module-level ``dtrain`` (with early stopping monitored on
    ``dtest``) and logs classification metrics plus RMSE of the predicted
    probabilities against ``test_y``.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}``; hyperopt minimises ``loss``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.set_tag("data", "original")
        mlflow.set_tag("loss", "RMSE")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=200
            )
        prediction0 = booster.predict(dtest)
        # Hard labels at the 0.5 probability threshold.
        prediction = (prediction0 >= 0.5).astype('int')
        # Take the square root explicitly: the value is logged as "rmse", and
        # the `squared` keyword is not available in every scikit-learn release.
        rmse = float(np.sqrt(mean_squared_error(y_true=test_y.astype('float'), y_pred=prediction0)))
        f1 = f1_score(test_y, prediction)
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1,
                  "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction), "rmse": rmse}
        mlflow.log_metrics(output)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. hp.loguniform(label, low, high) samples exp(u) with
# u uniform in [low, high], so the bounds below are natural-log exponents.
search_space = {
    # integer tree depth between 4 and 100
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched)
    'objective': 'binary:logistic',
    'seed': 42,
}

# 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
# Rescale the numeric columns into [0, 10]. The scaler is fitted on the
# training split only and then applied unchanged to the validation split.
# Both frames are overwritten in place.
scaler = MinMaxScaler(feature_range=(0, 10))
train_df[numerical_col] = scaler.fit_transform(train_df[numerical_col])
test_df[numerical_col] = scaler.transform(test_df[numerical_col])

# Row dicts of the (now scaled) feature columns.
train_dicts = train_df[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_df[categorical_col + numerical_col].to_dict(orient='records')

# Fresh vectorizer, fitted on the scaled training rows.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dicts)
X_val = vectorizer.transform(val_dicts)

feature_names = vectorizer.get_feature_names_out().tolist()

In [None]:
# Rebuild the DMatrix pair from the freshly vectorized splits.
dtrain = xgb.DMatrix(data=X_train, label=train_y, feature_names=feature_names)
dtest = xgb.DMatrix(data=X_val, label=test_y, feature_names=feature_names)

In [None]:
def objective(params):
    """Hyperopt objective for the scaled-feature experiment, scored by validation F1.

    Opens an MLflow run, logs ``params``, trains a booster on the module-level
    ``dtrain`` with early stopping monitored on ``dtest``, and logs accuracy,
    F1, precision and recall of the 0.5-thresholded predictions.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``; F1 is negated because hyperopt
        minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.set_tag("data", "engineered")
        # The numeric columns are scaled with MinMaxScaler in this notebook,
        # so tag the run accordingly (it was previously tagged "standard").
        mlflow.set_tag("scaler", "minmax")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=200
            )
        prediction0 = booster.predict(dtest)
        # Hard labels at the 0.5 probability threshold.
        prediction = (prediction0 >= 0.5).astype('int')
        f1 = f1_score(test_y, prediction)
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1,
                  "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction), }
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. hp.loguniform(label, low, high) samples exp(u) with
# u uniform in [low, high], so the bounds below are natural-log exponents.
search_space = {
    # integer tree depth between 4 and 100
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched)
    'objective': 'binary:logistic',
    'seed': 42,
}

# 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
# Hyperparameters copied by hand from a tuning run.
best_params = dict(
    learning_rate=0.12156676810099117,
    max_depth=74,
    min_child_weight=18.7107722766818,
    objective="binary:logistic",
    reg_alpha=0.03791356716583494,
    reg_lambda=0.05021397566611148,
    seed=42,
)

In [None]:
# Retrain one booster with hand-picked hyperparameters and score it on the
# validation split.
# TODO: MLflow logging is disabled for this cell. The original scaffold wrapped
# it in `mlflow.start_run()`, logged `best_params` and `output`, pickled the
# preprocessor to "../Models/preprocessor.b" and logged the booster via
# `mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")`.

train = xgb.DMatrix(X_train, label=train_y)
valid = xgb.DMatrix(X_val, label=test_y)

best_params = {
    "learning_rate": 0.5252319423885001,
    "max_depth": 60,
    "min_child_weight": 14.511009837774113,
    "objective": "binary:logistic",
    "reg_alpha": 0.024980836694059552,
    "reg_lambda": 0.002504555735308145,
    "seed": 42,
}

booster = xgb.train(
    params=best_params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=200,
)

# Probabilities -> hard labels at the 0.5 threshold, then the metric summary.
prediction0 = booster.predict(valid)
prediction = (prediction0 >= 0.5).astype('int')
f1 = f1_score(test_y, prediction)
output = {
    "acc": accuracy_score(test_y, prediction),
    "f1_score": f1,
    "precision": precision_score(test_y, prediction),
    "recall": recall_score(test_y, prediction),
}

In [None]:
# Show the metrics dict `output` as the cell result.
output

In [None]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

# Derived categorical features, each a pd.cut binning of one numeric column.
# Rows: (new column, source column, bin edges, bin labels). The order of the
# rows fixes the order in which the new columns are appended.
# NOTE(review): the 421 upper edge for 'masterylevel' looks like a typo for 42;
# it only widens the last bin - confirm against how the training features were built.
binned_features = [
    ('newage', 'Age', [17, 30, 42, 61 ], ['18 - 30', '31 - 42', '43 - 60']),
    ('oldyoung', 'Age', [17, 30, 61], ['young', 'old']),
    ('loyal', 'YearsAtCompany', [-1, 3, 42], ['fairly', 'loyal']),
    ('masterylevel', 'TotalWorkingYears', [-1, 3, 10, 421], ['entry', 'intermediate', 'master']),
    ('loyaltylevel', 'YearsAtCompany', [-1, 3, 10, 42], ['fairly', 'loyal', 'very-loyal']),
    ('dueforprom', 'YearsSinceLastPromotion', [-1, 5,  16], ['due', 'overdue']),
]
for new_col, source_col, bin_edges, bin_labels in binned_features:
    test_data[new_col] = pd.cut(x=test_data[source_col], bins=bin_edges, labels=bin_labels)


In [None]:
# Build the model inputs for the competition test set.
# The vectorizer and booster in scope here were fitted on MinMax-scaled numeric
# columns, so the same fitted `scaler` must be applied before vectorizing;
# otherwise the trees' split thresholds are compared against unscaled values.
test_features = test_data[categorical_col + numerical_col].copy()
test_features[numerical_col] = scaler.transform(test_features[numerical_col])

# 'records' is the full orient name; the abbreviation 'record' is rejected by
# recent pandas releases.
test_dat = test_features.to_dict(orient='records')
X_test = vectorizer.transform(test_dat)
eval_data = xgb.DMatrix(X_test)

# Predicted attrition probabilities, keyed by the test-set id.
prediction = booster.predict(eval_data)
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)

# Number of test rows that would be labelled positive at the 0.5 threshold.
(output_frame['Attrition'] >=0.5).astype('int').sum()

In [None]:
# Write the id / predicted-probability frame as a submission file (no index column).
output_frame.to_csv('../submissions/base007.csv', index = False)

In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [6]:
# Work on a copy so `train_data` keeps its target column.
new_train_data = train_data.copy()
y = new_train_data.pop('Attrition')

# Random under-sampling with sampling_strategy=0.4 and a fixed seed.
# NOTE(review): resampling happens BEFORE the train/validation split, so the
# validation split is drawn from the resampled data rather than from the
# original class distribution - confirm this is intended.
undersample = RandomUnderSampler(sampling_strategy=0.4, random_state=0)
X_train_new, y = undersample.fit_resample(new_train_data, y)

train_x, test_x, train_y, test_y = train_test_split(
    X_train_new,
    y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Numeric columns are left unscaled in this experiment (the MinMaxScaler step
# used earlier is intentionally not applied here).

# Row dicts of the selected feature columns for both splits.
train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

# Fresh vectorizer fitted on the training rows only.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dicts)
X_val = vectorizer.transform(val_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

dtrain = xgb.DMatrix(data=X_train, label=train_y, feature_names=feature_names)
dtest = xgb.DMatrix(data=X_val, label=test_y, feature_names=feature_names)

In [None]:
def objective(params):
    """Hyperopt objective for the under-sampling experiment, scored by validation ROC AUC.

    Opens an MLflow run, logs ``params``, trains a booster on the module-level
    ``dtrain`` with early stopping monitored on ``dtest``, then logs accuracy,
    F1, precision, recall and ROC AUC.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': -aroc, 'status': STATUS_OK}``; AUC is negated because
        hyperopt minimises the loss.
    """
    run_tags = {
        "model": "xgboost",
        "data": "full",
        "sampling": 'undersampling',
    }
    with mlflow.start_run():
        for tag_key, tag_value in run_tags.items():
            mlflow.set_tag(tag_key, tag_value)
        mlflow.log_params(params)

        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=200,
        )

        # AUC uses the raw probabilities; the other metrics use 0.5-thresholded labels.
        probabilities = model.predict(dtest)
        labels = (probabilities >= 0.5).astype('int')
        aroc = roc_auc_score(test_y, probabilities)
        f1 = f1_score(test_y, labels)

        metrics = {
            "acc": accuracy_score(test_y, labels),
            "f1_score": f1,
            "precision": precision_score(test_y, labels),
            "recall": recall_score(test_y, labels),
            "area_roc": aroc,
        }
        mlflow.log_metrics(metrics)

    return {'loss': -aroc, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. hp.loguniform(label, low, high) samples exp(u) with
# u uniform in [low, high], so the bounds below are natural-log exponents.
search_space = {
    # integer tree depth between 4 and 100
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched)
    'objective': 'binary:logistic',
    'seed': 42,
}

# 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [17]:
# Train a booster on the current (under-sampled, unscaled) split and report
# its metrics on the TRAINING rows.
#
# The training code below used to be commented out, which made this cell depend
# on a `booster` and `train` DMatrix left in the kernel by an earlier
# experiment. On a fresh top-to-bottom run those objects belong to a different
# split than `train_y`, so the metric calls would see mismatched lengths.
# Building everything here keeps the cell self-contained.

train = xgb.DMatrix(X_train, label=train_y)
valid = xgb.DMatrix(X_val, label=test_y)

best_params = {"learning_rate":0.10361831885254409,
               "max_depth":12,
               "min_child_weight":14.729488950791687,
               "objective":"binary:logistic",
               "reg_alpha":0.3425942221271259,
               "reg_lambda":0.12207176639906649,
               "seed":42}

booster = xgb.train(
    params=best_params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, 'validation')],
    early_stopping_rounds=200
)

# Scores on the training rows; AUC uses the raw probabilities, the other
# metrics use 0.5-thresholded labels.
prediction0 = booster.predict(train)
prediction = (prediction0 >=0.5).astype('int')
f1 = f1_score(train_y, prediction)
output = {"acc": accuracy_score(train_y, prediction), "f1_score": f1, 
            "precision": precision_score(train_y, prediction), "recall": recall_score(train_y, prediction), 'area_roc':roc_auc_score(train_y, prediction0)}

In [18]:
# Show the metrics dict `output` as the cell result.
output

{'acc': 0.9015151515151515,
 'f1_score': 0.8059701492537313,
 'precision': 0.8804347826086957,
 'recall': 0.7431192660550459,
 'area_roc': 0.9562062462040086}

In [13]:
test_data = pd.read_csv('../data/bct-data-summit/test.csv')

# Derived categorical features, each a pd.cut binning of one numeric column.
# Rows: (new column, source column, bin edges, bin labels). The order of the
# rows fixes the order in which the new columns are appended.
# NOTE(review): the 421 upper edge for 'masterylevel' looks like a typo for 42;
# it only widens the last bin - confirm against how the training features were built.
binned_features = [
    ('newage', 'Age', [17, 30, 42, 61 ], ['18 - 30', '31 - 42', '43 - 60']),
    ('oldyoung', 'Age', [17, 30, 61], ['young', 'old']),
    ('loyal', 'YearsAtCompany', [-1, 3, 42], ['fairly', 'loyal']),
    ('masterylevel', 'TotalWorkingYears', [-1, 3, 10, 421], ['entry', 'intermediate', 'master']),
    ('loyaltylevel', 'YearsAtCompany', [-1, 3, 10, 42], ['fairly', 'loyal', 'very-loyal']),
    ('dueforprom', 'YearsSinceLastPromotion', [-1, 5,  16], ['due', 'overdue']),
]
for new_col, source_col, bin_edges, bin_labels in binned_features:
    test_data[new_col] = pd.cut(x=test_data[source_col], bins=bin_edges, labels=bin_labels)


In [14]:
# Encode the test-set features with the fitted vectorizer.
# 'records' is the full orient name; the abbreviation 'record' is rejected by
# recent pandas releases.
test_dat  = test_data[categorical_col + numerical_col].to_dict(orient='records')
X_test = vectorizer.transform(test_dat)
eval_data  = xgb.DMatrix(X_test)

# Predicted attrition probabilities, keyed by the test-set id.
prediction = booster.predict(eval_data)
dicts = {'id': test_data['id'], 'Attrition': prediction}
output_frame = pd.DataFrame(dicts)

# Number of test rows that would be labelled positive at the 0.5 threshold.
(output_frame['Attrition'] >=0.5).astype('int').sum()

147

In [15]:
# Write the id / predicted-probability frame as a submission file (no index column).
output_frame.to_csv('../submissions/xgb002.csv', index = False)

In [None]:
def objective(params):
    """Hyperopt objective for the under-sampling experiment, scored by validation F1.

    Opens an MLflow run, logs ``params``, trains a booster on the module-level
    ``dtrain`` with early stopping monitored on ``dtest``, and logs accuracy,
    F1, precision, recall and the RMSE of the predicted probabilities.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``; F1 is negated because hyperopt
        minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.set_tag("data", "engineered")
        #mlflow.set_tag("scaler", "standard")
        mlflow.set_tag("sampling", 'undersampling')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=200
            )
        prediction0 = booster.predict(dtest)
        # Hard labels at the 0.5 probability threshold.
        prediction = (prediction0 >= 0.5).astype('int')
        # Take the square root explicitly: the value is logged as "rmse", and
        # the `squared` keyword is not available in every scikit-learn release.
        rmse = float(np.sqrt(mean_squared_error(y_true=test_y.astype('float'), y_pred=prediction0)))
        f1 = f1_score(test_y, prediction)
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1,
                  "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction), "rmse": rmse}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. hp.loguniform(label, low, high) samples exp(u) with
# u uniform in [low, high], so the bounds below are natural-log exponents.
search_space = {
    # integer tree depth between 4 and 100
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched)
    'objective': 'binary:logistic',
    'seed': 42,
}

# 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
# Seed SMOTE so the synthetic minority samples are reproducible, matching the
# random_state=0 used by the other stochastic steps in this notebook.
oversample = SMOTE(random_state=0)

# Split first; the sampler is applied later, to the training portion only.
new_train_data = train_data.copy()
y = new_train_data.pop('Attrition')
train_x,  test_x,train_y, test_y = train_test_split(new_train_data, y, test_size = 0.25, random_state=0)

In [None]:
# Row dicts of the selected feature columns for both splits.
train_dicts = train_x[categorical_col + numerical_col].to_dict(orient='records')
val_dicts = test_x[categorical_col + numerical_col].to_dict(orient='records')

# Fresh vectorizer fitted on the training rows only.
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_dicts)
X_val = vectorizer.transform(val_dicts)
feature_names = vectorizer.get_feature_names_out().tolist()

# Over-sample the encoded training rows only; the validation rows are untouched.
X_train_new, train_y_new = oversample.fit_resample(X_train, train_y)

In [None]:
# Train on the SMOTE-resampled rows (X_train_new / train_y_new). The original
# X_train / train_y were passed here before, which left the over-sampled data
# unused even though the runs are tagged "oversampling".
dtrain = xgb.DMatrix(X_train_new, label = train_y_new, feature_names = feature_names)
# Validation stays on the untouched hold-out rows.
dtest = xgb.DMatrix(X_val, label = test_y, feature_names = feature_names)

In [None]:
def objective(params):
    """Hyperopt objective for the over-sampling experiment, scored by validation F1.

    Opens an MLflow run, logs ``params``, trains a booster on the module-level
    ``dtrain`` with early stopping monitored on ``dtest``, and logs accuracy,
    F1, precision, recall and the RMSE of the predicted probabilities.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the hyperopt search space.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``; F1 is negated because hyperopt
        minimises the loss.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.set_tag("data", "engineered")
        #mlflow.set_tag("scaler", "standard")
        mlflow.set_tag("sampling", 'oversampling')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=1000,
            evals=[(dtest, 'validation')],
            early_stopping_rounds=200
            )
        prediction0 = booster.predict(dtest)
        # Hard labels at the 0.5 probability threshold.
        prediction = (prediction0 >= 0.5).astype('int')
        # Take the square root explicitly: the value is logged as "rmse", and
        # the `squared` keyword is not available in every scikit-learn release.
        rmse = float(np.sqrt(mean_squared_error(y_true=test_y.astype('float'), y_pred=prediction0)))
        f1 = f1_score(test_y, prediction)
        output = {"acc": accuracy_score(test_y, prediction), "f1_score": f1,
                  "precision": precision_score(test_y, prediction), "recall": recall_score(test_y, prediction), "rmse": rmse}
        mlflow.log_metrics(output)

    return {'loss': -f1, 'status': STATUS_OK}

In [None]:
# Hyperopt search space. hp.loguniform(label, low, high) samples exp(u) with
# u uniform in [low, high], so the bounds below are natural-log exponents.
search_space = {
    # integer tree depth between 4 and 100
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched)
    'objective': 'binary:logistic',
    'seed': 42,
}

# 50 TPE-guided evaluations of `objective`.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [None]:
# Hyperparameters copied by hand from a tuning run.
best_params = dict(
    learning_rate=0.1992334055006196,
    max_depth=71,
    min_child_weight=6.647476155770954,
    objective="binary:logistic",
    reg_alpha=0.009732448304248409,
    reg_lambda=0.01586123080836735,
    seed=42,
)