In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
import optuna
from sklearn.preprocessing import StandardScaler
from category_encoders import LeaveOneOutEncoder
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import copy

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [18]:
BASE_PATH = Path('../input/playground-series-s3e3')

# id is not going to be an informative feature, so we're dropping it for train
# but since we'll need test set's ids to make the submission file, so we'll save those in  a separate varible before dropping
train = pd.read_csv(BASE_PATH / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_PATH / "test.csv")
test_idx = test.id
test = test.drop(columns="id")

# It's been shown that incorporating original data, improves scores - at least on the public leaderboard. So let's do that!
original = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,Male,...,80,1,10,2,3,10,0,7,8,0
1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,Male,...,80,1,4,3,3,4,2,0,3,0
2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,Male,...,80,2,4,3,3,3,2,1,2,0
3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,Female,...,80,0,15,1,1,6,0,0,2,0
4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,Female,...,80,0,31,0,3,31,14,4,10,1


# Preprocessing

In [19]:
original['Attrition'] = (original['Attrition'] == 'Yes').astype(np.int64)

# in original data, id is termed as "EmployeeNumber", so let's drop it
original.drop(columns="EmployeeNumber", inplace=True)

In [20]:
# now reordering the features in original dataset
original = original[list(train.columns)]

In [21]:
original["is_original"] = 1
train["is_original"] = 0
test["is_original"] = 0

In [22]:
train_extended = pd.concat([train, original]).reset_index(drop=True)
len(train_extended)

3147

In [23]:
y = train_extended.Attrition
y

0       0
1       0
2       0
3       0
4       1
       ..
3142    0
3143    0
3144    0
3145    0
3146    0
Name: Attrition, Length: 3147, dtype: int64

In [26]:
X_train = train_extended.drop(columns="Attrition")
X_test = test.copy()

## Dropping the useless featuers and encoding the categorical ones

In [27]:
feats_to_drop = [col for col in X_train.columns if X_train[col].nunique()==1]
cat_features = [col for col in X_train.columns if X_train[col].nunique() <= 10 and X_train[col].nunique() > 1]

In [28]:
X_train.drop(columns=feats_to_drop, inplace=True)
X_test.drop(columns=feats_to_drop, inplace=True)

In [29]:
loo_enc = LeaveOneOutEncoder(sigma=0.05)
loo_enc.fit(X_train[cat_features], y)

X_train[cat_features] = loo_enc.transform(X_train[cat_features])
X_test[cat_features] = loo_enc.transform(X_test[cat_features])

X_train.head(3)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,is_original
0,36,0.223048,599,0.121711,24,3,0.124383,4,0.147996,42,...,2,1,10,2,3,10,0,7,8,0
1,35,0.12859,921,0.173391,8,3,0.103659,1,0.147996,46,...,4,1,4,3,3,4,2,0,3,0
2,32,0.12859,718,0.173391,26,3,0.196141,3,0.147996,80,...,4,2,4,3,3,3,2,1,2,0


## Scaling the numerical featuers

In [30]:
num_features = list(set(X_test.columns) - set(cat_features))

In [34]:
sc = StandardScaler()
sc.fit(X_train[num_features])
X_train[num_features] = sc.transform(X_train[num_features])
X_test[num_features] = sc.transform(X_test[num_features])

## Let's separate competition and original datasets

In [36]:
X_train_comp = X_train[X_train.is_original==0]
y_train_comp = y[X_train_comp.index]

X_train_org = X_train[X_train.is_original==1]
y_train_org = y[X_train_org.index].reset_index(drop=True)
X_train_org.reset_index(drop=True, inplace=True)

## Our data is ready for training. So let's building some models and then do bagging

# Bagging

### Defining best/tuned params for each base model

In [66]:
# X G B O O S T
xgb_params = {'n_estimators': 195,
                 'max_depth': 4,
                 'learning_rate': 0.1562142569601105,
                 'min_child_weight': 9,
                 'gamma': 0.062380752916410806,
                 'subsample': 0.9,
                 'colsample_bytree': 0.2,
#                  'early_stoppig_rounds': 63
             }

# L I G H T G B M
lgbm_params = {"objective": "binary",
               "unbalance": True,
               'n_estimators': 289,
                 'num_rounds': 100,
                 'learning_rate': 0.20387218552865483,
                 'num_leaves': 49,
                 'max_depth': 2,
                 'min_data_in_leaf': 180,
                 'lambda_l1': 0.29454856381940814,
                 'lambda_l2': 0.04768773451967244,
                 'min_gain_to_split': 2.4953566257592468,
                 'bagging_fraction': 0.42646008454113976,
                 'feature_fraction': 0.44305864350467467,
#                  'early_stopping_rounds': 117
              }


# C A T B O O S T
cat_params = {'n_estimators': 1054,
                 'loss_function': 'CrossEntropy',
                 'learning_rate': 0.28958661851562734,
                 'l2_leaf_reg': 0.03231273388976541,
                 'colsample_bylevel': 0.08854889705957293,
                 'depth': 1,
                 'boosting_type': 'Plain',
                 'bootstrap_type': 'MVS',
                 'min_data_in_leaf': 8,
                 'one_hot_max_size': 18,
#                  'early_stopping_rounds': 181
             }

# R A N D O M   F O R E S T
rf_params = {'n_estimators': 269,
                 'max_features': 4,
                 'min_samples_split': 22,
                 'min_samples_leaf': 3,
                 'max_depth': 22}

In [68]:
base_learners = []

xgb_model = xgb.XGBClassifier(**xgb_params)
base_learners.append(xgb_model)

lgbm_model = lgbm.LGBMClassifier(**lgbm_params)
base_learners.append(lgbm_model)

cat_model = catboost.CatBoostClassifier(**cat_params)
base_learners.append(cat_model)

rf_model = RandomForestClassifier(**rf_params)
base_learners.append(rf_model)

In [69]:
predictions = np.zeros((len(base_learners), len(X_test)))
for i, base_learner in enumerate(base_learners):
    # intiating the enesemble
    ensemble = BaggingClassifier(base_estimator=base_learner, n_estimators=10,
                                n_jobs=10, oob_score=True)
    
    ensemble.fit(X_train, y)    
    # getting the predictins scores
    pred_scores = ensemble.predict_proba(X_test)[:, 1]

    predictions[i] = pred_scores

In [None]:
# let's try adding logistic regression to the mix even though it couldn't go above 83.2 or something
# i doubt it will boost any score but let's see 
base_learners_2 = copy.copy(base_learners)

lr_params = {'solver': 'lbfgs',
                 'C': 98.00687488883742,
                 'tol': 2.5878413413159397e-05,
                 'max_iter': 394}

lr_model = LogisticRegression(**lr_params)

base_learners_2.append(lr_model)

predictions_w_lr = np.zeros((len(base_learners_2), len(X_test)))
for i, base_learner in enumerate(base_learners_2):
    # intiating the enesemble
    ensemble_w_lr = BaggingClassifier(base_estimator=base_learner, n_estimators=10,
                                n_jobs=10, oob_score=True)
    
    ensemble_w_lr.fit(X_train, y)    
    # getting the predictins scores
    pred_scores = ensemble_w_lr.predict_proba(X_test)[:, 1]

    predictions_w_lr[i] = pred_scores

In [81]:
final_preds = predictions.mean(axis=0)
final_preds_w_lr = predictions_w_lr.mean(axis=0)

In [77]:
submission = pd.DataFrame({"id": test_idx, "Attrition": final_preds})
submission.head()

Unnamed: 0,id,Attrition
0,1677,0.280711
1,1678,0.151964
2,1679,0.079569
3,1680,0.088797
4,1681,0.592342


In [83]:
submission_w_lr = pd.DataFrame({"id": test_idx, "Attrition": final_preds_w_lr})
submission_w_lr.head()

Unnamed: 0,id,Attrition
0,1677,0.257573
1,1678,0.174536
2,1679,0.08243
3,1680,0.084116
4,1681,0.510196


In [85]:
# submission.to_csv("submission.csv", index=False)
submission_w_lr.to_csv("submission.csv", index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


937:	learn: 0.1757086	total: 8.91s	remaining: 1.1s
938:	learn: 0.1756625	total: 8.91s	remaining: 1.09s
939:	learn: 0.1756000	total: 8.91s	remaining: 1.08s
940:	learn: 0.1755671	total: 8.92s	remaining: 1.07s
941:	learn: 0.1755396	total: 8.92s	remaining: 1.06s
942:	learn: 0.1755316	total: 8.92s	remaining: 1.05s
943:	learn: 0.1754951	total: 8.92s	remaining: 1.04s
944:	learn: 0.1754615	total: 8.92s	remaining: 1.03s
945:	learn: 0.1754084	total: 8.92s	remaining: 1.02s
946:	learn: 0.1753916	total: 8.92s	remaining: 1.01s
947:	learn: 0.1753368	total: 8.93s	remaining: 998ms
948:	learn: 0.1753131	total: 8.93s	remaining: 988ms
949:	learn: 0.1752693	total: 8.94s	remaining: 978ms
950:	learn: 0.1752233	total: 8.94s	remaining: 968ms
951:	learn: 0.1752048	total: 8.94s	remaining: 958ms
952:	learn: 0.1751891	total: 8.94s	remaining: 948ms
953:	learn: 0.1751357	total: 8.94s	remaining: 938ms
954:	learn: 0.1751224	total: 8.95s	remaining: 927ms
955:	learn: 0.1751225	total: 8.95s	remaining: 917ms
956:	learn: 0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


937:	learn: 0.1813259	total: 8.83s	remaining: 1.09s
938:	learn: 0.1812986	total: 8.86s	remaining: 1.08s
939:	learn: 0.1812839	total: 8.87s	remaining: 1.07s
940:	learn: 0.1812647	total: 8.89s	remaining: 1.07s
941:	learn: 0.1812269	total: 8.89s	remaining: 1.06s
942:	learn: 0.1812022	total: 8.91s	remaining: 1.05s
943:	learn: 0.1811732	total: 8.91s	remaining: 1.04s
944:	learn: 0.1811508	total: 8.93s	remaining: 1.03s
945:	learn: 0.1811383	total: 8.93s	remaining: 1.02s
946:	learn: 0.1811080	total: 8.93s	remaining: 1.01s
947:	learn: 0.1810900	total: 9.03s	remaining: 1.01s
948:	learn: 0.1810537	total: 9.03s	remaining: 999ms
949:	learn: 0.1810341	total: 9.03s	remaining: 989ms
950:	learn: 0.1809912	total: 9.03s	remaining: 979ms
951:	learn: 0.1809776	total: 9.05s	remaining: 969ms
952:	learn: 0.1809519	total: 9.05s	remaining: 959ms
953:	learn: 0.1809352	total: 9.07s	remaining: 951ms
954:	learn: 0.1809231	total: 9.07s	remaining: 940ms
955:	learn: 0.1808992	total: 9.14s	remaining: 937ms
956:	learn: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


937:	learn: 0.2143918	total: 8.89s	remaining: 1.1s
938:	learn: 0.2143460	total: 8.9s	remaining: 1.09s
939:	learn: 0.2142644	total: 8.91s	remaining: 1.08s
940:	learn: 0.2142373	total: 8.91s	remaining: 1.07s
941:	learn: 0.2142205	total: 8.93s	remaining: 1.06s
942:	learn: 0.2141804	total: 8.93s	remaining: 1.05s
943:	learn: 0.2141735	total: 8.93s	remaining: 1.04s
944:	learn: 0.2141267	total: 8.95s	remaining: 1.03s
945:	learn: 0.2141148	total: 8.95s	remaining: 1.02s
946:	learn: 0.2141023	total: 8.96s	remaining: 1.01s
947:	learn: 0.2140672	total: 8.98s	remaining: 1s
948:	learn: 0.2140229	total: 9s	remaining: 995ms
949:	learn: 0.2140042	total: 9.01s	remaining: 987ms
950:	learn: 0.2139549	total: 9.03s	remaining: 978ms
951:	learn: 0.2139194	total: 9.03s	remaining: 968ms
952:	learn: 0.2139026	total: 9.06s	remaining: 960ms
953:	learn: 0.2138914	total: 9.1s	remaining: 953ms
954:	learn: 0.2138401	total: 9.1s	remaining: 943ms
955:	learn: 0.2138053	total: 9.12s	remaining: 935ms
956:	learn: 0.2138053	

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


937:	learn: 0.1932755	total: 8.99s	remaining: 1.11s
938:	learn: 0.1932356	total: 9s	remaining: 1.1s
939:	learn: 0.1932094	total: 9.01s	remaining: 1.09s
940:	learn: 0.1931855	total: 9.01s	remaining: 1.08s
941:	learn: 0.1931666	total: 9.02s	remaining: 1.07s
942:	learn: 0.1931576	total: 9.02s	remaining: 1.06s
943:	learn: 0.1931205	total: 9.04s	remaining: 1.05s
944:	learn: 0.1931038	total: 9.05s	remaining: 1.04s
945:	learn: 0.1930759	total: 9.05s	remaining: 1.03s
946:	learn: 0.1930670	total: 9.06s	remaining: 1.02s
947:	learn: 0.1930243	total: 9.08s	remaining: 1.01s
948:	learn: 0.1930167	total: 9.08s	remaining: 1s
949:	learn: 0.1929831	total: 9.08s	remaining: 994ms
950:	learn: 0.1929522	total: 9.11s	remaining: 986ms
951:	learn: 0.1929237	total: 9.11s	remaining: 976ms
952:	learn: 0.1928765	total: 9.12s	remaining: 966ms
953:	learn: 0.1928543	total: 9.12s	remaining: 956ms
954:	learn: 0.1928442	total: 9.12s	remaining: 945ms
955:	learn: 0.1926460	total: 9.15s	remaining: 938ms
956:	learn: 0.19263

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


937:	learn: 0.1952258	total: 9.15s	remaining: 1.13s
938:	learn: 0.1951921	total: 9.15s	remaining: 1.12s
939:	learn: 0.1951738	total: 9.22s	remaining: 1.12s
940:	learn: 0.1951569	total: 9.22s	remaining: 1.11s
941:	learn: 0.1951355	total: 9.22s	remaining: 1.1s
942:	learn: 0.1951054	total: 9.26s	remaining: 1.09s
943:	learn: 0.1950958	total: 9.26s	remaining: 1.08s
944:	learn: 0.1950943	total: 9.27s	remaining: 1.07s
945:	learn: 0.1950387	total: 9.27s	remaining: 1.06s
946:	learn: 0.1949854	total: 9.27s	remaining: 1.05s
947:	learn: 0.1949555	total: 9.31s	remaining: 1.04s
948:	learn: 0.1949094	total: 9.32s	remaining: 1.03s
949:	learn: 0.1949081	total: 9.32s	remaining: 1.02s
950:	learn: 0.1948838	total: 9.32s	remaining: 1.01s
951:	learn: 0.1948656	total: 9.36s	remaining: 1s
952:	learn: 0.1948277	total: 9.36s	remaining: 992ms
953:	learn: 0.1947887	total: 9.36s	remaining: 981ms
954:	learn: 0.1947521	total: 9.36s	remaining: 971ms
955:	learn: 0.1947373	total: 9.36s	remaining: 960ms
956:	learn: 0.19

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


936:	learn: 0.1981190	total: 8.9s	remaining: 1.11s
937:	learn: 0.1980996	total: 8.9s	remaining: 1.1s
938:	learn: 0.1980637	total: 8.9s	remaining: 1.09s
939:	learn: 0.1980551	total: 8.93s	remaining: 1.08s
940:	learn: 0.1980195	total: 8.93s	remaining: 1.07s
941:	learn: 0.1979719	total: 8.94s	remaining: 1.06s
942:	learn: 0.1979303	total: 8.94s	remaining: 1.05s
943:	learn: 0.1979230	total: 8.95s	remaining: 1.04s
944:	learn: 0.1979074	total: 8.97s	remaining: 1.03s
945:	learn: 0.1978682	total: 8.97s	remaining: 1.02s
946:	learn: 0.1978256	total: 9.01s	remaining: 1.02s
947:	learn: 0.1977742	total: 9.03s	remaining: 1.01s
948:	learn: 0.1977670	total: 9.04s	remaining: 1000ms
949:	learn: 0.1977351	total: 9.04s	remaining: 989ms
950:	learn: 0.1977121	total: 9.05s	remaining: 980ms
951:	learn: 0.1976860	total: 9.05s	remaining: 970ms
952:	learn: 0.1976686	total: 9.08s	remaining: 962ms
953:	learn: 0.1976422	total: 9.08s	remaining: 952ms
954:	learn: 0.1976205	total: 9.1s	remaining: 943ms
955:	learn: 0.19

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


931:	learn: 0.1863371	total: 869ms	remaining: 114ms
932:	learn: 0.1863222	total: 870ms	remaining: 113ms
933:	learn: 0.1863039	total: 871ms	remaining: 112ms
934:	learn: 0.1862053	total: 872ms	remaining: 111ms
935:	learn: 0.1861735	total: 872ms	remaining: 110ms
936:	learn: 0.1861329	total: 873ms	remaining: 109ms
937:	learn: 0.1861073	total: 874ms	remaining: 108ms
938:	learn: 0.1860534	total: 874ms	remaining: 107ms
939:	learn: 0.1860328	total: 875ms	remaining: 106ms
940:	learn: 0.1860299	total: 876ms	remaining: 105ms
941:	learn: 0.1859981	total: 877ms	remaining: 104ms
942:	learn: 0.1859981	total: 877ms	remaining: 103ms
943:	learn: 0.1859652	total: 878ms	remaining: 102ms
944:	learn: 0.1859607	total: 879ms	remaining: 101ms
945:	learn: 0.1859298	total: 879ms	remaining: 100ms
946:	learn: 0.1859210	total: 880ms	remaining: 99.4ms
947:	learn: 0.1858988	total: 881ms	remaining: 98.5ms
948:	learn: 0.1858754	total: 881ms	remaining: 97.5ms
949:	learn: 0.1858491	total: 882ms	remaining: 96.6ms
950:	lea