# TODO:
https://www.kaggle.com/competitions/playground-series-s3e2/discussion/378795
https://www.kaggle.com/competitions/playground-series-s3e2/discussion/378780

These show that we should incorporate original data but when validating using K fold methods, we should only validate based on data in the competition dataset and not on original dataset. So implement this technique for this competition

### Also, bagging resulted in a lot better score with simple mean in the last competition, thought it didn't score much better on public, it absolutely ranked up to 60th position in the final private LB. So,
## Remember to trust your cvs over pbl

# A Few more TODOs:
* select features with less than or equal to 10 values as catergorical features, instead of current 20, see if it improves the score
* Try target encoding, weights of evidence AND leave one out encoding, see which one performs better

# Imports

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler

from category_encoders import TargetEncoder, LeaveOneOutEncoder, WOEEncoder

In [3]:
import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [4]:
BASE_PATH = Path('../input/playground-series-s3e3')

# id is not going to be an informative feature, so we're dropping it for train
# but since we'll need test set's ids to make the submission file, so we'll save those in  a separate varible before dropping
train = pd.read_csv(BASE_PATH / "train.csv").drop(columns="id")
test = pd.read_csv(BASE_PATH / "test.csv")
test_idx = test.id
test = test.drop(columns="id")

# It's been shown that incorporating original data, improves scores - at least on the public leaderboard. So let's do that!
original = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')

train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,36,Travel_Frequently,599,Research & Development,24,3,Medical,1,4,Male,...,80,1,10,2,3,10,0,7,8,0
1,35,Travel_Rarely,921,Sales,8,3,Other,1,1,Male,...,80,1,4,3,3,4,2,0,3,0
2,32,Travel_Rarely,718,Sales,26,3,Marketing,1,3,Male,...,80,2,4,3,3,3,2,1,2,0
3,38,Travel_Rarely,1488,Research & Development,2,3,Medical,1,3,Female,...,80,0,15,1,1,6,0,0,2,0
4,50,Travel_Rarely,1017,Research & Development,5,4,Medical,1,2,Female,...,80,0,31,0,3,31,14,4,10,1


In [5]:
original.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Pre-Pre-Processing

### Let's make the feature names and order consistent b/w our competition dataset and original dataset, before we concatenate

In [6]:
original['Attrition'] = (original['Attrition'] == 'Yes').astype(np.int64)

# in original data, id is termed as "EmployeeNumber", so let's drop it
original.drop(columns="EmployeeNumber", inplace=True)

In [7]:
# now reordering the features in original dataset
original = original[list(train.columns)]

### Someone in the first compeition showed that adding a source feature i.e. a feature that indicates whether a given record is from original dataset or synthetic improves performance, so let's' do that!

In [8]:
original["is_original"] = 1
train["is_original"] = 0
test["is_original"] = 0

### Let's finally concatenate

In [9]:
train_extended = pd.concat([train, original]).reset_index(drop=True)
len(train_extended)

3147

### checking for null values

In [11]:
pd.concat([train_extended.isnull().sum().rename("Missing in Train"),
           test.isnull().sum().rename("Missing in Test")], axis=1).sort_values(by="Missing in Train")

Unnamed: 0,Missing in Train,Missing in Test
Age,0,0.0
Over18,0,0.0
OverTime,0,0.0
PercentSalaryHike,0,0.0
PerformanceRating,0,0.0
RelationshipSatisfaction,0,0.0
StandardHours,0,0.0
NumCompaniesWorked,0,0.0
StockOptionLevel,0,0.0
TrainingTimesLastYear,0,0.0


#### Insights: No missing values! Something to celebrate! :p

## Let's also concatenate test data to train

In [12]:
y = train_extended.Attrition
y

0       0
1       0
2       0
3       0
4       1
       ..
3142    0
3143    0
3144    0
3145    0
3146    0
Name: Attrition, Length: 3147, dtype: int64

In [13]:
df = pd.concat([train_extended.drop(columns="Attrition"), test])

# Preprocessing

### Identifying Categorical Features

In [14]:
df.dtypes.sort_values()

Age                          int64
YearsSinceLastPromotion      int64
YearsInCurrentRole           int64
YearsAtCompany               int64
WorkLifeBalance              int64
TrainingTimesLastYear        int64
TotalWorkingYears            int64
StockOptionLevel             int64
StandardHours                int64
RelationshipSatisfaction     int64
PerformanceRating            int64
PercentSalaryHike            int64
NumCompaniesWorked           int64
MonthlyRate                  int64
YearsWithCurrManager         int64
MonthlyIncome                int64
JobSatisfaction              int64
DailyRate                    int64
DistanceFromHome             int64
Education                    int64
EmployeeCount                int64
HourlyRate                   int64
EnvironmentSatisfaction      int64
JobLevel                     int64
JobInvolvement               int64
is_original                  int64
Gender                      object
MaritalStatus               object
OverTime            

### Remember, being of type int, doesn't mean that the feature cannot be categorial.
#### Let's check for unique values in each column

In [15]:
df.nunique().sort_values()

StandardHours                  1
EmployeeCount                  1
Over18                         1
is_original                    2
PerformanceRating              2
OverTime                       2
Gender                         2
BusinessTravel                 3
Department                     3
MaritalStatus                  3
RelationshipSatisfaction       4
JobSatisfaction                4
WorkLifeBalance                4
StockOptionLevel               5
JobInvolvement                 5
EnvironmentSatisfaction        5
Education                      6
JobLevel                       6
EducationField                 6
TrainingTimesLastYear          7
JobRole                        9
NumCompaniesWorked            11
PercentSalaryHike             15
YearsSinceLastPromotion       16
YearsWithCurrManager          18
YearsInCurrentRole            19
DistanceFromHome              29
YearsAtCompany                38
TotalWorkingYears             41
Age                           43
HourlyRate

#### INSIGHTS: Taking a quick look at number of unique values in features reveals that we should be safe setting the threshold for to 20 unique values for what consitutes as a categorical feature
#### We'll drop columns with only one value as they bring nothing to the table

#### But feel free to use your own intuition and test & trial to figure our what's works best in terms of threshold and features

In [16]:
feats_to_drop = [col for col in df.columns if df[col].nunique()==1]
cat_features = [col for col in df.columns if df[col].nunique() <= 20 and df[col].nunique() > 1]

In [17]:
df.drop(columns=feats_to_drop, inplace=True)

#### We won't use one hot encoder here, because we already have a large ratio of features to rows and one hotting would increase that ratio by a large margin even further which will result in severe overfitting
#### Rather we'll use ordinal/label encoder (they're basically the same thing)

In [18]:
len(df), len(y)

(4266, 3147)

In [19]:
# but first let's separate test and train_extended
X_train = df.iloc[:-len(test), :]
X_test = df.iloc[-len(test): , :]

In [20]:
target_enc = TargetEncoder()
loo_enc = LeaveOneOutEncoder(sigma=0.05)
woe_enc = WOEEncoder(sigma=0.05)

target_enc.fit(X_train[cat_features], y)

X_train[cat_features] = target_enc.transform(X_train[cat_features])
X_test[cat_features] = target_enc.transform(X_test[cat_features])

X_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,is_original
0,36,0.223048,599,0.121711,24,3,0.124383,4,0.147996,42,...,2,1,10,2,3,10,0,7,8,0
1,35,0.12859,921,0.173391,8,3,0.103659,1,0.147996,46,...,4,1,4,3,3,4,2,0,3,0
2,32,0.12859,718,0.173391,26,3,0.196141,3,0.147996,80,...,4,2,4,3,3,3,2,1,2,0
3,38,0.12859,1488,0.121711,2,3,0.124383,3,0.124063,40,...,3,0,15,1,1,6,0,0,2,0
4,50,0.12859,1017,0.121711,5,4,0.124383,2,0.124063,37,...,3,0,31,0,3,31,14,4,10,0


In [22]:
numerical_feats = list(set(df.columns) - set(cat_features))

(len(numerical_feats) + len(cat_features)) == len(df.columns)

True

In [23]:
numerical_feats

['MonthlyIncome',
 'HourlyRate',
 'TotalWorkingYears',
 'Age',
 'DistanceFromHome',
 'DailyRate',
 'YearsAtCompany',
 'MonthlyRate']

## Always a good idea to scale the features

In [26]:
sc = StandardScaler()
X_train[numerical_feats] = sc.fit_transform(X_train[numerical_feats])
X_test[numerical_feats] = sc.transform(X_test[numerical_feats])

### Let's seprate comp and original sets

In [27]:
#let's also separate original and copetition data
X_comp = X_train[X_train.is_original==0]
y_comp = y[X_comp.index]

X_original = X_train[X_train.is_original==1]
y_original = y[X_original.index].reset_index(drop=True)
X_original = X_original.reset_index(drop=True)

# Modelling

### But first, let's setup cross validation

In [23]:
# for i, (x, y) in enumerate(zip([1,2,3], [4,5,6])):
#     print(f"{'*'*10} {i}")
#     print(f"X: {x}")
#     print(f"Y: {y}")    

In [24]:
# a = np.array([1,2,3])
# b = np.array([4,5,6])

# np.append(a, b)

In [33]:
# we're gonna train on the combined dataset but, we'll only calculate the validation score only on comp data

# N_FOLDS = 10

def cross_validate(X, y, model, model_verbose=None, verbose=None, X_original=None, y_original=None):
    N_FOLDS = 5
    all_scores = np.zeros(N_FOLDS)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1337)

    
    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # for training we'll use data from both datasets
        if X_original is not None:
            X_tr = pd.concat([X_tr, X_original], axis=0).reset_index(drop=True)
            y_tr = pd.concat([y_tr, y_original], axis=0).reset_index(drop=True)
               
        model.fit(X_tr, y_tr, 
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=50,
                 verbose=model_verbose)
        
        y_pred = model.predict_proba(X_val)[:, 1]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold {fold_id} \t auc: {auc}")
        
        all_scores[fold_id] = (auc)
    
    avg_auc = np.mean(all_scores)
    
    print(f"Avg AUC: {avg_auc}")

In [34]:
# random params values - make sure to tune yours
xgb_params = {'n_estimators': 150,
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'min_child_weight': 4,
                 'subsample': 0.7,
                 'colsample_bytree': 0.3
             }


xgb_clf = xgb.XGBClassifier(**xgb_params)

cross_validate(X_comp, y_comp, xgb_clf, model_verbose=False,
                           X_original=X_original, y_original=y_original)

# xgb_clf.fit(X_train, y, verbose=0)

Fold 0 	 auc: 0.8356418918918919
Fold 1 	 auc: 0.9069256756756756
Fold 2 	 auc: 0.8169491525423729
Fold 3 	 auc: 0.8648305084745762
Fold 4 	 auc: 0.9005084745762711
Avg AUC: 0.8649711406321575


In [None]:
# random params but feel free to tune
lgbm_params = {'n_estimators': 407,
                 'num_rounds': 274,
                 'learning_rate': 0.1,
                 'num_leaves': 195,
                 'max_depth': 9,
                 'min_data_in_leaf': 46,
                 'lambda_l1': 0.01,
                 'lambda_l2': 0.6,
                 'min_gain_to_split': 1.42,
                 'bagging_fraction': 0.45,
                 'feature_fraction': 0.3}


lgbm_clf = lgbm.LGBMClassifier(**lgbm_params)

cross_validate(X_train, y, lgbm_clf, model_verbose=-1)

# lgbm_clf.fit(X_train, y, verbose=False)

In [50]:
#random params but feel free to tune
catboost_params = {'loss_function': 'CrossEntropy',
                     'learning_rate': 0.76,
                     'l2_leaf_reg': 0.014,
                     'colsample_bylevel': 0.06,
                     'depth': 1,
                     'boosting_type': 'Plain',
                     'bootstrap_type': 'Bernoulli',
                     'min_data_in_leaf': 18,
                     'one_hot_max_size': 14,
                     'subsample': 0.99}

catboost_clf = catboost.CatBoostClassifier(**catboost_params)

cross_validate(X_comp, y_comp, catboost_clf, model_verbose=False)

# catboost_clf.fit(X_train, y, verbose=False)

Fold 0 	 auc: 0.8514358108108109
Fold 1 	 auc: 0.9086993243243243
Fold 2 	 auc: 0.8044915254237287
Fold 3 	 auc: 0.8661864406779661
Fold 4 	 auc: 0.9154237288135594
Avg AUC: 0.8692473660100777


## INSIGHTS:
let's use this method of cross validation to
* Tune all our models
* Select top k
* Take their predictions average
* submit

In [75]:
np.random.randint(1, 10, size=(2,3))

array([[7, 2, 9],
       [7, 1, 7]])

In [83]:
some_X = pd.DataFrame(data=np.random.randint(1, 10, size=(2,3)))
pd.concat([some_X, some_X], axis=0).reset_index(drop=True)

Unnamed: 0,0,1,2
0,6,1,9
1,3,3,5
2,6,1,9
3,3,3,5


# Hyperparameters Tuning

## XGBoost

In [35]:
def objective_xgb(trial, X, y, X_original, y_original):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 0.00001, 0.3),
        'subsample': trial.suggest_float('subsample', 0.2, 1.0, step=0.05),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 1.0, step=0.05),
        'early_stopping_rounds': trial.suggest_int("early_stoppig_rounds", 40, 100)
    }
    # we're gonna train on the combined dataset but, we'll only calculate the validation score only on comp data

    N_FOLDS = 5
    all_scores = np.zeros(N_FOLDS)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=1337)

    for fold_id, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        # for training we'll use data from both datasets
        if X_original is not None:
            X_tr = pd.concat([X_tr, X_original], axis=0)
            y_tr = pd.concat([y_tr, y_original], axis=0)
        
        model = xgb.XGBClassifier(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
        
        y_pred = model.predict_proba(X_val)[:, 1]
                
        auc = roc_auc_score(y_val, y_pred)        
        all_scores[fold_id] = auc
    
    avg_auc = np.mean(all_scores)
    
    print(f"Avg AUC: {avg_auc}")
    
    return avg_auc

In [36]:
study_xgb = optuna.create_study(study_name="xgboost_tuning", direction="maximize")
func = lambda trial: objective_xgb(trial, X_comp, y_comp, X_original, y_original)
study_xgb.optimize(func, n_trials=30)

[32m[I 2023-01-21 08:16:17,618][0m A new study created in memory with name: xgboost_tuning[0m
[32m[I 2023-01-21 08:16:23,921][0m Trial 0 finished with value: 0.858027027027027 and parameters: {'n_estimators': 288, 'max_depth': 5, 'learning_rate': 0.025159214649555714, 'min_child_weight': 9, 'gamma': 0.0015314413576502184, 'subsample': 0.75, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 64}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.858027027027027


[32m[I 2023-01-21 08:16:26,818][0m Trial 1 finished with value: 0.8516515116811727 and parameters: {'n_estimators': 89, 'max_depth': 5, 'learning_rate': 0.053510979109602334, 'min_child_weight': 4, 'gamma': 0.006303644080049534, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 90}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.8516515116811727


[32m[I 2023-01-21 08:16:29,934][0m Trial 2 finished with value: 0.8550784470911589 and parameters: {'n_estimators': 166, 'max_depth': 3, 'learning_rate': 0.043210308054627326, 'min_child_weight': 1, 'gamma': 0.009719966460524504, 'subsample': 0.9000000000000001, 'colsample_bytree': 0.35000000000000003, 'early_stoppig_rounds': 58}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.8550784470911589


[32m[I 2023-01-21 08:16:31,440][0m Trial 3 finished with value: 0.8530913307375171 and parameters: {'n_estimators': 63, 'max_depth': 5, 'learning_rate': 0.14695701282292378, 'min_child_weight': 9, 'gamma': 0.021403514320537963, 'subsample': 0.5, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 47}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.8530913307375171


[32m[I 2023-01-21 08:16:33,268][0m Trial 4 finished with value: 0.8446856390288593 and parameters: {'n_estimators': 86, 'max_depth': 10, 'learning_rate': 0.028530836957198713, 'min_child_weight': 6, 'gamma': 0.0015029063704172841, 'subsample': 0.25, 'colsample_bytree': 0.30000000000000004, 'early_stoppig_rounds': 50}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.8446856390288593


[32m[I 2023-01-21 08:16:37,906][0m Trial 5 finished with value: 0.8555108795235913 and parameters: {'n_estimators': 207, 'max_depth': 10, 'learning_rate': 0.06067368505629897, 'min_child_weight': 4, 'gamma': 0.08362456540346819, 'subsample': 0.8, 'colsample_bytree': 0.35000000000000003, 'early_stoppig_rounds': 49}. Best is trial 0 with value: 0.858027027027027.[0m


Avg AUC: 0.8555108795235913


[32m[I 2023-01-21 08:16:40,675][0m Trial 6 finished with value: 0.8598249541914796 and parameters: {'n_estimators': 217, 'max_depth': 3, 'learning_rate': 0.24855334852631575, 'min_child_weight': 6, 'gamma': 0.0028316894686862045, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.7, 'early_stoppig_rounds': 66}. Best is trial 6 with value: 0.8598249541914796.[0m


Avg AUC: 0.8598249541914796


[32m[I 2023-01-21 08:16:44,998][0m Trial 7 finished with value: 0.8391390288593679 and parameters: {'n_estimators': 234, 'max_depth': 2, 'learning_rate': 0.022935332246166857, 'min_child_weight': 2, 'gamma': 0.0011198509184436913, 'subsample': 1.0, 'colsample_bytree': 0.95, 'early_stoppig_rounds': 95}. Best is trial 6 with value: 0.8598249541914796.[0m


Avg AUC: 0.8391390288593679


[32m[I 2023-01-21 08:16:48,199][0m Trial 8 finished with value: 0.8511449839670178 and parameters: {'n_estimators': 131, 'max_depth': 5, 'learning_rate': 0.02203476204223855, 'min_child_weight': 5, 'gamma': 0.0066941780537012035, 'subsample': 0.35000000000000003, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 52}. Best is trial 6 with value: 0.8598249541914796.[0m


Avg AUC: 0.8511449839670178


[32m[I 2023-01-21 08:16:52,488][0m Trial 9 finished with value: 0.8612724461749884 and parameters: {'n_estimators': 251, 'max_depth': 2, 'learning_rate': 0.07130422097589852, 'min_child_weight': 10, 'gamma': 2.258041733018634e-05, 'subsample': 1.0, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 95}. Best is trial 9 with value: 0.8612724461749884.[0m


Avg AUC: 0.8612724461749884


[32m[I 2023-01-21 08:17:00,316][0m Trial 10 finished with value: 0.8436846655978012 and parameters: {'n_estimators': 276, 'max_depth': 8, 'learning_rate': 0.01199388123091291, 'min_child_weight': 10, 'gamma': 1.208330708737309e-05, 'subsample': 1.0, 'colsample_bytree': 0.55, 'early_stoppig_rounds': 78}. Best is trial 9 with value: 0.8612724461749884.[0m


Avg AUC: 0.8436846655978012


[32m[I 2023-01-21 08:17:03,183][0m Trial 11 finished with value: 0.8652509161704078 and parameters: {'n_estimators': 236, 'max_depth': 2, 'learning_rate': 0.28511990072975674, 'min_child_weight': 7, 'gamma': 7.335563659356346e-05, 'subsample': 0.6000000000000001, 'colsample_bytree': 0.65, 'early_stoppig_rounds': 75}. Best is trial 11 with value: 0.8652509161704078.[0m


Avg AUC: 0.8652509161704078


[32m[I 2023-01-21 08:17:07,031][0m Trial 12 finished with value: 0.8606426935409985 and parameters: {'n_estimators': 254, 'max_depth': 2, 'learning_rate': 0.1254651585874934, 'min_child_weight': 8, 'gamma': 2.3335230745442562e-05, 'subsample': 0.4, 'colsample_bytree': 0.55, 'early_stoppig_rounds': 81}. Best is trial 11 with value: 0.8652509161704078.[0m


Avg AUC: 0.8606426935409985


[32m[I 2023-01-21 08:17:09,396][0m Trial 13 finished with value: 0.8667127805771873 and parameters: {'n_estimators': 177, 'max_depth': 3, 'learning_rate': 0.2814017544513701, 'min_child_weight': 8, 'gamma': 0.00010877835071630076, 'subsample': 0.75, 'colsample_bytree': 0.2, 'early_stoppig_rounds': 79}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8667127805771873


[32m[I 2023-01-21 08:17:12,130][0m Trial 14 finished with value: 0.8473616010077876 and parameters: {'n_estimators': 175, 'max_depth': 7, 'learning_rate': 0.29377851941999633, 'min_child_weight': 7, 'gamma': 0.00016145967633275112, 'subsample': 0.7, 'colsample_bytree': 0.2, 'early_stoppig_rounds': 76}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8473616010077876


[32m[I 2023-01-21 08:17:15,170][0m Trial 15 finished with value: 0.8560675103069171 and parameters: {'n_estimators': 192, 'max_depth': 4, 'learning_rate': 0.1675981865397774, 'min_child_weight': 8, 'gamma': 0.00015225953315582452, 'subsample': 0.5, 'colsample_bytree': 0.7, 'early_stoppig_rounds': 87}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8560675103069171


[32m[I 2023-01-21 08:17:17,746][0m Trial 16 finished with value: 0.8627240036646816 and parameters: {'n_estimators': 143, 'max_depth': 3, 'learning_rate': 0.10116601604383826, 'min_child_weight': 7, 'gamma': 0.0002450948345539835, 'subsample': 0.8500000000000001, 'colsample_bytree': 0.2, 'early_stoppig_rounds': 73}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8627240036646816


[32m[I 2023-01-21 08:17:19,776][0m Trial 17 finished with value: 0.8475877805771873 and parameters: {'n_estimators': 135, 'max_depth': 7, 'learning_rate': 0.21016077584390042, 'min_child_weight': 7, 'gamma': 5.893248010291727e-05, 'subsample': 0.7, 'colsample_bytree': 0.65, 'early_stoppig_rounds': 40}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8475877805771873


[32m[I 2023-01-21 08:17:23,607][0m Trial 18 finished with value: 0.8627462207970682 and parameters: {'n_estimators': 224, 'max_depth': 4, 'learning_rate': 0.09769831807027829, 'min_child_weight': 5, 'gamma': 0.0004736430605951253, 'subsample': 0.5, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 85}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8627462207970682


[32m[I 2023-01-21 08:17:26,273][0m Trial 19 finished with value: 0.8614450297755383 and parameters: {'n_estimators': 187, 'max_depth': 4, 'learning_rate': 0.20815876898495758, 'min_child_weight': 8, 'gamma': 5.5882173005942625e-05, 'subsample': 0.65, 'colsample_bytree': 0.45, 'early_stoppig_rounds': 68}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8614450297755383


[32m[I 2023-01-21 08:17:28,263][0m Trial 20 finished with value: 0.8353184264773248 and parameters: {'n_estimators': 247, 'max_depth': 2, 'learning_rate': 0.2851390712244482, 'min_child_weight': 3, 'gamma': 6.59430640961717e-05, 'subsample': 0.2, 'colsample_bytree': 0.6000000000000001, 'early_stoppig_rounds': 61}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8353184264773248


[32m[I 2023-01-21 08:17:32,534][0m Trial 21 finished with value: 0.8554659299129638 and parameters: {'n_estimators': 217, 'max_depth': 4, 'learning_rate': 0.08245919703216459, 'min_child_weight': 5, 'gamma': 0.00041576811502454724, 'subsample': 0.45, 'colsample_bytree': 0.8500000000000001, 'early_stoppig_rounds': 84}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8554659299129638


[32m[I 2023-01-21 08:17:35,735][0m Trial 22 finished with value: 0.8625792487402656 and parameters: {'n_estimators': 159, 'max_depth': 3, 'learning_rate': 0.12164028361607034, 'min_child_weight': 6, 'gamma': 0.0007683560535068322, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 100}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8625792487402656


[32m[I 2023-01-21 08:17:38,129][0m Trial 23 finished with value: 0.8604067796610171 and parameters: {'n_estimators': 228, 'max_depth': 4, 'learning_rate': 0.19553630839643849, 'min_child_weight': 4, 'gamma': 0.00040522143156822375, 'subsample': 0.35000000000000003, 'colsample_bytree': 0.75, 'early_stoppig_rounds': 72}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8604067796610171


[32m[I 2023-01-21 08:17:42,191][0m Trial 24 finished with value: 0.8528273018781494 and parameters: {'n_estimators': 266, 'max_depth': 6, 'learning_rate': 0.09595003356975276, 'min_child_weight': 7, 'gamma': 9.484859297183919e-05, 'subsample': 0.65, 'colsample_bytree': 0.55, 'early_stoppig_rounds': 80}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8528273018781494


[32m[I 2023-01-21 08:17:45,639][0m Trial 25 finished with value: 0.8545085890975722 and parameters: {'n_estimators': 296, 'max_depth': 3, 'learning_rate': 0.15678529664430046, 'min_child_weight': 9, 'gamma': 0.0004635265661787456, 'subsample': 0.55, 'colsample_bytree': 0.9000000000000001, 'early_stoppig_rounds': 87}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8545085890975722


[32m[I 2023-01-21 08:17:49,417][0m Trial 26 finished with value: 0.8573894296839212 and parameters: {'n_estimators': 206, 'max_depth': 2, 'learning_rate': 0.04192212530258929, 'min_child_weight': 5, 'gamma': 0.2750254162001188, 'subsample': 0.8, 'colsample_bytree': 0.65, 'early_stoppig_rounds': 75}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8573894296839212


[32m[I 2023-01-21 08:17:52,488][0m Trial 27 finished with value: 0.8531299816765918 and parameters: {'n_estimators': 196, 'max_depth': 4, 'learning_rate': 0.23111793245313642, 'min_child_weight': 8, 'gamma': 3.614551636776944e-05, 'subsample': 0.9000000000000001, 'colsample_bytree': 1.0, 'early_stoppig_rounds': 84}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8531299816765918


[32m[I 2023-01-21 08:17:56,029][0m Trial 28 finished with value: 0.8544667315620705 and parameters: {'n_estimators': 234, 'max_depth': 6, 'learning_rate': 0.12000050538735188, 'min_child_weight': 3, 'gamma': 0.0001586766513144013, 'subsample': 0.7, 'colsample_bytree': 0.8, 'early_stoppig_rounds': 70}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8544667315620705


[32m[I 2023-01-21 08:17:58,077][0m Trial 29 finished with value: 0.8538826729271645 and parameters: {'n_estimators': 110, 'max_depth': 3, 'learning_rate': 0.29844401791874936, 'min_child_weight': 6, 'gamma': 1.492083516589084e-05, 'subsample': 0.75, 'colsample_bytree': 0.4, 'early_stoppig_rounds': 93}. Best is trial 13 with value: 0.8667127805771873.[0m


Avg AUC: 0.8538826729271645


In [37]:
study_xgb.best_value

0.8667127805771873

## INSIGHTS:
* With 10 fold cv and target encoding, we achieve a best value of 0.879. But i feel like 10 cv might be too much for such small dataset in that it's way more prone to randoness than a 5 fold is
* With 5 fold cv and target encoding, we achieve a best value of 0.87259 or 0.873

Although the pbL is just luck based casino game at this point, we'll still submit using the best params to make sure we're on the right path with such cross_validation technique. The last time i tried using this technique, it resulted in severe overfitting because idk how i somehow messed something up

In [38]:
study_xgb.best_params

{'n_estimators': 177,
 'max_depth': 3,
 'learning_rate': 0.2814017544513701,
 'min_child_weight': 8,
 'gamma': 0.00010877835071630076,
 'subsample': 0.75,
 'colsample_bytree': 0.2,
 'early_stoppig_rounds': 79}

In [39]:
xgb_params = {'n_estimators': 177,
             'max_depth': 3,
             'learning_rate': 0.2814,
             'min_child_weight': 8,
             'gamma': 0.0001,
             'subsample': 0.75,
             'colsample_bytree': 0.2,
             'early_stoppig_rounds': 79}

In [40]:
X_train_fr, X_val, y_train_fr, y_val = train_test_split(X_comp, y_comp, test_size=0.1, shuffle=True, random_state=1337,
                                                        stratify=y_comp)

In [41]:
X_train_fr = pd.concat([X_train_fr, X_original])
y_train_fr = pd.concat([y_train_fr, y_original])

In [42]:
xgb_tuned_clf = xgb.XGBClassifier(**xgb_params)
xgb_tuned_clf.fit(X_train_fr, y_train_fr, eval_set=[(X_val, y_val)], verbose=False)

Parameters: { "early_stoppig_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.2,
              early_stoppig_rounds=79, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=0.0001,
              gpu_id=-1, grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.2814, max_bin=256,
              max_cat_to_onehot=4, max_delta_step=0, max_depth=3, max_leaves=0,
              min_child_weight=8, missing=nan, monotone_constraints='()',
              n_estimators=177, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [43]:
len(X_test) == len(test)

True

In [44]:
xgb_tuned_preds = xgb_tuned_clf.predict_proba(X_test)[:, 1]

In [46]:
# non-overfitting predictions
submission = pd.DataFrame({"id": test_idx, "Attrition": xgb_tuned_preds})
submission.head()

Unnamed: 0,id,Attrition
0,1677,0.093922
1,1678,0.051655
2,1679,0.040405
3,1680,0.133737
4,1681,0.819291


In [86]:
# # non-overfitting predictions
# submission = pd.DataFrame({"id": test_idx, "Attrition": xgb_tuned_preds})
# submission.head()

Unnamed: 0,id,Attrition
0,1677,0.196305
1,1678,0.043251
2,1679,0.010886
3,1680,0.082801
4,1681,0.132661


In [41]:
# # non-overfitting predictions
# submission = pd.DataFrame({"id": test_idx, "Attrition": xgb_tuned_preds})
# submission.head()

Unnamed: 0,id,Attrition
0,1677,0.40117
1,1678,0.117693
2,1679,0.000777
3,1680,0.015711
4,1681,0.295577


In [42]:
# OVERFITTED PREDICTIONS
# submission = pd.DataFrame({"id": test_idx, "Attrition": xgb_tuned_preds})
# submission.head()

Unnamed: 0,id,Attrition
0,1677,0.021201
1,1678,0.001126
2,1679,1.7e-05
3,1680,0.000405
4,1681,0.947035


In [47]:
submission.to_csv("submission.csv", index=False)