In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, VotingClassifier
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn import metrics


In [2]:
# Import original versions

train_set = pd.read_csv('train.csv')
test_set = pd.read_csv('test.csv')

In [3]:
# Make a copy for feature engineering

X_train = train_set.drop(columns=['is_promoted'])
X_test = pd.read_csv('test.csv')
y_train = train_set['is_promoted']

# For submission, import template

submission = pd.read_csv('sample_submission.csv')

# <center>EDA<center>

In [None]:
X_train.info()
X_test.info()

In [None]:
num_columns = ['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met >80%', 'awards_won?', 'avg_training_score']
cat_columns = ['department', 'region', 'education', 'gender', 'recruitment_channel']

In [None]:
X_train.describe()

In [None]:
X_test.describe()

### Observations:
* Null values found in:
    * Education (train: 2409, test: 1034)
    * previous_year_rating (train: 4124, test: 1812)

In [None]:
X_train.head()

In [None]:
X_test.head()

In [None]:
y_train

In [None]:
for column in num_columns:
    fig, ax = plt.subplots()
    n, bins, patches = ax.hist(X_train[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {column}')
    
    # Add frequency labels above each bar
    for i in range(len(patches)):
        x = patches[i].get_x() + patches[i].get_width() / 2
        y = patches[i].get_height()
        ax.text(x, y, f'{int(y)}', ha='center', va='bottom')
    
plt.show()

In [None]:
sns.set(style='whitegrid', font_scale=1.2)
# Create bar charts for each categorical column
for col in cat_columns:
    plt.figure(figsize=(10, 5))
    value_counts = X_train[col].value_counts()
    ax = value_counts.plot(kind='bar', color='skyblue')

    # Add frequency text above each bar
    for i, v in enumerate(value_counts):
        ax.text(i, v, str(v), ha='center', va='bottom', fontweight='bold')

    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.title(f'Bar Chart of {col}')
    plt.tight_layout()
    plt.show()


### Observations
* Education (which contains null values) has no obvious value to impute. Need to consider correlations.
* Previous_year_rating contains null values. Check if these are new joiners by considering length_of_service.
* no_of_trainings and length_of_training is highly skewed to left - needs normalizing
* age is slightly skewed to the left
* most people have not won an award (~2% have won)

In [None]:
# Check for correlations using a heatmap to understand the data better
corr_matrix = X_train.corr()
plt.subplots(figsize=(20,15))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.show()

In [None]:
corr_matrix = X_test.corr()
plt.subplots(figsize=(20,15))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Group by 'Category' and calculate the average and std for each group
averages = X_train.groupby('education').mean()
std = X_train.groupby('education').std()

In [None]:
averages

In [None]:
std

### Observations
* Use a combination of age and length_of_service to determine potential education level.
    * 20 < age < 26 = Below Secondary
    * 26 < age < 36 = Bachelor's
    * 36 < age < max = Master's & above

# <center>Feature Engineering<center>

In [4]:
X_train = X_train.drop('employee_id', axis=1)
X_test = X_test.drop('employee_id', axis=1)

In [None]:
# Function to impute education based on age
def impute_education(row):
    age = row['age']
    education = row['education']
    
    if education not in ["Below Secondary", "Bachelor's", "Master's & Above"]:
        if 0 < age <= 26:
            return "Below Secondary"
        elif 26 < age <= 36:
            return "Bachelor's"
        elif 36 < age:
            return "Master's & Above"
    
    return education
    
# Apply the function and update the 'education' column
X_train['education'] = X_train.apply(impute_education, axis=1)
X_test['education'] = X_test.apply(impute_education, axis=1)


In [5]:
# Impute education by forward/backward filling
X_train['education'] = X_train['education'].ffill().bfill()
X_test['education'] = X_test['education'].ffill().bfill()

In [None]:
X_train.head()

In [None]:
X_train[X_train['previous_year_rating'].isna() == True].describe()

### Observation
* previous_year_rating is NaN when length_of_service is 1. 
* Try to impute modal value of 3
* Try to impute 0

In [6]:
# Impute value of 0 for 'previous_year_rating'
X_train['previous_year_rating'] = X_train['previous_year_rating'].fillna(0)
X_test['previous_year_rating'] = X_test['previous_year_rating'].fillna(0)

In [7]:
# Create new column to seperate new starters from experienced
X_train['New Starter'] = X_train['previous_year_rating'].apply(lambda x: 'New Starter' if x == 0 else 'Experienced')
X_test['New Starter'] = X_test['previous_year_rating'].apply(lambda x: 'New Starter' if x == 0 else 'Experienced')


In [None]:
# Remove "region_" from each of the region value
X_train['region'] = X_train['region'].str[7:].astype(int)
X_test['region'] = X_test['region'].str[7:].astype(int)

In [None]:
sns.distplot(X_train['age'])


In [8]:
X_train['age'] = pd.cut( x=X_train['age'], bins=[20, 29, 39, 49], labels=['20', '30', '40'] )
X_test['age']  = pd.cut( x=X_test['age'], bins=[20, 29, 39, 49],  labels=['20', '30', '40'] )

### Data Encoding

In [9]:
# get dummies
all_data = pd.concat([X_train, X_test])
all_data = pd.get_dummies(all_data)
all_col = all_data.columns

cutoff = X_train.shape[0]

X_train = all_data[0:cutoff].reset_index(drop=True)
X_test = all_data[cutoff:].reset_index(drop=True)

### Skewed Data

In [None]:
skewed_columns = ['length_of_service', 'age']

In [None]:
# Apply log transformation
X_train[skewed_columns] = np.log(X_train[skewed_columns])
X_test[skewed_columns] = np.log(X_test[skewed_columns])

#robust_scaler = RobustScaler()
#X_train['no_of_trainings'] = robust_scaler.fit_transform(X_train[['no_of_trainings']])
#X_test['no_of_trainings'] = robust_scaler.fit_transform(X_test[['no_of_trainings']])

In [None]:
for column in ['no_of_trainings', 'length_of_service', 'age']:
    fig, ax = plt.subplots()
    n, bins, patches = ax.hist(X_train[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {column}')
    
    # Add frequency labels above each bar
    for i in range(len(patches)):
        x = patches[i].get_x() + patches[i].get_width() / 2
        y = patches[i].get_height()
        ax.text(x, y, f'{int(y)}', ha='center', va='bottom')
    
plt.show()

### Scale

In [10]:
# To scale values with robust scaler
X_train = RobustScaler().fit_transform(X_train)
X_test = RobustScaler().fit_transform(X_test)


In [None]:
## SCALE VALUES ##

scaler_train = StandardScaler()
scaler_train.fit(X_train)
X_train = pd.DataFrame(scaler_train.transform(X_train), columns = all_col)

scaler_test = StandardScaler()
scaler_test.fit(X_test)
X_test = pd.DataFrame(scaler_test.transform(X_test), columns = all_col)

In [None]:
X_train.head()

In [None]:
for column in num_columns:
    fig, ax = plt.subplots()
    n, bins, patches = ax.hist(X_train[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Histogram of {column}')
    
    # Add frequency labels above each bar
    for i in range(len(patches)):
        x = patches[i].get_x() + patches[i].get_width() / 2
        y = patches[i].get_height()
        ax.text(x, y, f'{int(y)}', ha='center', va='bottom')
    
plt.show()

# <center>Model Selection<center>

In [11]:
# Split the data for testing
X_0, X_1, y_0, y_1 = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### KNN Classifier

In [None]:
# Create a KNN classifier
knn_classifier = KNeighborsClassifier()

In [None]:
# Define the hyperparameter grid for the grid search
param_grid = {
    'n_neighbors': [1, 2, 3, 5, 7], 
    'weights': ['uniform', 'distance']
}
grid_search_knn = GridSearchCV(knn_classifier, param_grid, scoring='f1', cv=5, n_jobs=-1)

In [None]:
# Fit the GridSearchCV object on the training data
grid_search_knn.fit(X_0, y_0)

In [None]:
# Get the best parameters and best estimator from the grid search
best_params_knn = grid_search_knn.best_params_
best_estimator_knn = grid_search_knn.best_estimator_

In [None]:
# Predict labels on the test set using the best estimator
y_pred_knn = best_estimator_knn.predict(X_1)

In [None]:
# Calculate the F1 score
f1_knn = f1_score(y_1, y_pred_knn, average='binary')

print("Best Parameters:", best_params_knn)
print("Best F1 Score:", f1_knn)

### SVC

In [None]:
# Create an SVC classifier
svc_classifier = SVC()

In [None]:
# Define the hyperparameter grid for the grid search
param_grid_svc = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly'],  # Example kernel options
}
grid_search_svc = GridSearchCV(svc_classifier, param_grid_svc, scoring='f1', cv=3, n_jobs=-1)

In [None]:
# Fit the GridSearchCV object on the training data
grid_search_svc.fit(X_0, y_0)

In [None]:
# Get the best parameters and best estimator from the grid search
best_params_svc = grid_search_svc.best_params_
best_estimator_svc = grid_search_svc.best_estimator_

In [None]:
# Predict labels on the test set using the best estimator
y_pred_svc = best_estimator_svc.predict(X_1)

In [None]:
# Calculate the F1 score
f1_svc = f1_score(y_1, y_pred_svc, average='binary')

print("Best Parameters:", best_params_svc)
print("Best F1 Score:", f1_svc)

### GradientBoostingClassifier

In [None]:
# Create a GradientBoostingClassifier
gb_classifier = GradientBoostingClassifier()

In [None]:
# Define the hyperparameter grid for the grid search
param_grid_gbc = {
    'n_estimators': [400, 500, 600, 700],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [4, 5],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', 'log2']
}

grid_search_gbc = GridSearchCV(gb_classifier, param_grid_gbc, scoring='f1', cv=4, n_jobs=-1, verbose=2)


In [None]:
# Fit the GridSearchCV object on the training data
grid_search_gbc.fit(X_0, y_0)

In [None]:
# Get the best parameters and best estimator from the grid search
best_params_gbc = grid_search_gbc.best_params_
best_estimator_gbc = grid_search_gbc.best_estimator_

In [None]:
# Predict labels on the test set using the best estimator
y_pred_gbc = best_estimator_gbc.predict(X_1)

In [None]:
# Calculate the F1 score
f1_gbc = f1_score(y_1, y_pred_gbc, average='binary')

print("Best Parameters:", best_params_gbc)
print("Best F1 Score:", f1_gbc)

### CatBoostClassifier

In [None]:
param_grid = {
    'iterations': [300, 400, 500],  # Number of boosting iterations
    'depth': [4, 6, 8],             # Maximum depth of the trees
    'learning_rate': [0.01, 0.1, 0.2],   # Learning rate
}

In [None]:
CBC = CatBoostClassifier()

In [None]:
grid_search_CBC = GridSearchCV(
    estimator=CBC,   # CatBoost model
    param_grid=param_grid, # Grid of hyperparameters
    scoring='f1',   # Choose an appropriate scoring metric
    cv=5,                 # Number of cross-validation folds
    n_jobs=-1             # Number of CPU cores to use (-1 for all available cores)
)

In [None]:
grid_search_CBC.fit(X_0, y_0)

In [None]:
# Get the best parameters and best estimator from the grid search
best_params_CBC = grid_search.best_params_
best_estimator_CBC = grid_search.best_estimator_

In [None]:
# Predict labels on the test set using the best estimator
y_pred_cbc = best_estimator.predict(X_1)

In [None]:
# Calculate the F1 score
f1_cbc = f1_score(y_1, y_pred_cbc, average='binary')
print("Best Parameters:", best_params)

print("Best F1 Score:", f1_cbc)

## Ensemble 

In [67]:
# Populate best parameters of each model
params = {}

### Get best params for XGBClassifier

In [93]:
xgb_model = XGBClassifier(random_state=42)

In [94]:
param_grid_XGB = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.7, 0.8, 0.9, 1.0],
}


In [95]:
grid_search_XGB = GridSearchCV(estimator=xgb_model, param_grid=param_grid_XGB, cv=3, scoring='f1', n_jobs=-1)

In [96]:
grid_search_XGB.fit(X_0, y_0)

GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                   

In [97]:
# Get the best parameters and best estimator from the grid search
best_params_XGB = grid_search_XGB.best_params_
best_estimator_XGB = grid_search_XGB.best_estimator_

In [98]:
# Predict labels on the test set using the best estimator
y_pred_XGB = best_estimator_XGB.predict(X_1)

In [99]:
# Calculate the F1 score
f1_xgb = f1_score(y_1, y_pred_XGB, average='binary')
print("Best Parameters:", best_params_XGB)

print("Best F1 Score:", f1_xgb)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 400, 'subsample': 0.7}
Best F1 Score: 0.4972549019607843


In [100]:
params['XGBoost'] = best_params_XGB

### Get best params for CatBoostClassifier

In [101]:
param_grid = {
    'iterations': [300, 400, 500],  
    'depth': [4, 5, 6],           
    'learning_rate': [0.1, 0.15, 0.2],   
}

In [102]:
CBC = CatBoostClassifier()

In [103]:
grid_search_CBC = GridSearchCV(
    estimator=CBC,   # CatBoost model
    param_grid=param_grid, # Grid of hyperparameters
    scoring='f1',   # Choose an appropriate scoring metric
    cv=5,                 # Number of cross-validation folds
    n_jobs=-1             # Number of CPU cores to use (-1 for all available cores)
)

In [104]:
grid_search_CBC.fit(X_0, y_0)

0:	learn: 0.5294600	total: 41.6ms	remaining: 20.8s
1:	learn: 0.4355253	total: 53.3ms	remaining: 13.3s
2:	learn: 0.3704886	total: 64.4ms	remaining: 10.7s
3:	learn: 0.3252583	total: 76.2ms	remaining: 9.45s
4:	learn: 0.2929038	total: 89.5ms	remaining: 8.86s
5:	learn: 0.2759877	total: 102ms	remaining: 8.38s
6:	learn: 0.2494633	total: 115ms	remaining: 8.07s
7:	learn: 0.2437808	total: 127ms	remaining: 7.8s
8:	learn: 0.2335455	total: 138ms	remaining: 7.53s
9:	learn: 0.2224340	total: 151ms	remaining: 7.39s
10:	learn: 0.2152240	total: 164ms	remaining: 7.28s
11:	learn: 0.2104613	total: 176ms	remaining: 7.17s
12:	learn: 0.2007722	total: 188ms	remaining: 7.05s
13:	learn: 0.1991325	total: 199ms	remaining: 6.92s
14:	learn: 0.1975317	total: 211ms	remaining: 6.81s
15:	learn: 0.1956581	total: 222ms	remaining: 6.72s
16:	learn: 0.1946628	total: 236ms	remaining: 6.71s
17:	learn: 0.1917490	total: 250ms	remaining: 6.68s
18:	learn: 0.1907555	total: 263ms	remaining: 6.65s
19:	learn: 0.1903480	total: 275ms	rem

168:	learn: 0.1514337	total: 2.2s	remaining: 4.32s
169:	learn: 0.1513690	total: 2.22s	remaining: 4.3s
170:	learn: 0.1513191	total: 2.23s	remaining: 4.29s
171:	learn: 0.1512977	total: 2.24s	remaining: 4.28s
172:	learn: 0.1512278	total: 2.25s	remaining: 4.26s
173:	learn: 0.1511708	total: 2.27s	remaining: 4.25s
174:	learn: 0.1510057	total: 2.28s	remaining: 4.23s
175:	learn: 0.1508733	total: 2.29s	remaining: 4.22s
176:	learn: 0.1508418	total: 2.3s	remaining: 4.2s
177:	learn: 0.1507447	total: 2.32s	remaining: 4.19s
178:	learn: 0.1507159	total: 2.33s	remaining: 4.18s
179:	learn: 0.1506446	total: 2.34s	remaining: 4.17s
180:	learn: 0.1505105	total: 2.36s	remaining: 4.16s
181:	learn: 0.1504347	total: 2.37s	remaining: 4.14s
182:	learn: 0.1503267	total: 2.38s	remaining: 4.13s
183:	learn: 0.1502742	total: 2.4s	remaining: 4.12s
184:	learn: 0.1502052	total: 2.41s	remaining: 4.1s
185:	learn: 0.1501349	total: 2.42s	remaining: 4.09s
186:	learn: 0.1500872	total: 2.44s	remaining: 4.08s
187:	learn: 0.1500

340:	learn: 0.1411578	total: 4.39s	remaining: 2.05s
341:	learn: 0.1410722	total: 4.4s	remaining: 2.03s
342:	learn: 0.1409918	total: 4.41s	remaining: 2.02s
343:	learn: 0.1409770	total: 4.43s	remaining: 2.01s
344:	learn: 0.1409147	total: 4.44s	remaining: 1.99s
345:	learn: 0.1409059	total: 4.45s	remaining: 1.98s
346:	learn: 0.1408662	total: 4.46s	remaining: 1.97s
347:	learn: 0.1408150	total: 4.47s	remaining: 1.95s
348:	learn: 0.1407409	total: 4.49s	remaining: 1.94s
349:	learn: 0.1407154	total: 4.5s	remaining: 1.93s
350:	learn: 0.1406553	total: 4.51s	remaining: 1.92s
351:	learn: 0.1406047	total: 4.53s	remaining: 1.9s
352:	learn: 0.1405907	total: 4.54s	remaining: 1.89s
353:	learn: 0.1405826	total: 4.55s	remaining: 1.88s
354:	learn: 0.1405534	total: 4.56s	remaining: 1.86s
355:	learn: 0.1405431	total: 4.57s	remaining: 1.85s
356:	learn: 0.1405073	total: 4.59s	remaining: 1.84s
357:	learn: 0.1404290	total: 4.6s	remaining: 1.82s
358:	learn: 0.1403926	total: 4.61s	remaining: 1.81s
359:	learn: 0.14

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x00000209408D7130>,
             n_jobs=-1,
             param_grid={'depth': [4, 5, 6], 'iterations': [300, 400, 500],
                         'learning_rate': [0.1, 0.15, 0.2]},
             scoring='f1')

In [105]:
# Get the best parameters and best estimator from the grid search
best_params_CBC = grid_search_CBC.best_params_
best_estimator_CBC = grid_search_CBC.best_estimator_

In [106]:
# Predict labels on the test set using the best estimator
y_pred_cbc = best_estimator_CBC.predict(X_1)

In [107]:
# Calculate the F1 score
f1_cbc = f1_score(y_1, y_pred_cbc, average='binary')
print("Best Parameters:", best_params_CBC)

print("Best F1 Score:", f1_cbc)

Best Parameters: {'depth': 5, 'iterations': 500, 'learning_rate': 0.15}
Best F1 Score: 0.49564528899445764


In [108]:
params['CatBoost'] = best_params_CBC

### Get best params for LGBMClassifier

In [109]:
lgbm_model = LGBMClassifier(random_state=42)

In [110]:
param_grid_LGBM = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.8, 0.9, 1.0],
}

In [111]:
grid_search_LGBM = GridSearchCV(estimator=lgbm_model, param_grid=param_grid_LGBM, cv=3, scoring='f1', n_jobs=-1)

In [112]:
grid_search_LGBM.fit(X_0, y_0)

GridSearchCV(cv=3, estimator=LGBMClassifier(random_state=42), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.001],
                         'max_depth': [3, 4, 5],
                         'n_estimators': [200, 300, 400, 500],
                         'subsample': [0.8, 0.9, 1.0]},
             scoring='f1')

In [113]:
# Get the best parameters and best estimator from the grid search
best_params_LGBM = grid_search_LGBM.best_params_
best_estimator_LGBM = grid_search_LGBM.best_estimator_

In [114]:
# Predict labels on the test set using the best estimator
y_pred_LGBM = best_estimator_LGBM.predict(X_1)

In [115]:
# Calculate the F1 score
f1_LGBM = f1_score(y_1, y_pred_LGBM, average='binary')
print("Best Parameters:", best_params_LGBM)

print("Best F1 Score:", f1_LGBM)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 500, 'subsample': 0.8}
Best F1 Score: 0.49879711307137126


In [116]:
params['LightGBM'] = best_params_LGBM

### Ensemble

In [117]:
params

{'XGBoost': {'learning_rate': 0.1,
  'max_depth': 6,
  'n_estimators': 400,
  'subsample': 0.7},
 'CatBoost': {'depth': 5, 'iterations': 500, 'learning_rate': 0.15},
 'LightGBM': {'learning_rate': 0.1,
  'max_depth': 4,
  'n_estimators': 500,
  'subsample': 0.8}}

In [118]:
# Fine Tuned Model With-Hyperparameters :
Classifiers = {'XGBoost' : XGBClassifier(learning_rate =0.1,
                                           n_estimators=494,
                                           max_depth=5,
                                           subsample = 0.70,
                                           verbosity = 0,
                                           scale_pos_weight = 2.5,
                                           updater ="grow_histmaker",
                                           base_score  = 0.2),
               'CatBoost' : CatBoostClassifier(learning_rate=0.15,
                                                 n_estimators=494,
                                                 subsample=0.085,
                                                 max_depth=5,
                                                 scale_pos_weight=2.5),
               'LightGBM' : LGBMClassifier(subsample_freq = 2,
                                             objective ="binary",
                                             importance_type = "gain",
                                             verbosity = -1,
                                             max_bin = 60,
                                             num_leaves = 300,
                                             boosting_type = 'dart',
                                             learning_rate=0.15,
                                             n_estimators=494,
                                             max_depth=5,
                                             scale_pos_weight=2.5)
              }

In [119]:
voting_model = VotingClassifier(estimators=[

('XGBoost_Best', list(Classifiers.values())[0]),
('CatBoost_Best', list(Classifiers.values())[1]),
('LightGBM_Best', list(Classifiers.values())[2]),
],
voting='soft',weights=[5,5,5.2])

In [48]:
voting_model = VotingClassifier(estimators=[
('XGBoost_Best', XGBClassifier(list(params['XGBoost']))),
('CatBoost_Best', CatBoostClassifier(list(params['CatBoost']))),
('LightGBM_Best', LGBMClassifier(list(params['LightGBM']))),
],
voting='soft', weights=[5,5,5.2])



In [120]:
voting_model.fit(X_0, y_0)
predictions_of_voting = voting_model.predict_proba(X_1)[::,1]
predictions = [int(round(value)) for value in predictions_of_voting]
# Calculate the F1 score
f1 = f1_score(y_1, predictions, average='binary')
print("Best F1 Score:", f1)

0:	learn: 0.5933452	total: 11.9ms	remaining: 5.87s
1:	learn: 0.5018041	total: 25.1ms	remaining: 6.16s
2:	learn: 0.4670189	total: 37.3ms	remaining: 6.1s
3:	learn: 0.4106083	total: 50.4ms	remaining: 6.17s
4:	learn: 0.3923341	total: 61.6ms	remaining: 6.02s
5:	learn: 0.3799262	total: 72.3ms	remaining: 5.88s
6:	learn: 0.3647184	total: 82.9ms	remaining: 5.77s
7:	learn: 0.3528707	total: 93.5ms	remaining: 5.68s
8:	learn: 0.3390085	total: 104ms	remaining: 5.6s
9:	learn: 0.3367032	total: 114ms	remaining: 5.54s
10:	learn: 0.3288305	total: 125ms	remaining: 5.49s
11:	learn: 0.3258765	total: 136ms	remaining: 5.47s
12:	learn: 0.3132528	total: 148ms	remaining: 5.49s
13:	learn: 0.3123591	total: 158ms	remaining: 5.41s
14:	learn: 0.3078196	total: 170ms	remaining: 5.43s
15:	learn: 0.3069956	total: 182ms	remaining: 5.45s
16:	learn: 0.3030962	total: 195ms	remaining: 5.47s
17:	learn: 0.3022287	total: 205ms	remaining: 5.43s
18:	learn: 0.3008672	total: 219ms	remaining: 5.48s
19:	learn: 0.2980481	total: 233ms	r

164:	learn: 0.2465761	total: 2.01s	remaining: 4.01s
165:	learn: 0.2465253	total: 2.02s	remaining: 4s
166:	learn: 0.2464660	total: 2.03s	remaining: 3.98s
167:	learn: 0.2463299	total: 2.04s	remaining: 3.97s
168:	learn: 0.2462318	total: 2.06s	remaining: 3.95s
169:	learn: 0.2458686	total: 2.07s	remaining: 3.94s
170:	learn: 0.2457775	total: 2.08s	remaining: 3.93s
171:	learn: 0.2457221	total: 2.09s	remaining: 3.92s
172:	learn: 0.2455891	total: 2.11s	remaining: 3.91s
173:	learn: 0.2455242	total: 2.12s	remaining: 3.9s
174:	learn: 0.2453373	total: 2.13s	remaining: 3.89s
175:	learn: 0.2452722	total: 2.15s	remaining: 3.88s
176:	learn: 0.2451704	total: 2.16s	remaining: 3.86s
177:	learn: 0.2450728	total: 2.17s	remaining: 3.85s
178:	learn: 0.2449751	total: 2.18s	remaining: 3.84s
179:	learn: 0.2449404	total: 2.19s	remaining: 3.83s
180:	learn: 0.2448061	total: 2.21s	remaining: 3.81s
181:	learn: 0.2446929	total: 2.22s	remaining: 3.8s
182:	learn: 0.2445987	total: 2.23s	remaining: 3.79s
183:	learn: 0.244

341:	learn: 0.2295392	total: 3.93s	remaining: 1.75s
342:	learn: 0.2295005	total: 3.94s	remaining: 1.74s
343:	learn: 0.2294717	total: 3.95s	remaining: 1.72s
344:	learn: 0.2293758	total: 3.96s	remaining: 1.71s
345:	learn: 0.2292641	total: 3.97s	remaining: 1.7s
346:	learn: 0.2291707	total: 3.98s	remaining: 1.69s
347:	learn: 0.2291332	total: 3.99s	remaining: 1.68s
348:	learn: 0.2290463	total: 4s	remaining: 1.66s
349:	learn: 0.2290200	total: 4.02s	remaining: 1.65s
350:	learn: 0.2289583	total: 4.03s	remaining: 1.64s
351:	learn: 0.2289160	total: 4.04s	remaining: 1.63s
352:	learn: 0.2288879	total: 4.05s	remaining: 1.62s
353:	learn: 0.2288237	total: 4.06s	remaining: 1.6s
354:	learn: 0.2287896	total: 4.07s	remaining: 1.59s
355:	learn: 0.2287563	total: 4.08s	remaining: 1.58s
356:	learn: 0.2286842	total: 4.09s	remaining: 1.57s
357:	learn: 0.2286267	total: 4.1s	remaining: 1.56s
358:	learn: 0.2285547	total: 4.11s	remaining: 1.54s
359:	learn: 0.2285041	total: 4.12s	remaining: 1.53s
360:	learn: 0.2284

In [123]:
voting_model.fit(X_train, y_train)
predictions_of_voting = voting_model.predict_proba(X_test)[::,1]

0:	learn: 0.5908691	total: 11.9ms	remaining: 5.84s
1:	learn: 0.5062188	total: 22.4ms	remaining: 5.51s
2:	learn: 0.4552513	total: 32.6ms	remaining: 5.34s
3:	learn: 0.4340413	total: 41.6ms	remaining: 5.09s
4:	learn: 0.3935237	total: 52.6ms	remaining: 5.14s
5:	learn: 0.3760248	total: 63.1ms	remaining: 5.13s
6:	learn: 0.3497793	total: 74.4ms	remaining: 5.17s
7:	learn: 0.3444429	total: 84.3ms	remaining: 5.12s
8:	learn: 0.3329675	total: 94.3ms	remaining: 5.08s
9:	learn: 0.3280006	total: 104ms	remaining: 5.05s
10:	learn: 0.3214739	total: 115ms	remaining: 5.04s
11:	learn: 0.3122618	total: 125ms	remaining: 5.03s
12:	learn: 0.3110920	total: 134ms	remaining: 4.95s
13:	learn: 0.3054896	total: 144ms	remaining: 4.94s
14:	learn: 0.3043355	total: 156ms	remaining: 4.97s
15:	learn: 0.3018137	total: 167ms	remaining: 4.99s
16:	learn: 0.3002490	total: 179ms	remaining: 5.02s
17:	learn: 0.2947630	total: 190ms	remaining: 5.04s
18:	learn: 0.2942656	total: 203ms	remaining: 5.08s
19:	learn: 0.2929293	total: 215m

171:	learn: 0.2458479	total: 1.96s	remaining: 3.67s
172:	learn: 0.2456964	total: 1.97s	remaining: 3.65s
173:	learn: 0.2456334	total: 1.98s	remaining: 3.65s
174:	learn: 0.2453385	total: 2s	remaining: 3.64s
175:	learn: 0.2452614	total: 2.01s	remaining: 3.63s
176:	learn: 0.2452176	total: 2.02s	remaining: 3.62s
177:	learn: 0.2451565	total: 2.04s	remaining: 3.62s
178:	learn: 0.2450866	total: 2.05s	remaining: 3.61s
179:	learn: 0.2450348	total: 2.06s	remaining: 3.6s
180:	learn: 0.2449592	total: 2.07s	remaining: 3.58s
181:	learn: 0.2449174	total: 2.08s	remaining: 3.57s
182:	learn: 0.2448811	total: 2.1s	remaining: 3.56s
183:	learn: 0.2447876	total: 2.11s	remaining: 3.55s
184:	learn: 0.2446412	total: 2.12s	remaining: 3.54s
185:	learn: 0.2445672	total: 2.13s	remaining: 3.53s
186:	learn: 0.2445021	total: 2.15s	remaining: 3.52s
187:	learn: 0.2443954	total: 2.16s	remaining: 3.52s
188:	learn: 0.2443219	total: 2.17s	remaining: 3.5s
189:	learn: 0.2442738	total: 2.19s	remaining: 3.5s
190:	learn: 0.24421

342:	learn: 0.2326465	total: 3.94s	remaining: 1.74s
343:	learn: 0.2326171	total: 3.95s	remaining: 1.72s
344:	learn: 0.2325705	total: 3.96s	remaining: 1.71s
345:	learn: 0.2325314	total: 3.97s	remaining: 1.7s
346:	learn: 0.2325108	total: 3.98s	remaining: 1.69s
347:	learn: 0.2324140	total: 4s	remaining: 1.68s
348:	learn: 0.2323842	total: 4.01s	remaining: 1.66s
349:	learn: 0.2323797	total: 4.02s	remaining: 1.65s
350:	learn: 0.2322825	total: 4.03s	remaining: 1.64s
351:	learn: 0.2322352	total: 4.04s	remaining: 1.63s
352:	learn: 0.2322075	total: 4.05s	remaining: 1.62s
353:	learn: 0.2321273	total: 4.06s	remaining: 1.61s
354:	learn: 0.2320726	total: 4.07s	remaining: 1.59s
355:	learn: 0.2320436	total: 4.08s	remaining: 1.58s
356:	learn: 0.2319833	total: 4.09s	remaining: 1.57s
357:	learn: 0.2319178	total: 4.1s	remaining: 1.56s
358:	learn: 0.2318082	total: 4.11s	remaining: 1.55s
359:	learn: 0.2317460	total: 4.12s	remaining: 1.53s
360:	learn: 0.2315907	total: 4.13s	remaining: 1.52s
361:	learn: 0.231

In [54]:
predictions_of_voting

array([3.24898320e-01, 6.26814580e-04, 3.19433578e-05, ...,
       1.89628430e-04, 8.70298530e-03, 9.62587209e-01])

# <center>Make Predictions<center>

In [126]:
predictions = [int(round(value)) for value in predictions_of_voting]

In [125]:
predictions = best_estimator_gbc.predict(X_test)

NameError: name 'best_estimator_gbc' is not defined

In [127]:
submission['is_promoted'] = predictions

In [128]:
submission

Unnamed: 0,employee_id,is_promoted
0,8724,0
1,74430,0
2,72255,0
3,38562,0
4,64486,0
...,...,...
23485,53478,0
23486,25600,0
23487,45409,0
23488,1186,0


In [129]:
submission.to_csv('ensemble_Submission.csv', index=False)
