In [1]:
pip install prettytable lazypredict shap imbalanced-learn tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sb
import numpy as np
from matplotlib import pyplot as plt
import io
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error,r2_score, recall_score, precision_score, f1_score
from math import sqrt
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")
from prettytable import PrettyTable
import shap
from lazypredict.Supervised import LazyRegressor
from datetime import datetime
import pickle
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Dataset

### Dataset Importing

In [3]:
emp_past_data = pd.read_csv('past_emp_data.csv',sep=r'\s*,\s*',header=0, engine='python') # removes the spaces in the data too to avoid naming issues (sep=r'\s*,\s*)
emp_past_data.head()

Unnamed: 0,age_category,gender,marital_status,educational_status,total_years_industry,years_work_current_hotel,number_of_years_current_role,department,joined_date,resigned_date
0,20 - 30,Male,Single,Diploma holder,1 - 3 years,1 - 3 years,1 - 3 years,Food and Beverages,24/02/2021,03/03/2022
1,30 - 40,Male,Married with children,Diploma holder,5 - 10 years,1 - 3 years,1 - 3 years,Food and Beverages,19/02/2020,15/03/2022
2,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Food and Beverages,20/03/2020,22/03/2022
3,30 - 40,Male,Married with children,Diploma holder,10 - 15 years,3 - 5 years,3 - 5 years,Maintenance,22/03/2017,02/04/2022
4,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Front office,12/04/2020,06/05/2022


### Dataset Organizing

In [4]:
# summing up the worked days count using resigned date and joined date (resigned date-joined date)
days = []
for index, row in emp_past_data.iterrows():
    resigned_date = datetime.strptime(row['resigned_date'], '%d/%m/%Y')
    joined_date = datetime.strptime(row['joined_date'], '%d/%m/%Y')
    number_of_days = resigned_date-joined_date
    days.append(int(number_of_days.days))
emp_past_data['job_days']=days
emp_past_data =  emp_past_data.drop(columns=['joined_date','resigned_date'],inplace=False)
emp_past_data.head()

Unnamed: 0,age_category,gender,marital_status,educational_status,total_years_industry,years_work_current_hotel,number_of_years_current_role,department,job_days
0,20 - 30,Male,Single,Diploma holder,1 - 3 years,1 - 3 years,1 - 3 years,Food and Beverages,372
1,30 - 40,Male,Married with children,Diploma holder,5 - 10 years,1 - 3 years,1 - 3 years,Food and Beverages,755
2,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Food and Beverages,732
3,30 - 40,Male,Married with children,Diploma holder,10 - 15 years,3 - 5 years,3 - 5 years,Maintenance,1837
4,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Front office,754


## Data Preprocessing

### Dimensional Reduction

*   cardinality validation



In [5]:
#for the categorical attributes
categorical_attributues = emp_past_data.loc[:, ['age_category','gender','marital_status','educational_status','total_years_industry','years_work_current_hotel','number_of_years_current_role','department']]
categorical_attributues.head()

Unnamed: 0,age_category,gender,marital_status,educational_status,total_years_industry,years_work_current_hotel,number_of_years_current_role,department
0,20 - 30,Male,Single,Diploma holder,1 - 3 years,1 - 3 years,1 - 3 years,Food and Beverages
1,30 - 40,Male,Married with children,Diploma holder,5 - 10 years,1 - 3 years,1 - 3 years,Food and Beverages
2,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Food and Beverages
3,30 - 40,Male,Married with children,Diploma holder,10 - 15 years,3 - 5 years,3 - 5 years,Maintenance
4,20 - 30,Female,Single,Diploma holder,,1 - 3 years,1 - 3 years,Front office


In [6]:
categorical_attributues_data_types = pd.DataFrame(emp_past_data[categorical_attributues.columns].dtypes,columns=['Data Type']) # data types
categorical_attributues_cardinality = pd.DataFrame(emp_past_data[categorical_attributues.columns].nunique(),columns=['Cardinality']) # number of unique categories in the column
categorical_attributues_analytics =  categorical_attributues_data_types.join(categorical_attributues_cardinality)
categorical_attributues_analytics

Unnamed: 0,Data Type,Cardinality
age_category,object,3
gender,object,2
marital_status,object,3
educational_status,object,5
total_years_industry,object,6
years_work_current_hotel,object,6
number_of_years_current_role,object,6
department,object,5


since all columns cardinality is greater than one no columns to drop

*   corerelation validation

In [7]:
corr_matrix = emp_past_data.corr().abs()
upper_trangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper_trangle.columns if any(upper_trangle[column] > 0.95)]
print("columns to drop: ",to_drop)

columns to drop:  []


### Mising Values Handling

In [8]:
emp_past_data.isna().sum() # this step shows the number of missing/null values in the dataset

age_category                     0
gender                           0
marital_status                   0
educational_status               1
total_years_industry            22
years_work_current_hotel         0
number_of_years_current_role     0
department                       2
job_days                         0
dtype: int64

*   using data mode filling the missing data slots

https://vitalflux.com/pandas-impute-missing-values-mean-median-mode/




In [9]:
emp_past_data['total_years_industry'] = emp_past_data['total_years_industry'].fillna(emp_past_data['total_years_industry'].mode()[0])
emp_past_data['educational_status'] = emp_past_data['educational_status'].fillna(emp_past_data['educational_status'].mode()[0])
emp_past_data['department'] = emp_past_data['department'].fillna(emp_past_data['department'].mode()[0])
emp_past_data.isna().sum()

age_category                    0
gender                          0
marital_status                  0
educational_status              0
total_years_industry            0
years_work_current_hotel        0
number_of_years_current_role    0
department                      0
job_days                        0
dtype: int64

### Data Encording

In [10]:
# using pandas get_dummies method non-numerical values encords into numerical values based on One-Hot technique
non_numeric_value_cols =['age_category','gender','marital_status','educational_status','total_years_industry','years_work_current_hotel','number_of_years_current_role','department']
encoded_emp_data = pd.DataFrame(pd.get_dummies(emp_past_data, dummy_na=False,columns=non_numeric_value_cols,dtype=np.int64))

encoded_emp_data.dtypes

job_days                                           int64
age_category_20 - 30                               int64
age_category_30 - 40                               int64
age_category_Above 40                              int64
gender_Female                                      int64
gender_Male                                        int64
marital_status_Married with children               int64
marital_status_Married without children            int64
marital_status_Single                              int64
educational_status_A/L passer                      int64
educational_status_Below O/L                       int64
educational_status_Degree holder                   int64
educational_status_Diploma holder                  int64
educational_status_O/L passer                      int64
total_years_industry_1 - 3 years                   int64
total_years_industry_10 - 15 years                 int64
total_years_industry_15 years and above            int64
total_years_industry_3 - 5 year

In [11]:
encoded_emp_data.head()

Unnamed: 0,job_days,age_category_20 - 30,age_category_30 - 40,age_category_Above 40,gender_Female,gender_Male,marital_status_Married with children,marital_status_Married without children,marital_status_Single,educational_status_A/L passer,educational_status_Below O/L,educational_status_Degree holder,educational_status_Diploma holder,educational_status_O/L passer,total_years_industry_1 - 3 years,total_years_industry_10 - 15 years,total_years_industry_15 years and above,total_years_industry_3 - 5 years,total_years_industry_5 - 10 years,total_years_industry_Less than 1 year,years_work_current_hotel_1 - 3 years,years_work_current_hotel_10 - 15 years,years_work_current_hotel_15 years and above,years_work_current_hotel_3 - 5 years,years_work_current_hotel_5 - 10 years,years_work_current_hotel_Less than 1 year,number_of_years_current_role_1 - 3 years,number_of_years_current_role_10 - 15 years,number_of_years_current_role_15 years and above,number_of_years_current_role_3 - 5 years,number_of_years_current_role_5 - 10 years,number_of_years_current_role_Less than 1 year,department_Food and Beverages,department_Front office,department_Housekeeping,department_Maintenance,department_Security
0,372,1,0,0,0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1,755,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
2,732,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
3,1837,0,1,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
4,754,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


### Dupplicate Values Handling

In [12]:
print("before removal dupplication data: ",encoded_emp_data.shape[0])
encoded_emp_data.drop_duplicates(inplace = True)
print("after removal dupplication data: ",encoded_emp_data.shape[0])

before removal dupplication data:  61
after removal dupplication data:  61


### Feature Scaling

*   applying min-max scaling to the dataset

In [13]:
encoded_emp_satisfaction_data_features = encoded_emp_data.drop("job_days", axis=1)
encoded_emp_satisfaction_data_output = encoded_emp_data[["job_days"]]

min_max_scaler_features=MinMaxScaler()
min_max_scaler_features.fit(encoded_emp_satisfaction_data_features)

encoded_balanced_mx_emp_features = pd.DataFrame(min_max_scaler_features.transform(encoded_emp_satisfaction_data_features.to_numpy()),columns=encoded_emp_satisfaction_data_features.columns)

min_max_scaler_output=MinMaxScaler()
min_max_scaler_output.fit(encoded_emp_satisfaction_data_output)

encoded_balanced_mx_emp_output = pd.DataFrame()
encoded_balanced_mx_emp_output['job_days'] = pd.DataFrame(min_max_scaler_output.transform(encoded_emp_satisfaction_data_output.to_numpy()))

encoded_balanced_mx_scaled_emp_data= encoded_balanced_mx_emp_features.join(encoded_balanced_mx_emp_output)
encoded_balanced_mx_scaled_emp_data.head()

Unnamed: 0,age_category_20 - 30,age_category_30 - 40,age_category_Above 40,gender_Female,gender_Male,marital_status_Married with children,marital_status_Married without children,marital_status_Single,educational_status_A/L passer,educational_status_Below O/L,educational_status_Degree holder,educational_status_Diploma holder,educational_status_O/L passer,total_years_industry_1 - 3 years,total_years_industry_10 - 15 years,total_years_industry_15 years and above,total_years_industry_3 - 5 years,total_years_industry_5 - 10 years,total_years_industry_Less than 1 year,years_work_current_hotel_1 - 3 years,years_work_current_hotel_10 - 15 years,years_work_current_hotel_15 years and above,years_work_current_hotel_3 - 5 years,years_work_current_hotel_5 - 10 years,years_work_current_hotel_Less than 1 year,number_of_years_current_role_1 - 3 years,number_of_years_current_role_10 - 15 years,number_of_years_current_role_15 years and above,number_of_years_current_role_3 - 5 years,number_of_years_current_role_5 - 10 years,number_of_years_current_role_Less than 1 year,department_Food and Beverages,department_Front office,department_Housekeeping,department_Maintenance,department_Security,job_days
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.03
1,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.1
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.09
3,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.28
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.1


## Data Splitting

In the below step, data is split into 80:20 ratios. This means 80% of the total data is for training and 20% for testing

In [14]:
encoded_features_train, encoded_features_test , encoded_output_train, encoded_output_test = train_test_split(encoded_emp_data.drop("job_days", axis=1),encoded_emp_data['job_days'],train_size=0.8) # encoded , balanced
mx_features_train, mx_features_test , mx_output_train, mx_output_test = train_test_split(encoded_balanced_mx_scaled_emp_data.drop("job_days", axis=1),encoded_balanced_mx_scaled_emp_data['job_days'],train_size=0.8) # encoded , balanced, sclaed

## Model Selection

*   selection with encoded balanced without scaled data

In [15]:
lazy_regressor = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )
models,predictions = lazy_regressor.fit(encoded_features_train, encoded_features_test, encoded_output_train, encoded_output_test)
models.head(250)

100%|██████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 46.45it/s]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KernelRidge,5.14,-7.29,1563.43,0.01
MLPRegressor,2.8,-2.6,1030.89,0.06
GaussianProcessRegressor,2.77,-2.53,1020.76,0.02
LinearSVR,2.72,-2.43,1006.32,0.0
Lars,2.53,-2.06,949.18,0.01
LassoLars,2.26,-1.52,861.87,0.01
DummyRegressor,2.25,-1.5,857.93,0.0
LGBMRegressor,2.02,-1.04,775.17,0.03
HistGradientBoostingRegressor,2.02,-1.04,775.17,0.06
NuSVR,1.62,-0.23,602.78,0.01


*   selection with min-max scaled data

In [16]:
mx_lazy_regressor = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )
mx_models,mx_predictions = mx_lazy_regressor.fit(mx_features_train, mx_features_test, mx_output_train, mx_output_test)
mx_models.head(150)

 31%|████████████████████████▏                                                     | 13/42 [00:00<00:00, 41.77it/s]

GammaRegressor model failed to execute
Some value(s) of y are out of the valid range for family GammaDistribution


100%|██████████████████████████████████████████████████████████████████████████████| 42/42 [00:00<00:00, 53.46it/s]


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MLPRegressor,1.9,-0.81,0.51,0.03
GaussianProcessRegressor,1.9,-0.8,0.51,0.01
QuantileRegressor,1.71,-0.42,0.45,0.02
LassoLars,1.6,-0.2,0.41,0.02
DummyRegressor,1.6,-0.2,0.41,0.0
ElasticNet,1.6,-0.2,0.41,0.0
Lasso,1.6,-0.2,0.41,0.01
HistGradientBoostingRegressor,1.36,0.29,0.32,0.06
LGBMRegressor,1.36,0.29,0.32,0.02
SVR,1.22,0.56,0.25,0.01


In [17]:
from xgboost import XGBRFRegressor
from sklearn.tree import ExtraTreeRegressor,DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,RandomForestRegressor
from sklearn.linear_model import LassoCV,OrthogonalMatchingPursuitCV,HuberRegressor

In [18]:
ompcv =OrthogonalMatchingPursuitCV()
abr = AdaBoostRegressor()
etr = ExtraTreeRegressor()
hbr = HuberRegressor()
lcv =  LassoCV()
rfr = RandomForestRegressor()
xgbr = XGBRFRegressor()

results_table = PrettyTable(["Algorithm Name","Train Score(R2 Score)","Test Score(R2 Score)", "MSE Score"])
for model in (ompcv,abr,etr,hbr,lcv,rfr,xgbr):


  model.fit(encoded_features_train,encoded_output_train)
  model_predictions_std = model.predict(encoded_features_test)

  model_train_score = model.score(encoded_features_train,encoded_output_train)
  model_test_score = model.score(encoded_features_test,encoded_output_test)
  model_mse_score=mean_squared_error(encoded_output_test, model_predictions_std)

  results_table.add_row([model.__class__.__name__,round(model_train_score*100,2), 
                      round(model_test_score*100,2), 
                      round(model_mse_score,2)])

print(results_table)

+-----------------------------+-----------------------+----------------------+-----------+
|        Algorithm Name       | Train Score(R2 Score) | Test Score(R2 Score) | MSE Score |
+-----------------------------+-----------------------+----------------------+-----------+
| OrthogonalMatchingPursuitCV |         98.19         |        83.58         |  48419.1  |
|      AdaBoostRegressor      |         99.12         |        72.68         |  80556.84 |
|      ExtraTreeRegressor     |         99.89         |        64.62         | 104320.48 |
|        HuberRegressor       |         98.01         |        59.92         | 118192.59 |
|           LassoCV           |         98.54         |        73.08         |  79378.71 |
|    RandomForestRegressor    |         99.14         |        71.31         |  84601.69 |
|        XGBRFRegressor       |         99.55         |        82.14         |  52668.53 |
+-----------------------------+-----------------------+----------------------+-----------+

In [19]:
mx_dtr = DecisionTreeRegressor()
mx_xgbr = XGBRFRegressor()
mx_ompcv =OrthogonalMatchingPursuitCV()
mx_bgr = BaggingRegressor()
mx_etr = ExtraTreeRegressor()
mx_lcv =  LassoCV()


mx_results_table =PrettyTable(["Algorithm Name","Train Score(R2 Score)","Test Score(R2 Score)", "MSE Score"])
for model_mx in (mx_dtr,mx_xgbr,mx_ompcv,mx_bgr,mx_etr,mx_lcv):

  model_mx.fit(mx_features_train,mx_output_train)
  model_predictions_mx = model_mx.predict(mx_features_test)

  model_train_score = model_mx.score(mx_features_train,mx_output_train)
  model_test_score = model_mx.score(mx_features_test,mx_output_test)
  model_mse_score=mean_squared_error(mx_output_test, model_predictions_mx)

  mx_results_table.add_row([model_mx.__class__.__name__,round(model_train_score*100,2), round(model_test_score*100,2), round(model_mse_score*100,2)])

print(mx_results_table)

+-----------------------------+-----------------------+----------------------+-----------+
|        Algorithm Name       | Train Score(R2 Score) | Test Score(R2 Score) | MSE Score |
+-----------------------------+-----------------------+----------------------+-----------+
|    DecisionTreeRegressor    |         99.53         |         94.0         |    0.85   |
|        XGBRFRegressor       |          98.5         |        97.03         |    0.42   |
| OrthogonalMatchingPursuitCV |         97.64         |        96.56         |    0.49   |
|       BaggingRegressor      |         96.33         |        82.72         |    2.45   |
|      ExtraTreeRegressor     |         99.53         |        90.13         |    1.4    |
|           LassoCV           |         98.32         |        94.23         |    0.82   |
+-----------------------------+-----------------------+----------------------+-----------+


It is possible to see from the above results, XGBRFRegressor algorithm is performing well for minmax scaled encoded type data. As it is author decided to continue this system with XGBRFRegressor Algorithm as the final algotithm predict employee early turnover.

## Model Tuning

In [None]:
# values = [i for i in range(2,200)]
# train_scores, test_scores = list(), list()
# for i in values:
#   model2 = XGBRFRegressor(n_estimators=i)
#   model2.fit(mx_features_train,mx_output_train)
#   train_acc= model2.score(mx_features_train,mx_output_train)
#   test_acc =model2.score(mx_features_test,mx_output_test)
#   test_scores.append(test_acc)
#   print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

# # plot of train and test scores vs tree depth
# plt.plot(values, train_scores, '-o', label='Train')
# plt.plot(values, test_scores, '-o', label='Test')
# plt.legend()
# plt.show()

In [20]:
best_params ={
    'max_depth': 5,
    'n_estimators': 6}

tuned_results_table = PrettyTable(["Algorithm Name","Train Score(R2 Score)","Test Score(R2 Score)","MSE Score"])

tuned_classifier = XGBRFRegressor(**best_params)

tuned_classifier.fit(mx_features_train,mx_output_train)
model_predictions = tuned_classifier.predict(mx_features_test)

model_train_score = tuned_classifier.score(mx_features_train,mx_output_train)
model_test_score = tuned_classifier.score(mx_features_test,mx_output_test)
model_mse_score=mean_squared_error(mx_output_test, model_predictions)

tuned_results_table.add_row([tuned_classifier.__class__.__name__,round(model_train_score*100,2), round(model_test_score*100,2), round(model_mse_score*100,2)])

print(tuned_results_table)

+----------------+-----------------------+----------------------+-----------+
| Algorithm Name | Train Score(R2 Score) | Test Score(R2 Score) | MSE Score |
+----------------+-----------------------+----------------------+-----------+
| XGBRFRegressor |         98.62         |        98.44         |    0.22   |
+----------------+-----------------------+----------------------+-----------+


## Model/Scaler Export

In [21]:
pickle.dump(tuned_classifier, open('employee_turnover_days_xgb_regression_model.pkl', 'wb'))
pickle.dump(min_max_scaler_features, open('employee_turnover_days_features_min_max_feature_scaler.pkl', 'wb'))
pickle.dump(min_max_scaler_output, open('employee_turnover_days_features_min_max_output_scaler.pkl', 'wb'))

## Application

In [40]:
age=2 #20-30=>1,30-40=>2,40> => 3
gender=1 #M=>1,F=>2
marital_status=1 #marital_status_Married with children => 1 , marital_status_Married without children => 2 ,marital_status_Single=>3
educational_status=4 # below_ol = 1, ol_passer = 2, al_passer= 3, diploma_holder=4,degree_holder=5
total_years_industry=4 # total_years_industry_Less than 1 year=>1, total_years_industry_1 - 3 years => 2 ,total_years_industry_3 - 5 years => 3, total_years_industry_5 - 10 years => 4, total_years_industry_10 - 15 years=5,total_years_industry_15 years and above=6
years_work_current_hotel=2 # years_work_current_hotel_Less than 1 year => 1, years_work_current_hotel_1 - 3 years => 2, years_work_current_hotel_3 - 5 years=>3, years_work_current_hotel_5 - 10 years=4, years_work_current_hotel_10 - 15 years=>5,
number_of_years_current_role=2 # number_of_years_current_role_Less than 1 year=1, number_of_years_current_role_1 - 3 years=2, number_of_years_current_role_3 - 5 years=3, number_of_years_current_role_5 - 10 years=4, number_of_years_current_role_10 - 15 years=5, number_of_years_current_role_15 years and above=6
department=1 # department_Food and Beverages=>1, department_Front office=>2, department_Housekeeping=>3, department_Maintenance=>4, department_Security=>5

if(age==1):
    age_category_20_30=1
    age_category_30_40=0 
    age_category_above_40=0
elif(age==2):
    age_category_20_30=0
    age_category_30_40=1 
    age_category_above_40=0
else:
    age_category_20_30=0
    age_category_30_40=0 
    aage_category_above_40=1 


if(gender==1):
    gender_female=0
    gender_male=1
else:
    gender_female=1
    gender_male=0 

    
if(marital_status==1):
    marital_status_Married_with_children=1
    marital_status_Married_without_children=0
    marital_status_Single=0
    
elif(marital_status==2):
    marital_status_Married_with_children=0
    marital_status_Married_without_children=1
    marital_status_Single=0
else:
    marital_status_Married_with_children=0
    marital_status_Married_without_children=0,
    marital_status_Single=1


if(educational_status==1):
    educational_status_below_ol = 1
    educational_status_ol_passer = 0
    educational_status_al_passer= 0
    educational_status_diploma_holder=0
    educational_status_degree_holder=0
    
elif(educational_status==2):
    educational_status_below_ol = 0
    educational_status_ol_passer = 1
    educational_status_al_passer= 0
    educational_status_diploma_holder=0
    educational_status_degree_holder=0
    
elif educational_status==3:
    educational_status_below_ol = 0
    educational_status_ol_passer = 0
    educational_status_al_passer= 1
    educational_status_diploma_holder=0
    educational_status_degree_holder=0
    
elif educational_status==4:
    educational_status_below_ol = 0
    educational_status_ol_passer = 0
    educational_status_al_passer= 0
    educational_status_diploma_holder=1
    educational_status_degree_holder=0
else:
    educational_status_below_ol = 0
    educational_status_ol_passer = 0
    educational_status_al_passer= 0
    educational_status_diploma_holder=0
    educational_status_degree_holder=1
   

if(total_years_industry==1):
    total_years_industry_less_than_1_year=1
    total_years_industry_1_3_years=0
    total_years_industry_3_5_years=0 
    total_years_industry_5_10_years=0
    total_years_industry_10_15_years=0
    total_years_industry_15_years_above=0
     
elif(total_years_industry==2):
    total_years_industry_less_than_1_year=0
    total_years_industry_1_3_years=1 
    total_years_industry_3_5_years=0 
    total_years_industry_5_10_years=0
    total_years_industry_10_15_years=0
    total_years_industry_15_years_above=0
     
elif(total_years_industry==3):
    total_years_industry_less_than_1_year=0
    total_years_industry_1_3_years=0
    total_years_industry_3_5_years=1
    total_years_industry_5_10_years=0
    total_years_industry_10_15_years=0
    total_years_industry_15_years_above=0
     
elif(total_years_industry==4):
    total_years_industry_less_than_1_year=0
    total_years_industry_1_3_years=0
    total_years_industry_3_5_years=0 
    total_years_industry_5_10_years=1
    total_years_industry_10_15_years=0
    total_years_industry_15_years_above=0
     
elif(total_years_industry==5):
    total_years_industry_less_than_1_year=0
    total_years_industry_1_3_years=0
    total_years_industry_3_5_years=0 
    total_years_industry_5_10_years=0
    total_years_industry_10_15_years=1
    total_years_industry_15_years_above=0
     
else:
    total_years_industry_less_than_1_year=0
    total_years_industry_1_3_years=0
    total_years_industry_3_5_years=0 
    total_years_industry_5_10_years=0
    total_years_industry_10_15_years=0
    total_years_industry_15_years_above=1


if(years_work_current_hotel==1):
    years_work_current_hotel_less_than_year=1
    years_work_current_hotel_1_3_years=0
    years_work_current_hotel_3_5_years=0
    years_work_current_hotel_5_10_years=0
    years_work_current_hotel_10_15_years=0
    years_work_current_hotel_15_above=0
        
elif years_work_current_hotel==2:
    years_work_current_hotel_less_than_year=0
    years_work_current_hotel_1_3_years=1
    years_work_current_hotel_3_5_years=0
    years_work_current_hotel_5_10_years=0
    years_work_current_hotel_10_15_years=0
    years_work_current_hotel_15_above=0
     
elif years_work_current_hotel==3:
    years_work_current_hotel_less_than_year=0
    years_work_current_hotel_1_3_years=0
    years_work_current_hotel_3_5_years=1
    years_work_current_hotel_5_10_years=0
    years_work_current_hotel_10_15_years=0
    years_work_current_hotel_15_above=0
     
elif years_work_current_hotel==4:
    years_work_current_hotel_less_than_year=0
    years_work_current_hotel_1_3_years=0
    years_work_current_hotel_3_5_years=0
    years_work_current_hotel_5_10_years=1
    years_work_current_hotel_10_15_years=0
    yyears_work_current_hotel_15_above=0

elif years_work_current_hotel==5:
    years_work_current_hotel_less_than_year=0
    years_work_current_hotel_1_3_years=0
    years_work_current_hotel_3_5_years=0
    years_work_current_hotel_5_10_years=0
    years_work_current_hotel_10_15_years=1
    years_work_current_hotel_15_above=0
     
else:
    years_work_current_hotel_less_than_year=0
    years_work_current_hotel_1_3_years=0
    years_work_current_hotel_3_5_years=0
    years_work_current_hotel_5_10_years=0
    years_work_current_hotel_10_15_years=0
    years_work_current_hotel_15_above=1
    
 
     
if(number_of_years_current_role==1):
    number_of_years_current_role_less_than_year=1
    number_of_years_current_role_1_3_years=0
    number_of_years_current_role_3_5_years=0
    number_of_years_current_role_5_10_years=0
    number_of_years_current_role_10_15_years=0
    number_of_years_current_role_15_above=0
     
elif number_of_years_current_role==2:
    number_of_years_current_role_less_than_year=0
    number_of_years_current_role_1_3_years=1
    number_of_years_current_role_3_5_years=0
    number_of_years_current_role_5_10_years=0
    number_of_years_current_role_10_15_years=0
    number_of_years_current_role_15_above=0
     
elif number_of_years_current_role==3:
    number_of_years_current_role_less_than_year=0
    number_of_years_current_role_1_3_years=0
    number_of_years_current_role_3_5_years=1
    number_of_years_current_role_5_10_years=0
    number_of_years_current_role_10_15_years=0
    number_of_years_current_role_15_above=0
     
elif number_of_years_current_role==4:
    number_of_years_current_role_less_than_year=0
    number_of_years_current_role_1_3_years=0
    number_of_years_current_role_3_5_years=0
    number_of_years_current_role_5_10_years=1
    number_of_years_current_role_10_15_years=0
    number_of_years_current_role_15_above=0

elif number_of_years_current_role==5:
    number_of_years_current_role_less_than_year=0
    number_of_years_current_role_1_3_years=0
    number_of_years_current_role_3_5_years=0
    number_of_years_current_role_5_10_years=0
    number_of_years_current_role_10_15_years=1
    number_of_years_current_role_15_above=0
     
else:
    number_of_years_current_role_less_than_year=0
    number_of_years_current_role_1_3_years=0
    number_of_years_current_role_3_5_years=0
    number_of_years_current_role_5_10_years=0
    number_of_years_current_role_10_15_years=0
    number_of_years_current_role_15_above=1

     
if (department==1):
    department_food_beverages=1
    department_front_office=0
    department_housekeeping=0
    department_maintenance=0
    department_security=0

elif department==2:
    department_food_beverages=0
    department_front_office=1
    department_housekeeping=0
    department_maintenance=0
    department_security=0
     
elif department==3:
    department_food_beverages=0
    department_front_office=0
    department_housekeeping=1
    department_maintenance=0
    department_security=0
     
elif department==4:
    department_food_beverages=0
    department_front_office=0
    department_housekeeping=0
    department_maintenance=1
    department_security=0
     
else:
    department_food_beverages=0
    department_front_office=0
    department_housekeeping=0
    department_maintenance=0
    department_security=1


datapoint = pd.DataFrame({
    'age_category_20 - 30':age_category_20_30,
    'age_category_30 - 40': age_category_30_40, 
    'age_category_Above 40':age_category_above_40, 
    'gender_Female':gender_female,
    'gender_Male':gender_male,
    'marital_status_Married with children':marital_status_Married_with_children,
    'marital_status_Married without children':marital_status_Married_without_children, 
    'marital_status_Single':marital_status_Single,
    'educational_status_A/L passer':educational_status_al_passer, 
    'educational_status_Below O/L':educational_status_below_ol,
    'educational_status_Degree holder':educational_status_degree_holder, 
    'educational_status_Diploma holder':educational_status_diploma_holder,
    'educational_status_O/L passer':educational_status_ol_passer, 
    'total_years_industry_1 - 3 years':total_years_industry_1_3_years,
    'total_years_industry_10 - 15 years':total_years_industry_10_15_years,
    'total_years_industry_15 years and above':total_years_industry_15_years_above,
    'total_years_industry_3 - 5 years':total_years_industry_3_5_years, 
    'total_years_industry_5 - 10 years':total_years_industry_5_10_years,
    'total_years_industry_Less than 1 year':total_years_industry_less_than_1_year,
    'years_work_current_hotel_1 - 3 years':years_work_current_hotel_1_3_years,
    'years_work_current_hotel_10 - 15 years':years_work_current_hotel_10_15_years,
    'years_work_current_hotel_15 years and above':years_work_current_hotel_15_above,
    'years_work_current_hotel_3 - 5 years':years_work_current_hotel_3_5_years,
    'years_work_current_hotel_5 - 10 years':years_work_current_hotel_5_10_years,
    'years_work_current_hotel_Less than 1 year':years_work_current_hotel_less_than_year,
    'number_of_years_current_role_1 - 3 years':number_of_years_current_role_1_3_years,
    'number_of_years_current_role_10 - 15 years':number_of_years_current_role_10_15_years,
    'number_of_years_current_role_15 years and above':number_of_years_current_role_15_above,
    'number_of_years_current_role_3 - 5 years':number_of_years_current_role_3_5_years,
    'number_of_years_current_role_5 - 10 years':number_of_years_current_role_5_10_years,
    'number_of_years_current_role_Less than 1 year':number_of_years_current_role_less_than_year,
    'department_Food and Beverages':department_food_beverages, 
    'department_Front office':department_front_office,
    'department_Housekeeping':department_housekeeping, 
    'department_Maintenance':department_maintenance,
    'department_Security':department_security},index=[0])

In [42]:
#model/sclaers importing
xgbr_model = pickle.load(open("employee_turnover_days_xgb_regression_model.pkl", 'rb'))
mx_input_scaler = pickle.load(open("employee_turnover_days_features_min_max_feature_scaler.pkl", 'rb'))
mx_output_scaler = pickle.load(open("employee_turnover_days_features_min_max_output_scaler.pkl", 'rb'))

#scaling
scaled_datapoint = mx_input_scaler.transform(datapoint)

#predicting
pred_results=xgbr_model.predict(scaled_datapoint)
inverese_transformed_results = mx_output_scaler.inverse_transform(pred_results.reshape(1, -1))
print("This Employee will leave in(weeks): ", (inverese_transformed_results[0][0]%365)/7)

This Employee will leave in(weeks):  20.143668038504465
