In [1]:
#IMPORTING AND DOWNLOADING ALL THE REQUIRED MEANS

# !pip install -q hvplot
# !pip install scikit-learn==1.2.2
# !pip install imblearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot
import hvplot.pandas
%matplotlib inline

In [2]:
#READING FILE FROM THE SYSTEM

file_path = r"C:\Users\Suyash Shringi\OneDrive\Desktop\Projects\Employee Retention\WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = pd.read_csv(file_path)
print(df.head())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [3]:
# df.info()

In [4]:
# df.describe()

In [5]:
# #CHECKING NUMBER OF UNIQUE VALUES FOR EVERY FEATURE

# for column in df.columns:
#     print(f"{column}: {df[column].nunique()}")

In [6]:
#REMOVING CONSTANT FEATURES ,i.e., EmployeeCount, Over18, StandardHours have 1 unique value and EmployeeNumber is irrelevant to the O/P of the model

df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)

In [7]:
# #CHECKING DIMENSIONS OF THE DATASET

# df.shape

In [8]:
#CONVERTING O/P CATEGORICAL VARIABLE TO THE NUMERICAL FEATURE OF FORM 1/0(YES/NO)

from sklearn.preprocessing import LabelEncoder

output_Data = LabelEncoder()
df["Attrition"] = output_Data.fit_transform(df.Attrition)

In [9]:
#CHECKING IF ANY MISSING VALUES OR PRESENT OR NOT

df.isna().sum()

#NO MISSING VALUES ARE PRESENT IN ANY FEATURE

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [10]:
#CHECKING IF ANY DUPLICATE VALUE IS PRESENT OR NOT

df.duplicated().sum()

#NO DUPLICATES ARE PRESENT IN THE DATASET

0

In [11]:
df['OverTime'].value_counts()

OverTime
No     1054
Yes     416
Name: count, dtype: int64

In [12]:
#CONVERTING ALL CATEGORICAL VARIABLES TO NUMERICAL FEATURE, USING ORDINAL ENCODER

from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
enc.fit(df[["BusinessTravel","Department", "EducationField", "Gender", "JobRole", "MaritalStatus",  "OverTime"]])
df[["BusinessTravel","Department", "EducationField", "Gender", "JobRole", "MaritalStatus",  "OverTime"]] = enc.transform(df[["BusinessTravel","Department", "EducationField", "Gender", "JobRole", "MaritalStatus",  "OverTime"]])

In [13]:
df['OverTime'].value_counts()

OverTime
0.0    1054
1.0     416
Name: count, dtype: int64

In [14]:
# #PLOTTING BOX-PLOT TO CHECK FOR OUTLIERS

# ColsBox = df.select_dtypes('number')
# for col in ColsBox.columns:
#     plt.figure(figsize=(10,6))
#     plt.title('box plot of '+col)
#     sns.boxplot(df[col])
#     plt.show()

# #OBSERVATION :
# #1. MonthlyIncome(+)
# #2. NumCompaniesWorked(+)
# #3. StockOptionLevel(+)
# #4. TotalWorkingYears(+)
# #5. TrainingThisLastYear(+/-)
# #6. YearsAtCompany(+)
# #7. YearsInCurrentRole(+)
# #8. YearsSinceLastPromtion(+)
# #9. YearsWithCurrentManager(+)

In [15]:
#DESIGNING FUNCTION TO REMOVE OUTLIERS USING THE CAPPING METHOD, cause WHILE TRIMMING, EXCESSIVE DATA GETS REMOVED

def remove_outliers(df):
    for col in df.select_dtypes(include='number').columns:
        if col == 'PerformanceRating':
            continue
        if col == 'Attrition':
            continue
        percentile25 = df[col].quantile(0.25)
        percentile75 = df[col].quantile(0.75)
        iqr = percentile75 - percentile25
        print(iqr)
        upper_limit = percentile75 + 1.5 * iqr
        lower_limit = percentile25 - 1.5 * iqr
        print(upper_limit)
        print(lower_limit)
        df[col] = np.where(df[col] > upper_limit, upper_limit, df[col])
        df[col] = np.where(df[col] <lower_limit, lower_limit, df[col])
    return df

In [16]:
# Remove outliers
df = remove_outliers(df)

13.0
62.5
10.5
1.0
3.5
-0.5
692.0
2195.0
-573.0
1.0
3.5
-0.5
12.0
32.0
-16.0
2.0
7.0
-1.0
2.0
6.0
-2.0
2.0
7.0
-1.0
1.0
2.5
-1.5
35.75
137.375
-5.625
1.0
4.5
0.5
2.0
6.0
-2.0
5.0
14.5
-5.5
2.0
7.0
-1.0
1.0
3.5
-0.5
5468.0
16581.0
-5291.0
12414.5
39083.25
-10574.75
3.0
8.5
-3.5
1.0
2.5
-1.5
6.0
27.0
3.0
2.0
7.0
-1.0
1.0
2.5
-1.5
9.0
28.5
-7.5
1.0
4.5
0.5
1.0
4.5
0.5
6.0
18.0
-6.0
5.0
14.5
-5.5
3.0
7.5
-4.5
5.0
14.5
-5.5


In [17]:
# #RE-CHECKING TO ENSURE NO MORE OUTLIERS ARE PRESENT

# ColsBox = df.select_dtypes('number')
# for col in ColsBox.columns:
#     plt.figure(figsize=(10,6))
#     plt.title('box plot of '+col)
#     sns.boxplot(df[col])
#     plt.show()

In [18]:
# #PLOTTING HISTOGRAMS TO UNDERSTAND THE DISTRIBUTION OF THE FEATURE

# histplotter = df.select_dtypes('number')
# for col in histplotter.columns:
#     plt.figure(figsize=(10,6))
#     plt.title('HIST '+col)
#     sns.histplot(data=df, x=col, hue='Attrition', bins=30, element='step')
#     plt.show()

In [19]:
#PERFORMING TRAIN/TEST SPLIT

X = df.drop('Attrition', axis=1)
y = df['Attrition']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

In [20]:
#REMOVING COLUMNS THAT ARE HIGHELY CO-RELATED
# 1. Monthly Income - Job Level (0.94)
# 2. Years in current role - Years at company (0.82)
# 3. Years at company - Years with current manager(0.84)

# DROPPING THE ONE FEATURE WHICH HAS LESS IMPACT ON THE ACCURACY AND F1 SCORE

X_train.drop(['MonthlyIncome', 'YearsAtCompany', 'YearsInCurrentRole'] , axis=1, inplace=True)
X_test.drop(['MonthlyIncome', 'YearsAtCompany', 'YearsInCurrentRole'] , axis=1, inplace=True)

In [21]:
#DROPPING FEATURES, BASED ON THEIR POOR PERFORMANCE ON ALL THE ABOVE TESTS CONDUCTED

X_train.drop(['PerformanceRating', 'RelationshipSatisfaction', 'DistanceFromHome', 'JobInvolvement', 'JobSatisfaction', 'EnvironmentSatisfaction','TrainingTimesLastYear', 'Gender'] , axis=1, inplace=True)
X_test.drop(['PerformanceRating', 'RelationshipSatisfaction', 'DistanceFromHome', 'JobInvolvement', 'JobSatisfaction', 'EnvironmentSatisfaction','TrainingTimesLastYear', 'Gender'] , axis=1, inplace=True)

In [22]:
from sklearn.preprocessing import StandardScaler

# List of categorical columns
categorical_columns = [
    'BusinessTravel', 'Department', 'EducationField',
    'JobRole', 'MaritalStatus', 'OverTime'
]


# Separate numerical and categorical columns
X_train_num = X_train.drop(columns=categorical_columns)
X_test_num = X_test.drop(columns=categorical_columns)

X_train_cat = X_train[categorical_columns]
X_test_cat = X_test[categorical_columns]

# Apply scaling to the numerical columns
scaler = StandardScaler()
X_train_scaled_num = scaler.fit_transform(X_train_num)
X_test_scaled_num = scaler.transform(X_test_num)

# Transform back into DataFrame
X_train_scaled_num = pd.DataFrame(X_train_scaled_num, columns=X_train_num.columns)
X_test_scaled_num = pd.DataFrame(X_test_scaled_num, columns=X_test_num.columns)

# Concatenate scaled numerical data with categorical data
X_train_scaled = pd.concat([X_train_scaled_num, X_train_cat.reset_index(drop=True)], axis=1)
X_test_scaled = pd.concat([X_test_scaled_num, X_test_cat.reset_index(drop=True)], axis=1)

X_train=X_train_scaled
X_test=X_test_scaled

# Check the results
print(X_train.head())
print(X_test.head())


        Age  DailyRate  Education  HourlyRate  JobLevel  MonthlyRate  \
0 -0.426594   0.073582   1.034871   -0.138483 -0.966574     0.056416   
1  0.009360   1.517771  -0.892300    0.887919 -0.966574     1.562622   
2  1.426212   1.005718  -1.855885    0.399156 -0.966574    -1.560354   
3  0.118349  -1.457607   0.071286   -1.067131  0.855047    -0.787110   
4  1.317223  -0.684556  -0.892300   -0.724998  0.855047     1.141070   

   NumCompaniesWorked  PercentSalaryHike  StockOptionLevel  TotalWorkingYears  \
0           -0.681533          -0.620119          0.311165          -1.251429   
1           -1.090558          -0.348311         -0.984162          -0.561665   
2           -0.681533          -1.163734          2.254156           1.231721   
3           -1.090558           1.554342         -0.984162          -0.147807   
4            0.545540          -1.163734          0.311165           2.404320   

   WorkLifeBalance  YearsSinceLastPromotion  YearsWithCurrManager  \
0        -1

In [24]:
#IMPORTING LIBRARIES TO CHECK INTITAL ACCURACY

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score,confusion_matrix, classification_report, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

In [25]:
#FUNCTION TO COMPUTE ACCURACY_SCORE, CONFUSION_MATRIX, 

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n")
        print(f"Confusion Matrix: \n{confusion_matrix(y_train, pred)}\n")
        print(f"Mean Squared Error: {mean_squared_error(y_train, pred):.2f}\n")
        print(f"Classification Report: \n{clf_report}\n")
        print(f"Precision: {clf_report.loc['precision', 'weighted avg']:.2f}")
        print(f"Recall: {clf_report.loc['recall', 'weighted avg']:.2f}")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")        

    
    else:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n")        
        print(f"Confusion Matrix: \n{confusion_matrix(y_test, pred)}\n")
        print(f"Mean Squared Error: {mean_squared_error(y_test, pred):.2f}\n")
        print(f"Classification Report: \n{clf_report}\n")
        print(f"Precision: {clf_report.loc['precision', 'weighted avg']:.2f}")
        print(f"Recall: {clf_report.loc['recall', 'weighted avg']:.2f}")

In [26]:
#FUNCTION TO COMPUTE ACCURACY_SCORE, CONFUSION_MATRIX, MEAN_SQUARED_ERROR

def print_score1(clf, X_train, y_train, X_test, y_test, train=True):
    
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")        
        print(f"Confusion Matrix: \n{confusion_matrix(y_train, pred)}\n")
        print(f"Mean Squared Error: {mean_squared_error(y_train, pred):.2f}\n")
        print(f"Classification Report: \n{clf_report}\n")
        print(f"Precision: {clf_report.loc['precision', 'weighted avg']:.2f}")
        print(f"Recall: {clf_report.loc['recall', 'weighted avg']:.2f}")
    
    else:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")        
        print(f"Confusion Matrix: \n{confusion_matrix(y_test, pred)}\n")
        print(f"Mean Squared Error: {mean_squared_error(y_test, pred):.2f}\n")
        print(f"Classification Report: \n{clf_report}\n")
        print(f"Precision: {clf_report.loc['precision', 'weighted avg']:.2f}")
        print(f"Recall: {clf_report.loc['recall', 'weighted avg']:.2f}")

In [27]:
#CHECKING VARIOUS METRICS ACROSS DIFFERENT MODELS

classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGB": XGBClassifier(),
    "Naive Bayes": GaussianNB()
}
for clf_name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    print("---------------------------\n")
    print(f"Classifier: {clf_name}")
    print_score1(clf, X_train_scaled, y_train, X_test_scaled, y_test, train=True)
    print_score1(clf, X_train_scaled, y_train, X_test_scaled, y_test, train=False) 
    print("---------------------------\n")


---------------------------

Classifier: Decision Tree
Train Result:

Accuracy Score: 100.00%
Confusion Matrix: 
[[989   0]
 [  0 187]]

Mean Squared Error: 0.00

Classification Report: 
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    989.0  187.0       1.0     1176.0        1176.0

Precision: 1.00
Recall: 1.00
Test Result:

Accuracy Score: 75.51%
Confusion Matrix: 
[[207  37]
 [ 35  15]]

Mean Squared Error: 0.24

Classification Report: 
                    0          1  accuracy   macro avg  weighted avg
precision    0.855372   0.288462  0.755102    0.571917      0.758959
recall       0.848361   0.300000  0.755102    0.574180      0.755102
f1-score     0.851852   0.294118  0.755102    0.572985      0.756999
support    244.000000  50.000000  0.755102  294.000000    294.000000

Precision: 0

In [28]:
# #CREATING CO-RELATION MATRIX

# plt.figure(figsize=(19,13))
# cor=X_train.corr()
# sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
# plt.show()

In [29]:
# #CO-RELATION FUNCTINO TO CHECK FOR FEATURES THAT ARE ABOVE THE THRESHOLD

# def correlation(dataset, threshold):
#     col_corr = set() # Set of all the names of deleted columns
#     corr_matrix = dataset.corr()
#     for i in range(len(corr_matrix.columns)):
#         for j in range(i):
#             if ((corr_matrix.iloc[i, j] >= threshold)):
#                 colname = corr_matrix.columns[i] # getting the name of column
#                 col_corr.add(colname)

#     return col_corr

In [30]:
# #SETTING THE THRESHOLD VALUE, WHICH IS IN THIS 0.8

# col_corr = correlation(X_train, 0.8)
# print(col_corr)

In [31]:
# #IMPLEMENTING CHI-SQUARE TEST

# from sklearn.feature_selection import chi2
# values = chi2(X_train, y_train)

# f_values=pd.Series(values[0])
# f_values.index = X_train.columns
# f_values.sort_values(ascending=False, inplace=True)
# p_values=pd.Series(values[1])
# p_values.index = X_train.columns
# p_values.sort_values(ascending=True, inplace=True)

# #OBSERVATION :
# # 1. OVERTIME IS THE MOST IMPORTANT COLUMN
# # 2. Whereas hourly rate, performance rating, business travel are least important column in that order

In [32]:
#HIGHER THE F_VALUE HIGHER IS THE IMPORTANCE

# f_values

In [33]:
#LOWER P_VALUE INDICATES HIGHER PRIORITY FEATURE

# p_values

In [34]:
# #INFO GAIN FeatureSelctionTechnique

# from sklearn.feature_selection import mutual_info_classif
# # determine the mutual information
# mutual_info = mutual_info_classif(X_train, y_train)
# mutual_info

In [35]:
# #ARRANGING THEM IN DESCENDIG VALUES

# mutual_info = pd.Series(mutual_info)
# mutual_info.index = X_train.columns
# mutual_info.sort_values(ascending=False)

In [36]:
# #PLOT :)

# mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

# #FEATURES HAVING MUTUAL INFO VALUE -> 0 HAVE VERY LITTLE IMPACT ON THE TARGET VARIABLE

In [37]:
# #IMPLEMENTING BACKWARD WRAPPER SELECTION TECHNIQUE

# from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# dt = DecisionTreeClassifier()
# dt.fit(X_train, y_train)

# sfs_accuracy_score_forward = SFS(dt,
#            k_features=17,
#            forward=True,
#            floating=False,
#            scoring='accuracy',
#            )
# sfs_accuracy_score_forward = sfs_accuracy_score_forward.fit(X_train, y_train)

# #CREATED DATAFRAME

# metric_df_accuracy_forward = pd.DataFrame.from_dict(sfs_accuracy_score_forward.get_metric_dict()).T
# metric_df_accuracy_forward['observations'] = 404
# metric_df_accuracy_forward['num_features'] = metric_df_accuracy_forward['feature_idx'].apply(lambda x: len(x))

# #CREATING LIST IN DESCENDING ORDER TO REPRESENT AVG_SCORES

# features_score_forward = metric_df_accuracy_forward.groupby('feature_idx').agg({'avg_score': 'mean'}).reset_index()
# features_score_forward.sort_values('avg_score', ascending=False, inplace=True)
# features_score_forward

In [38]:
# #PRINTING MATRIX, AVG_SCORE AT EACH WRAPPER STEP

# print("ACCURACY MATRIX : ")
# print(metric_df_accuracy_forward)
# accuracy_scores = metric_df_accuracy_forward['avg_score']
# print("Accuracy scores at each step:")
# print(accuracy_scores)

In [39]:
# #PLOTTING GRAPH FOR BETTER UNDERSTANDING

# from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# fig1 = plot_sfs(sfs_accuracy_score_forward.get_metric_dict(), kind='std_dev')
# plt.title('WRAPPER SELECTION :)')
# plt.grid()
# plt.show()

In [40]:
 #IMPORTING LIBRARIES TO USE DIFFERENT MODELS FOR MODELLING PROCESS
# !pip install bayesian-optimization
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statistics import mean, stdev

In [41]:
#FUNCTION FOR BAYESIAN OPTIMIZER FOR RANDOM FOREST

def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    # Convert parameters to integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth) if max_depth != None else None
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    max_features = max_features if max_features != 'auto' else None  # 'auto' is deprecated in newer versions
    
    # Create a RandomForestClassifier instance with the given parameters
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=60
    )
    
    # Perform cross-validation and return the mean score
    return np.max(cross_val_score(rf, X_train_scaled, y_train, cv=5, n_jobs=-1))


In [42]:
#HYPER-PARAMETER TUING OF RANDOM FOREST CLASSIFIER USING GRID_SEARCHcv

# Define the parameter grid
param_grid = {
    'n_estimators': [50,100],
    'max_depth': [None,1,2],
    'min_samples_split': [2,3,4],
    'min_samples_leaf': [1,2,3,4,5,6],
    'max_features': ['auto' ,'sqrt']
}
# Create a RandomForestClassifier instance
rf = RandomForestClassifier(random_state=42)

# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV instance
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_rf = grid_search.best_estimator_
print("Random Forest Classifier with Best Hyperparameters:")
print_score1(best_rf, X_train_scaled, y_train, X_test_scaled, y_test, train=True)
print_score1(best_rf, X_train_scaled, y_train, X_test_scaled, y_test, train=False)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  warn(


Best Parameters: {'max_depth': None, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.8571366750811397
Random Forest Classifier with Best Hyperparameters:
Train Result:

Accuracy Score: 100.00%
Confusion Matrix: 
[[989   0]
 [  0 187]]

Mean Squared Error: 0.00

Classification Report: 
               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    989.0  187.0       1.0     1176.0        1176.0

Precision: 1.00
Recall: 1.00
Test Result:

Accuracy Score: 85.03%
Confusion Matrix: 
[[242   2]
 [ 42   8]]

Mean Squared Error: 0.15

Classification Report: 
                    0          1  accuracy   macro avg  weighted avg
precision    0.852113   0.800000   0.85034    0.826056      0.843250
recall       0.991803   0.160000   0.85034    0.575902      0.8

In [43]:
#HYPER-PARAMETER TUNING OF RANDOM FOREST CLASSIFIER USING BAYESIAN-OPTIMIZATION

param_bounds = {
    'n_estimators': (50,150),
    'max_depth': (1, 25),
    'min_samples_split': (2, 15),
    'min_samples_leaf': (1, 15),
    'max_features': (0.1, 0.999)  # max_features should be a fraction of features to consider
}


# Create a BayesianOptimization instance
optimizer = BayesianOptimization(
    f=rf_cv,
    pbounds=param_bounds,
    random_state=42
)

# Perform the optimization
optimizer.maximize(init_points=10, n_iter=30)
best_val = optimizer.max['target']

# Print the best parameters
print("Best Parameters:", optimizer.max['params'])

# Create the best RandomForestClassifier with the optimized parameters
best_params = optimizer.max['params']
best_rf = RandomForestClassifier(
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']) if best_params['max_depth'] != None else None,
    min_samples_split=int(best_params['min_samples_split']),
    min_samples_leaf=int(best_params['min_samples_leaf']),
    max_features=best_params['max_features'] if best_params['max_features'] != 'auto' else None,
    random_state=42
)

# Fit the model on the training data
best_rf.fit(X_train_scaled, y_train)

# Evaluate the best model on the test set
print("Random Forest Classifier with Best Hyperparameters:")
print_score(best_rf, X_train_scaled, y_train, X_test_scaled, y_test, train=True) 
print_score(best_rf, X_train_scaled, y_train, X_test_scaled, y_test, train=False)
print(f"Accuracy Score: {best_val*100:.2f}%")

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [30m1         | [30m0.8723    | [30m9.989     | [30m0.9547    | [30m11.25     | [30m9.783     | [30m65.6      |
| [30m2         | [30m0.8468    | [30m4.744     | [30m0.1522    | [30m13.13     | [30m9.814     | [30m120.8     |
| [30m3         | [30m0.8426    | [30m1.494     | [30m0.9719    | [30m12.65     | [30m4.76      | [30m68.18     |
| [30m4         | [30m0.8638    | [30m5.402     | [30m0.3735    | [30m8.347     | [30m7.615     | [30m79.12     |
| [30m5         | [30m0.8681    | [30m15.68     | [30m0.2254    | [30m5.09      | [30m6.763     | [30m95.61     |
| [30m6         | [30m0.8602    | [30m19.84     | [30m0.2795    | [30m8.199     | [30m9.701     | [30m54.65     |
| [30m7         | [30m0.8723    | [30m15.58     | [30m0.2533    | [30m1.911     | [30m14.34     

In [44]:
#FUNCTION FOR BAYESIAN OPTIMIZATION FOR THE XGBOOST MODEL

def xgb_evaluate(n_estimators, max_depth, learning_rate, subsample, colsample_bytree):
    # Convert float parameters to integers where needed
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    # Create the classifier with given parameters
    xgb = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=42
    )
    
    # Perform cross-validation and return the mean score
    cv_scores = cross_val_score(xgb, X_train_scaled, y_train, cv=5, scoring='accuracy', n_jobs=-1)
    return cv_scores.max()

In [45]:
#HYPER-PARAMETER TUNING FOR  XGboost CLASSIFIER USING GRID_SEARCH CV

# Define the parameter grid
param_grid = {
    'n_estimators': [50],
    'max_depth': [2,9],
    'learning_rate': [0.25,0.3, 0.35],
    'subsample': [0.75, 0.8,0.85],
    'colsample_bytree': [0.5 ,0.6, 0.7]
}
# Create a RandomForestClassifier instance
xg = XGBClassifier(random_state=42)

# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=xg, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV instance
grid_search.fit(X_train_scaled, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_xg = grid_search.best_estimator_
print("Random Forest Classifier with Best Hyperparameters:")
print_score1(best_xg, X_train_scaled, y_train, X_test_scaled, y_test, train=True)
print_score1(best_xg, X_train_scaled, y_train, X_test_scaled, y_test, train=False)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.35, 'max_depth': 2, 'n_estimators': 50, 'subsample': 0.8}
Best Score: 0.8733032816444284
Random Forest Classifier with Best Hyperparameters:
Train Result:

Accuracy Score: 90.73%
Confusion Matrix: 
[[976  13]
 [ 96  91]]

Mean Squared Error: 0.09

Classification Report: 
                    0           1  accuracy    macro avg  weighted avg
precision    0.910448    0.875000  0.907313     0.892724      0.904811
recall       0.986855    0.486631  0.907313     0.736743      0.907313
f1-score     0.947113    0.625430  0.907313     0.786271      0.895961
support    989.000000  187.000000  0.907313  1176.000000   1176.000000

Precision: 0.90
Recall: 0.91
Test Result:

Accuracy Score: 85.03%
Confusion Matrix: 
[[237   7]
 [ 37  13]]

Mean Squared Error: 0.15

Classification Report: 
                    0          1  accuracy   macro avg  weighted avg
precision    0.86496

In [46]:
#HYPER-PARAMETER TUNING FOR XGboost CLASSIFIER USING BAYESIAN-OPTIMIZATION

param_bounds = {
    'n_estimators': (50, 300),  # Only one value, but necessary to define the bounds
    'max_depth': (2,20),
    'learning_rate': (0.01, 0.55),
    'subsample': (0.1, 0.9),
    'colsample_bytree': (0.2, 0.8)
}

# Initialize Bayesian Optimization
optimizer = BayesianOptimization(f=xgb_evaluate, pbounds=param_bounds, random_state=42, verbose=2)

# Run the optimization
optimizer.maximize(init_points=5, n_iter=40)
best_val = optimizer.max['target']


# Extract the best parameters
best_params = optimizer.max['params']
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])

print("Best Parameters:", best_params)

# Train the final model with the best parameters
best_xg = XGBClassifier(**best_params, random_state=42)
best_xg.fit(X_train, y_train)

# Evaluate the best model
print("XG BOOST Classifier with Best Hyperparameters:")
print_score(best_xg, X_train, y_train, X_test, y_test, train=True)
print_score(best_xg, X_train, y_train, X_test, y_test, train=False)
print(f"Accuracy Score: {best_val*100:.2f}%")

|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [30m1         | [30m0.8298    | [30m0.4247    | [30m0.5234    | [30m15.18     | [30m199.7     | [30m0.2248    |
| [35m2         | [35m0.8644    | [35m0.2936    | [35m0.04137   | [35m17.59     | [35m200.3     | [35m0.6665    |
| [30m3         | [30m0.822     | [30m0.2124    | [30m0.5338    | [30m16.98     | [30m103.1     | [30m0.2455    |
| [30m4         | [30m0.8596    | [30m0.31      | [30m0.1743    | [30m11.45     | [30m158.0     | [30m0.333     |
| [35m5         | [35m0.8809    | [35m0.5671    | [35m0.08533   | [35m7.259     | [35m141.6     | [35m0.4649    |
| [30m6         | [30m0.8766    | [30m0.5882    | [30m0.05942   | [30m7.282     | [30m141.6     | [30m0.5024    |
| [30m7         | [30m0.8553    | [30m0.3591    | [30m0.3418    | [30m7.032     | [30m141.9     

In [47]:
import pickle as pkl

In [50]:
pkl.dump(best_rf,open('employee1.pkl', 'wb'))

In [49]:
pkl.dump(scaler,open('scaler.pkl','wb'))
