In [1]:
# for Mathematical Operations
import numpy as np

# for dataframe Manipulation
import pandas as pd 

# for data visualization
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Importing the dataset
data = pd.read_csv("employee_promotion.csv")

# lets check the shape of the dataset
data.shape

(54808, 14)

In [3]:
# lets check the head of the dataset
data.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [6]:
# Lets generate the Class Distrubtion 

target_class = data.is_promoted.value_counts()
print('Class 0:', target_class[0])
print('Class 1:', target_class[1])
print('Ratio:', round(target_class[0] / target_class[1], 2), ': 1')

Class 0: 50140
Class 1: 4668
Ratio: 10.74 : 1


In [7]:
#checking null values
data.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [8]:
# Imputing the Missing values

data['education'].fillna(data['education'].mode()[0], inplace = True)
data['previous_year_rating'].fillna(data['previous_year_rating'].mode()[0], inplace=True)

# Checking the Number of Missing Values
data.isnull().sum().sum()

0

In [9]:
# lets check all the columns which are in String
data.select_dtypes('object').columns

Index(['department', 'region', 'education', 'gender', 'recruitment_channel'], dtype='object')

In [11]:
# Using label encoder for converting categorical values to numerical values

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
data['department'] = lb.fit_transform(data['department'])
data['region'] = lb.fit_transform(data['region'])
data['education'] = lb.fit_transform(data['education'])
data['gender'] = lb.fit_transform(data['gender'])
data['recruitment_channel'] = lb.fit_transform(data['recruitment_channel'])

# lets check if there are any Categorical Columns left
data.select_dtypes('object').columns

Index([], dtype='object')

In [12]:
# Let's split the Target Data from the Dataset

X=data.iloc[:,:-1]
y=data.iloc[:,-1]

# lets check the shape of x and y
print("The Shape of X: ", X.shape)
print("The Shape of y: ", y.shape)

The Shape of X:  (54808, 13)
The Shape of y:  (54808,)


In [13]:
# Lets Split the dataset into Training and Testing Sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# lets print the shapes
print("The Shape of X train :", X_train.shape)
print("The Shape of X test :", X_test.shape)
print("The Shape of Y train :", y_train.shape)
print("The Shape of Y test :", y_test.shape)

The Shape of X train : (43846, 13)
The Shape of X test : (10962, 13)
The Shape of Y train : (43846,)
The Shape of Y test : (10962,)




```
# This is formatted as code
```

### Applying Machine Learning Models

#### Applying Logistic Regression

In [18]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.linear_model import LogisticRegression

# define a Model
model1 = LogisticRegression()

# Train the Model
model1.fit(X_train,y_train)

# Predictions for the X test
y_pred1 = model1.predict(X_test)

In [19]:
# Lets Evaluate the Model using Accuracy and Classification Report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_pred1))
print(classification_report(y_test, y_pred1))

0.9159824849480022
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     10041
           1       0.00      0.00      0.00       921

    accuracy                           0.92     10962
   macro avg       0.46      0.50      0.48     10962
weighted avg       0.84      0.92      0.88     10962



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Applying Random Forest

In [22]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model = RandomForestClassifier()

# Train the Model
model.fit(X_train,y_train)

# Predictions for the X test
y_pred = model.predict(X_test)

In [23]:
# Lets Evaluate the Model using Accuracy and Classification Report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.9351395730706076
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     10041
           1       0.89      0.26      0.40       921

    accuracy                           0.94     10962
   macro avg       0.92      0.63      0.68     10962
weighted avg       0.93      0.94      0.92     10962



## Using Random OverSampling

In [29]:
# OverSampling
from imblearn.over_sampling import RandomOverSampler

# defining the Model
ros = RandomOverSampler(random_state=0)

# Training the Model
ros.fit(X, y)

# Creating the Resampled Sets
X_resampled, y_resampled = ros.fit_resample(X, y)

In [30]:
# Lets Check the Class Distribution
y_resampled.value_counts()

1    50140
0    50140
Name: is_promoted, dtype: int64

In [32]:
# Splitting the dataset
X_train_re, X_test_re, y_train_re, y_test_re =train_test_split(X_resampled,
                                                               y_resampled,
                                                               test_size=0.2,
                                                               random_state=0)

# lets print the shapes
print("The Shape of X train :", X_train_re.shape)
print("The Shape of X test :", X_test_re.shape)
print("The Shape of Y train :", y_train_re.shape)
print("The Shape of Y test :", y_test_re.shape)

The Shape of X train : (80224, 13)
The Shape of X test : (20056, 13)
The Shape of Y train : (80224,)
The Shape of Y test : (20056,)


In [33]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model2 = RandomForestClassifier()

# Train the Model
model2.fit(X_train_re, y_train_re)

# Predictions for the X test
y_pred_re = model2.predict(X_test_re)

In [34]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test_re, y_pred_re))
print(classification_report(y_test_re, y_pred_re))

0.9907758276824891
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      9981
           1       0.98      1.00      0.99     10075

    accuracy                           0.99     20056
   macro avg       0.99      0.99      0.99     20056
weighted avg       0.99      0.99      0.99     20056



## Using Random Undersampling 

In [36]:
from imblearn.under_sampling import RandomUnderSampler

# defining the Model
ros = RandomUnderSampler(random_state=0)

# Training the Model
ros.fit(X, y)

# Making the Samples
X_resampled2, y_resampled2 = ros.fit_resample(X, y)

In [37]:
# lets check the Class Distribution
y_resampled2.value_counts()

1    4668
0    4668
Name: is_promoted, dtype: int64

In [38]:
# Splitting the dataset
X_train_us, X_test_us, y_train_us, y_test_us =train_test_split(X_resampled2,
                                                               y_resampled2,
                                                               test_size=0.2,
                                                               random_state=0)

# lets print the shapes
print("The Shape of X train :", X_train_us.shape)
print("The Shape of X test :", X_test_us.shape)
print("The Shape of Y train :", y_train_us.shape)
print("The Shape of Y test :", y_test_us.shape)

The Shape of X train : (7468, 13)
The Shape of X test : (1868, 13)
The Shape of Y train : (7468,)
The Shape of Y test : (1868,)


In [39]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model3 = RandomForestClassifier()

# Train the Model
model3.fit(X_train_us, y_train_us)

# Predictions for the X test
y_pred_us = model3.predict(X_test_us)

In [40]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test_us, y_pred_us))
print(classification_report(y_test_us, y_pred_us))

0.7965738758029979
              precision    recall  f1-score   support

           0       0.85      0.72      0.78       929
           1       0.76      0.88      0.81       939

    accuracy                           0.80      1868
   macro avg       0.80      0.80      0.80      1868
weighted avg       0.80      0.80      0.80      1868



## Using Synthetic Sampling

In [42]:
from imblearn.over_sampling import SMOTE

# defining the Model
sm = SMOTE(random_state=0)

# Training the Model
sm.fit(X, y)

# Making the Samples
X_resampled3, y_resampled3 = sm.fit_resample(X, y)

In [43]:
# lets check the Class Distribution
y_resampled3.value_counts()

1    50140
0    50140
Name: is_promoted, dtype: int64

In [44]:
# Splitting the dataset
X_train_sm, X_test_sm, y_train_sm, y_test_sm =train_test_split(X_resampled3,
                                                               y_resampled3,
                                                               test_size=0.2,
                                                               random_state=0)

# lets print the shapes
print("The Shape of X train :", X_train_sm.shape)
print("The Shape of X test :", X_test_sm.shape)
print("The Shape of Y train :", y_train_sm.shape)
print("The Shape of Y test :", y_test_sm.shape)

The Shape of X train : (80224, 13)
The Shape of X test : (20056, 13)
The Shape of Y train : (80224,)
The Shape of Y test : (20056,)


In [45]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model4 = RandomForestClassifier()

# Train the Model
model4.fit(X_train_sm, y_train_sm)

# Predictions for the X test
y_pred_sm = model4.predict(X_test_sm)

In [46]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test_sm, y_pred_sm))
print(classification_report(y_test_sm, y_pred_sm))

0.9482449142401277
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      9981
           1       0.97      0.93      0.95     10075

    accuracy                           0.95     20056
   macro avg       0.95      0.95      0.95     20056
weighted avg       0.95      0.95      0.95     20056



## Using Neighbors Based Sampling

In [47]:
from imblearn.under_sampling import EditedNearestNeighbours

# defining the Model
enn = EditedNearestNeighbours()

# Training the Model
enn.fit(X, y)

# Making the Samples
X_resampled4, y_resampled4 = enn.fit_resample(X, y)

In [48]:
# lets check the Class Distribution
y_resampled4.value_counts()

0    39220
1     4668
Name: is_promoted, dtype: int64

In [49]:
# Splitting the dataset
X_train_enn, X_test_enn, y_train_enn, y_test_enn =train_test_split(X_resampled4,
                                                               y_resampled4,
                                                               test_size=0.2,
                                                               random_state=0)

# lets print the shapes
print("The Shape of X train :", X_train_enn.shape)
print("The Shape of X test :", X_test_enn.shape)
print("The Shape of Y train :", y_train_enn.shape)
print("The Shape of Y test :", y_test_enn.shape)

The Shape of X train : (35110, 13)
The Shape of X test : (8778, 13)
The Shape of Y train : (35110,)
The Shape of Y test : (8778,)


In [50]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model5 = RandomForestClassifier()

# Train the Model
model5.fit(X_train_enn, y_train_enn)

# Predictions for the X test
y_pred_enn = model5.predict(X_test_enn)

In [51]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test_enn, y_pred_enn))
print(classification_report(y_test_enn, y_pred_enn))

0.9172932330827067
              precision    recall  f1-score   support

           0       0.92      0.99      0.96      7785
           1       0.88      0.31      0.46       993

    accuracy                           0.92      8778
   macro avg       0.90      0.65      0.71      8778
weighted avg       0.91      0.92      0.90      8778



## Combination of Over and UnderSampling

In [52]:
from imblearn.combine import SMOTEENN

# defining the Model
sn = SMOTEENN(random_state = 0)

# Training the Model
sn.fit(X, y)

# Making the Samples
X_resampled5, y_resampled5 = sn.fit_resample(X, y)

In [53]:
# lets check the Class Distribution
y_resampled5.value_counts()

1    42331
0    26832
Name: is_promoted, dtype: int64

In [54]:
# Splitting the dataset
X_train_sn, X_test_sn, y_train_sn, y_test_sn =train_test_split(X_resampled5,
                                                               y_resampled5,
                                                               test_size=0.2,
                                                               random_state=0)

# lets print the shapes
print("The Shape of X train :", X_train_sn.shape)
print("The Shape of X test :", X_test_sn.shape)
print("The Shape of Y train :", y_train_sn.shape)
print("The Shape of Y test :", y_test_sn.shape)

The Shape of X train : (55330, 13)
The Shape of X test : (13833, 13)
The Shape of Y train : (55330,)
The Shape of Y test : (13833,)


In [55]:
# Lets Build a Predictive Model using the Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

# define a Model
model6 = RandomForestClassifier()

# Train the Model
model6.fit(X_train_sn, y_train_sn)

# Predictions for the X test
y_pred_sn = model6.predict(X_test_sn)

In [56]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test_sn, y_pred_sn))
print(classification_report(y_test_sn, y_pred_sn))

0.940793754066363
              precision    recall  f1-score   support

           0       0.91      0.93      0.92      5203
           1       0.96      0.95      0.95      8630

    accuracy                           0.94     13833
   macro avg       0.94      0.94      0.94     13833
weighted avg       0.94      0.94      0.94     13833



## Using Ensemble Models

### Balanced Bagging Classifier

In [57]:
from imblearn.ensemble import BalancedBaggingClassifier

# defining a Model
model7 = BalancedBaggingClassifier(random_state = 0)

# Train the Model
model7.fit(X_train, y_train)

# Predictions for the X test
y_pred_bg = model7.predict(X_test)

In [58]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_pred_bg))
print(classification_report(y_test, y_pred_bg))

0.8362525086663017
              precision    recall  f1-score   support

           0       0.97      0.85      0.90     10041
           1       0.30      0.70      0.42       921

    accuracy                           0.84     10962
   macro avg       0.63      0.78      0.66     10962
weighted avg       0.91      0.84      0.86     10962



### Balanced Random Forest Classifier

In [59]:
from imblearn.ensemble import BalancedRandomForestClassifier

# defining a Model
model8 = BalancedRandomForestClassifier(random_state = 0)

# Train the Model
model8.fit(X_train, y_train)

# Predictions for the X test
y_pred_rf = model8.predict(X_test)

In [60]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

0.7298850574712644
              precision    recall  f1-score   support

           0       0.99      0.71      0.83     10041
           1       0.22      0.90      0.36       921

    accuracy                           0.73     10962
   macro avg       0.61      0.81      0.59     10962
weighted avg       0.92      0.73      0.79     10962



## Using XG Boost Classifier

In [63]:
from xgboost import XGBClassifier

# defining the Model
model9 = XGBClassifier(scale_pos_weight = 10)

# Train the Model
model9.fit(X_train, y_train)

# Predictions for the X test
y_pred_xgb = model9.predict(X_test)





In [64]:
# Accuracy score and classification report
from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

0.8375296478744755
              precision    recall  f1-score   support

           0       0.97      0.85      0.91     10041
           1       0.30      0.72      0.43       921

    accuracy                           0.84     10962
   macro avg       0.64      0.78      0.67     10962
weighted avg       0.91      0.84      0.87     10962



### Conclussion: 

* Without Resampling: Recall = 25%
* With Oversampling: Recall = 98%
* With undersampling: Recall = 71%

```We can see that Oversampling Performs better than Undersampling, as Undersampling leads to Huge Data Loss, making it Difficult for the Model to Learn the Rules Effectively.```