In [None]:
# Import Dependencies
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel

## Promotion Dataset

The dataset provides multiple attributes describing each employee's past and current performance, along with demographic information.

Features:

- employee_id: Unique ID for employee
- department: Department of employee
- region: Region of employment (unordered)
- education: Education Level
- gender: Gender of Employee
- recruitment_channel: Channel of recruitment for employee
- no_of_trainings: Number of other trainings completed in the previous year (soft skills, technical skills, etc.)
- age: Age of Employee
- previous_year_rating: Employee rating for the previous year
- length_of_service: Length of service in years
- awards_won?: 1 if the employee won awards during the previous year, otherwise 0
- avg_training_score: Average score in current training evaluations
- is_promoted: (Target) Recommended for promotion

In [None]:
# Load the promotions CSV into a pandas DataFrame named `promotions_df`
# (the path is resolved relative to the notebook's working directory),
# then preview the first rows.
csv_path = "../Resources/promotions.csv"
promotions_df = pd.read_csv(csv_path)
promotions_df.head()

In [None]:
# Split the frame into the target `y` (`is_promoted`) and the predictors `X`
# (every remaining column).
# NOTE(review): the feature list describes `employee_id` as a unique ID; if that
# column is present in the CSV it stays in `X` here -- confirm it is meant to be
# used as a predictor.
y = promotions_df['is_promoted']
X = promotions_df.drop(columns='is_promoted')

In [None]:
# Dummy encode the categorical columns of `X` into numerical indicator features.
# The pandas default prefix is used, so each indicator column is named after the
# column it came from (`<column>_<value>`). A single shared prefix such as "Label"
# would hide the source column in the feature-importance chart and could yield
# duplicate column names when two categorical columns share a value.
X = pd.get_dummies(X)

In [None]:
# Hold out part of the data for testing. No split size is passed, so
# `train_test_split` uses its default proportions; the fixed `random_state`
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Build a StandardScaler and fit it on the training split only, so the test
# split does not influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train);  # trailing `;` keeps the cell output empty, as before

In [None]:
# Apply the fitted scaler to both splits (same transformation for train and test)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

## Use Random Forest to predict promotions

In [None]:
# Train a 500-tree Random Forest on the scaled training data, predict on the
# scaled test data, then print the classification report followed by the
# training and testing scores.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)

print(classification_report(y_test, y_pred))
print(f'Training Score: {rf_train_score}')
print(f'Testing Score: {rf_test_score}')

In [None]:
# Read the per-feature importance scores from the fitted random forest (`clf`).
# `features` holds one score per column of the training matrix and is reused
# by the bar chart in the next cell.

features = clf.feature_importances_
print(features)

In [None]:
# Bar chart of the importance score for each encoded feature column
px.bar(
    x=X.columns,
    y=features,
    labels={"x": "Features", "y": "Score Features"},
)

In [None]:
# Perform feature selection with scikit-learn's SelectFromModel, wrapping the
# random forest `clf` and fitting the selector on the scaled training data.
sel = SelectFromModel(estimator=clf)
sel.fit(X_train_scaled, y_train)

In [None]:
# Use the selector's `.transform()` function to reduce the scaled `X_train` and
# `X_test` data to only the features chosen in the previous step.
# The selector (not the scaler) must be applied here: calling `scaler.transform`
# again would simply reproduce the full scaled matrices and drop no features.
# `sel` was fitted on the scaled training data, so it is given scaled inputs.
X_train_new_scaled = sel.transform(X_train_scaled)
X_test_new_scaled = sel.transform(X_test_scaled)

# Wrap the reduced training matrix in a DataFrame for a quick preview.
# Columns are left as positional integers, matching the plain array used for the
# test split when the model is scored.
selFeatures = pd.DataFrame(X_train_new_scaled)
selFeatures.head()

In [None]:
# Refit a 500-tree Random Forest on `selFeatures` and print the classification
# report followed by the training and testing scores.
# Note: this rebinds `clf`, replacing the forest fitted earlier in the notebook.
clf = RandomForestClassifier(n_estimators=500, random_state=1)
clf.fit(selFeatures, y_train)
y_pred = clf.predict(X_test_new_scaled)

selected_train_score = clf.score(selFeatures, y_train)
selected_test_score = clf.score(X_test_new_scaled, y_test)

print(classification_report(y_test, y_pred))
print(f'Training Score: {selected_train_score}')
print(f'Testing Score: {selected_test_score}')

## Use K Nearest Neighbors to predict promotions

In [None]:
# Sweep K Nearest Neighbors over the odd k values 1, 3, ..., 19 and record the
# train/test accuracy for each, to see which k scores best.
# Only odd k values are tried so that a neighbor vote cannot end in a tie.
train_scores, test_scores = [], []
for k in range(1, 20, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    train_score, test_score = (
        knn.score(X_train_scaled, y_train),
        knn.score(X_test_scaled, y_test),
    )
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

In [None]:
# Plot the train and test accuracy recorded for each k in the previous loop.
# Both curves are labelled and a legend is drawn so they can be told apart;
# the y-axis is labelled generically because it carries both training and
# testing accuracy, not testing accuracy alone.
k_values = range(1, 20, 2)  # must match the k values used in the sweep above
plt.plot(k_values, train_scores, marker='o', label="Training accuracy")
plt.plot(k_values, test_scores, marker="x", label="Testing accuracy")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# Refit with the chosen `k` (9), picked from the sweep as the point where the
# classifier's accuracy starts to stabilize, and report its test accuracy.
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
knn_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=9 Test Acc: %.3f' % knn_test_accuracy)