# __Step 1__ #

In [None]:
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Load the HR dataset from the CSV file in the working directory and preview the first rows.
df=pd.read_csv('HR_comma_sep.csv')
df.head()

In [None]:
# Summarise the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Print the frequency of every distinct value, one column at a time,
# to get a feel for each feature's cardinality and range.
for column_name,column_values in df.items():
    print('Column : ',column_name)
    print(column_values.value_counts())
    print('\n')

In [None]:
# Count missing values per column.
df.isnull().sum()

There are no null values in the dataset.

# __Step 2__

In [None]:
# Pairwise correlations between the numeric columns only
# (non-numeric columns are excluded before calling corr()).
numeric_features=df.select_dtypes(include='number')
corr_matrix=numeric_features.corr()
corr_matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the correlation matrix as an annotated heatmap (two-decimal labels, diverging palette).
heatmap_options=dict(annot=True,cmap='coolwarm',fmt='.2f')
sns.heatmap(corr_matrix,**heatmap_options)
plt.show()

## From the above heatmap we can draw the following conclusions: ##
1. Employees with a lower satisfaction level are more likely to have left the company.
2. Employees with more projects tend to have a higher last evaluation and more average monthly hours.
3. Employees with more average monthly hours tend to have a higher last evaluation.

In [None]:
# Distribution of employee satisfaction: histogram normalised to a density with a KDE overlay.
# histplot(kde=True, stat='density') is the supported replacement for the deprecated sns.distplot.
sns.histplot(df['satisfaction_level'],kde=True,stat='density')
plt.title('Employee Satisfaction Level')
plt.show()

In [None]:
# Distribution of the last evaluation score: histogram normalised to a density with a KDE overlay.
# histplot(kde=True, stat='density') is the supported replacement for the deprecated sns.distplot.
sns.histplot(df['last_evaluation'],kde=True,stat='density')
plt.title('Employee last evaluation')
plt.show()

In [None]:
# Distribution of average monthly hours: histogram normalised to a density with a KDE overlay.
# histplot(kde=True, stat='density') is the supported replacement for the deprecated sns.distplot.
# The column name 'average_montly_hours' is spelled as it appears in the dataset.
sns.histplot(df['average_montly_hours'],kde=True,stat='density')
plt.title('Employee average monthly hours')
plt.show()

In [None]:
# Number of employees per project count, split by whether they left (hue='left').
sns.countplot(x='number_project',data=df,hue='left')

# __Step 3__

In [None]:
# Satisfaction level against last evaluation for all employees, coloured by the 'left' flag.
sns.scatterplot(data=df,x='satisfaction_level',y='last_evaluation',hue='left')

In [None]:
# Restrict to employees who left (left == 1) and keep only the two features used for clustering.
has_left=df['left']==1
X=df.loc[has_left,['satisfaction_level','last_evaluation']]

# Raw scatter of the leavers, before any cluster assignment.
plt.scatter(x=X['satisfaction_level'],y=X['last_evaluation'])
plt.title('Scatter plot of employee based on satisfaction level and last evaluation')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last_evaluation')
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit a 3-cluster KMeans (k-means++ initialisation, fixed seed) on the two-feature frame X.
clustering_model=KMeans(n_clusters=3,init='k-means++',random_state=42)
clustering_model.fit(X)

In [None]:
# Cluster index for every row of X, in the same row order as X.
label_for_left=clustering_model.predict(X)

In [None]:
import numpy as np

In [None]:
# Plot each KMeans cluster in its own colour, then overlay the fitted cluster centres.
cluster_colours={0:'red',1:'green',2:'blue'}
for cluster_id,colour in cluster_colours.items():
    members=X[label_for_left==cluster_id]
    plt.scatter(x=members['satisfaction_level'],y=members['last_evaluation'],c=colour,label=f'Cluster {cluster_id}')

# Centre coordinates follow the feature order used at fit time:
# column 0 is satisfaction_level, column 1 is last_evaluation.
centres=clustering_model.cluster_centers_
plt.scatter(x=centres[:,0],y=centres[:,1],marker='o',c='yellow',label='Cluster Centres')

plt.title('Scatter plot of employee based on satisfaction level and last evaluation')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last_evaluation')
plt.legend()
plt.show()

# __Step 4__

In [None]:
# Columns treated as numeric features (note that the 'left' flag is included here).
numerical_columns=[
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'left',
    'promotion_last_5years',
]
# Columns that need one-hot encoding before modelling.
categorical_columns=[
    'sales',
    'salary',
]

In [None]:
# Numeric part of the dataset, selected by the numerical_columns list.
numerical_df=df[numerical_columns]

In [None]:
# Categorical part of the dataset, selected by the categorical_columns list.
categorical_df=df[categorical_columns]

In [None]:
# One-hot encode the categorical columns; drop_first=True drops one level per column
# so the dummy columns are not perfectly collinear.
transformed_categorical_df=pd.get_dummies(categorical_df,prefix=['sales','salary'],drop_first=True)

In [None]:
# Cast the dummy columns to int64 so every feature is stored as a number.
transformed_categorical_df=transformed_categorical_df.astype('int64')

In [None]:
# Join the numeric columns and the encoded categorical columns side by side (column-wise concat).
transformed_df=pd.concat([numerical_df,transformed_categorical_df],axis=1)

In [None]:
# Keep only the employees who left. .copy() makes this an independent frame, so adding a
# column to it afterwards does not raise SettingWithCopyWarning on a derived slice.
transformed_df=transformed_df[transformed_df['left']==1].copy()

In [None]:
# Attach the KMeans cluster label as the 'Left Types' column.
# The labels are assigned positionally, so this relies on label_for_left having been
# predicted on the same left == 1 rows, in the same order, as transformed_df holds here.
transformed_df['Left Types']=label_for_left

In [None]:
# Target: the cluster label of each leaver. Features: every other column of transformed_df.
# Note that X is rebound here; earlier it held the two-feature clustering frame.
y=transformed_df['Left Types']
X=transformed_df.drop(columns=['Left Types'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set, stratified on y so the class proportions are
# preserved in both splits; random_state fixes the shuffle.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=123)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# SMOTE oversampler. The seed is fixed so the synthetic samples (and every metric computed
# downstream of them) are reproducible across runs.
resample_model=SMOTE(random_state=123)

In [None]:
# Oversample the training split only; the test split is not passed to the resampler.
transformed_X_train,transformed_y_train=resample_model.fit_resample(X_train,y_train)

In [None]:
# Same cluster scatter as before, drawn from the SMOTE-resampled training data.
cluster_colours={0:'red',1:'green',2:'blue'}
for cluster_id,colour in cluster_colours.items():
    members=transformed_X_train[transformed_y_train==cluster_id]
    plt.scatter(x=members['satisfaction_level'],y=members['last_evaluation'],c=colour,label=f'Cluster {cluster_id}')

# Overlay the KMeans centres (column 0: satisfaction_level, column 1: last_evaluation).
centres=clustering_model.cluster_centers_
plt.scatter(x=centres[:,0],y=centres[:,1],marker='o',c='yellow',label='Cluster Centres')

plt.title('Scatter plot of employee based on satisfaction level and last evaluation')
plt.xlabel('Satisfaction Level')
plt.ylabel('Last_evaluation')
plt.legend()
plt.show()

In [None]:
# Class counts after resampling.
transformed_y_train.value_counts()

In [None]:
# Class counts in the original (un-resampled) training split, for comparison.
y_train.value_counts()

# __Step 5__

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Shuffled 5-fold splitter with a fixed seed, shared by the cross-validation runs.
kfold=KFold(n_splits=5,shuffle=True,random_state=123)
# One-vs-rest logistic regression on the liblinear solver. The one-vs-rest scheme is expressed
# with OneVsRestClassifier because LogisticRegression's multi_class argument is deprecated;
# fit / predict / predict_proba are still available on the wrapped estimator.
lr=OneVsRestClassifier(LogisticRegression(solver='liblinear'))

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validated accuracy of the logistic regression on the resampled training data.
# 'accuracy' is used because the target is a class label; a percentage-error regression
# metric is not meaningful here and blows up for the label 0.
# NOTE(review): the folds are drawn from already-oversampled data, so synthetic points can
# sit in a validation fold next to their source rows; treat these scores as optimistic.
accuracy=cross_val_score(estimator=lr,X=transformed_X_train,y=transformed_y_train,scoring='accuracy',cv=kfold,n_jobs=-1)

In [None]:
# Per-fold cross-validation scores for the logistic regression model.
accuracy

In [None]:
# Mean of the per-fold cross-validation scores.
accuracy.mean()

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Fit the logistic regression on the full resampled training set.
lr.fit(transformed_X_train,transformed_y_train)

In [None]:
# Predict on the held-out test split (which was not resampled).
y_pred=lr.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the most recent predictions stored in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest: 100 trees, depth capped at 5, fixed seed.
rf=RandomForestClassifier(n_estimators=100,max_depth=5,random_state=123,n_jobs=-1)
# Cross-validated accuracy on the resampled training data. 'accuracy' is used because the
# target is a class label; a percentage-error regression metric is not meaningful here.
rf_accuracy=cross_val_score(estimator=rf,X=transformed_X_train,y=transformed_y_train,scoring='accuracy',cv=kfold,n_jobs=-1)

In [None]:
# Per-fold cross-validation scores for the random forest.
print(rf_accuracy)

In [None]:
# Mean of the random forest's per-fold cross-validation scores.
print(rf_accuracy.mean())

In [None]:
# Fit the random forest on the full resampled training set.
rf.fit(transformed_X_train,transformed_y_train)

In [None]:
# Predict on the held-out test split; this overwrites the previous contents of y_pred.
y_pred=rf.predict(X_test)

In [None]:
# Per-class precision, recall and F1 for the most recent predictions stored in y_pred.
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting classifier with default hyper-parameters. The seed is fixed, matching
# the other seeded estimators in this notebook, so repeated runs give the same model.
gbc=GradientBoostingClassifier(random_state=123)

In [None]:
# Cross-validated accuracy of the gradient boosting model on the resampled training data.
# 'accuracy' is used because the target is a class label, and cv=kfold reuses the same
# shuffled 5-fold splitter as the other models so the scores are comparable.
gbc_accuracy=cross_val_score(estimator=gbc,X=transformed_X_train,y=transformed_y_train,scoring='accuracy',cv=kfold,n_jobs=-1)
print(gbc_accuracy)

In [None]:
# Mean of the gradient boosting model's per-fold cross-validation scores.
print(gbc_accuracy.mean())

In [None]:
# Fit the gradient boosting model on the full resampled training set.
gbc.fit(transformed_X_train,transformed_y_train)

In [None]:
# Predict on the held-out test split (overwriting y_pred) and report per-class metrics.
y_pred=gbc.predict(X_test)
print(classification_report(y_test,y_pred))

# __Step 6__

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

In [None]:
from sklearn.preprocessing import label_binarize

In [None]:
# Distinct labels present in the test target, in sorted order.
classes=np.unique(y_test)
# Indicator encoding of the test labels against that label set, used for the per-class ROC curves.
y_bin_test=label_binarize(y_test,classes=classes)
y_bin_test

In [None]:
# Binarise the logistic regression predictions against the same label set as y_bin_test.
# Using the test-set classes (rather than whichever labels happen to be predicted) keeps the
# column layout fixed even if a class is never predicted.
y_bin_pred_lr=label_binarize(lr.predict(X_test),classes=classes)
# Class-membership probabilities, used as the scores for the ROC curves.
proba=lr.predict_proba(X_test)
y_score_lr=proba
print(y_score_lr)

In [None]:
# One-vs-rest ROC curve and AUC per class for the logistic regression model.
# Column i of y_bin_test / y_score_lr corresponds to classes[i]; the legend shows the
# actual class label rather than the column position.
for i,class_label in enumerate(classes):
    fpr,tpr,threshold=roc_curve(y_bin_test[:,i],y_score_lr[:,i])
    auc_score=roc_auc_score(y_bin_test[:,i],y_score_lr[:,i])
    plt.plot(fpr,tpr,label=f'Class : {class_label}, auc : {auc_score:.2f}')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1],[0,1],linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic : ROC-Curve for Logistic Regression')
plt.legend()
plt.show()

In [None]:
# Binarise the random forest predictions against the same label set as y_bin_test.
# Using the test-set classes (rather than whichever labels happen to be predicted) keeps the
# column layout fixed even if a class is never predicted.
y_bin_pred_rf=label_binarize(rf.predict(X_test),classes=classes)
# Class-membership probabilities, used as the scores for the ROC curves.
proba=rf.predict_proba(X_test)
y_score_rf=proba
print(y_score_rf)

In [None]:
# One-vs-rest ROC curve and AUC per class for the random forest model.
# Column i of y_bin_test / y_score_rf corresponds to classes[i]; the legend shows the
# actual class label rather than the column position.
for i,class_label in enumerate(classes):
    fpr,tpr,threshold=roc_curve(y_bin_test[:,i],y_score_rf[:,i])
    auc_score=roc_auc_score(y_bin_test[:,i],y_score_rf[:,i])
    plt.plot(fpr,tpr,label=f'Class : {class_label}, auc : {auc_score:.2f}')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1],[0,1],linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic : ROC-Curve for Random Forest Classifier')
plt.legend()
plt.show()

In [None]:
# Binarise the gradient boosting predictions against the same label set as y_bin_test.
# Using the test-set classes (rather than whichever labels happen to be predicted) keeps the
# column layout fixed even if a class is never predicted.
y_bin_pred_gb=label_binarize(gbc.predict(X_test),classes=classes)
# Class-membership probabilities, used as the scores for the ROC curves.
proba=gbc.predict_proba(X_test)
y_score_gb=proba
print(y_score_gb)

In [None]:
# Show the binarised gradient boosting predictions.
y_bin_pred_gb

In [None]:
# One-vs-rest ROC curve and AUC per class for the gradient boosting model.
# Column i of y_bin_test / y_score_gb corresponds to classes[i]; the legend shows the
# actual class label rather than the column position.
for i,class_label in enumerate(classes):
    fpr,tpr,threshold=roc_curve(y_bin_test[:,i],y_score_gb[:,i])
    auc_score=roc_auc_score(y_bin_test[:,i],y_score_gb[:,i])
    plt.plot(fpr,tpr,label=f'Class : {class_label}, auc : {auc_score:.2f}')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0,1],[0,1],linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic : ROC-Curve for Gradient Boosting Classifier')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the logistic regression on the test split
# (rows: true class, columns: predicted class).
print('Logistic Regression Confusion Matrix : ')
lr_confusion_matrix=confusion_matrix(y_true=y_test,y_pred=lr.predict(X_test))
lr_confusion_matrix

In [None]:
# Confusion matrix of the random forest on the test split
# (rows: true class, columns: predicted class).
print('Random Forest Classifier Confusion Matrix : ')
rf_confusion_matrix=confusion_matrix(y_true=y_test,y_pred=rf.predict(X_test))
rf_confusion_matrix

In [None]:
# Confusion matrix of the gradient boosting model on the test split
# (rows: true class, columns: predicted class).
print('Gradient Boosting Classifier Confusion Matrix : ')
gb_confusion_matrix=confusion_matrix(y_true=y_test,y_pred=gbc.predict(X_test))
gb_confusion_matrix

Based on the confusion matrices, precision is the metric we should use to compare the models.

# __Step 7__

In [None]:
# Per-class probabilities from the gradient boosting model for every test row.
proba_gbc=gbc.predict_proba(X_test)

In [None]:
# Column position of the most probable class for each test row.
index=np.argmax(proba_gbc,axis=1)

In [None]:
# Probability of that most probable class (row-wise maximum); rebinds the name proba.
proba=proba_gbc.max(axis=1)

In [None]:
def _risk_zone(score):
    """Map a probability score to a zone name using the 0.2 / 0.6 / 0.9 upper bounds (inclusive)."""
    if score <= 0.2:
        return 'Safe Zone'
    if score <= 0.6:
        return 'Low-Risk Zone'
    if score <= 0.9:
        return 'Medium-Risk Zone'
    return 'High-Risk Zone'

# One zone name per entry of proba, in the same order.
# NOTE(review): proba appears to be the row-wise maximum over three class probabilities, which
# cannot be below 1/3, so 'Safe Zone' looks unreachable here - confirm the intended score.
zone_label=[_risk_zone(score) for score in proba]

In [None]:
# Empty frame that will collect the predicted class and zone for each test row.
proba_df=pd.DataFrame()

In [None]:
# Most probable class position per row, and the zone name derived from its probability.
proba_df['Class']=index
proba_df['Zone']=zone_label

In [None]:
# Number of test rows per zone, split by predicted class. The frame is passed as data=
# (as in the earlier plots) so the call does not depend on positional-argument support.
sns.countplot(data=proba_df,x='Zone',hue='Class')

In [None]:
# Number of test rows per zone. The frame is passed as data= (as in the earlier plots)
# so the call does not depend on positional-argument support.
sns.countplot(data=proba_df,x='Zone')

1. There is a high chance of an employee leaving the company.
2. So we should apply retention strategies to the High-Risk Zone in preference to the other zones.