<a href="https://colab.research.google.com/github/Namithadaparthi/edunet_project/blob/main/Employees_Burnout_Analysis_%26_Prediction_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset is read from this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Loading Dataset

In [None]:
# Show every column when a frame is displayed, then read the burnout CSV from Drive.
pd.set_option('display.max_columns', None)
DATASET_PATH = '/content/drive/MyDrive/employee_burnout_analysis1.CSV'
burnoutDf = pd.read_csv(DATASET_PATH)
burnoutDf

In [None]:
# Convert the "Date of Joining" column to pandas datetime values.
joining_dates = pd.to_datetime(burnoutDf["Date of Joining"])
burnoutDf["Date of Joining"] = joining_dates

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
burnoutDf.shape

In [None]:
# Per-column overview: dtype and non-null count, plus overall memory usage
burnoutDf.info()

In [None]:
# Preview the first five rows (head() defaults to n=5)
burnoutDf.head()

In [None]:
# List the column labels of the dataset
burnoutDf.columns

In [None]:
# Count the missing (NaN) values in each column
burnoutDf.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
burnoutDf.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# NOTE(review): by default describe() summarises only some dtypes (numeric, and
# possibly datetime depending on the pandas version), not every attribute.
burnoutDf.describe()

In [None]:
# For every column, print its distinct values followed by how often each one occurs.
for column_name in burnoutDf.columns:
  column = burnoutDf[column_name]
  print(f"\n\n{column.unique()}")
  print(f"\n{column.value_counts()}\n\n")

In [None]:
# Remove the identifier column; it is not used in the analysis below.
burnoutDf = burnoutDf.drop(columns=['Employee ID'])

In [None]:
# Check the skewness of the numeric attributes.
# The builtin int/float replace the np.int/np.float aliases, which were deprecated
# in NumPy 1.20 and removed in NumPy 1.24 (using them raises AttributeError there).
intFloatburnoutDf=burnoutDf.select_dtypes([int, float])
# Skew values strictly between -SKEW_THRESHOLD and +SKEW_THRESHOLD are reported
# as the "Normally Distributed" case.
SKEW_THRESHOLD = 0.1
for col in intFloatburnoutDf.columns:
  # Compute the skewness once and reuse it for both the comparison and the message.
  skew_value = intFloatburnoutDf[col].skew()
  if skew_value >= SKEW_THRESHOLD:
    print("\n",col, "feature is Positively skewd and value is: ", skew_value)
  elif skew_value <= -SKEW_THRESHOLD:
    # The original compared against +0.1 here, which made the final branch
    # unreachable and labelled near-symmetric columns as negatively skewed.
    print("\n",col, "feature is Negatively skewd and value is: ", skew_value)
  else:
    print("\n",col, "feature is Normally Distributed and value is: ", skew_value)

In [None]:
# Replace the null values of each listed column with that column's mean.
# The result is assigned back to the frame instead of calling
# `burnoutDf[col].fillna(..., inplace=True)`: that form mutates an intermediate
# Series and does not update the frame when pandas Copy-on-Write is active.
# NOTE(review): 'Burn Rate' is later used as the prediction target, so imputing
# it with the mean fabricates labels; consider dropping those rows instead.
for col in ['Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']:
  burnoutDf[col] = burnoutDf[col].fillna(burnoutDf[col].mean())

In [None]:
# Re-count the missing values per column to confirm the imputation above left none
# in the filled columns
burnoutDf.isna().sum()

In [None]:
# Show the pairwise correlation of the numeric columns.
# Non-numeric columns (e.g. 'Gender', 'Company Type') are excluded explicitly:
# since pandas 2.0 DataFrame.corr() no longer drops them silently and raises instead.
burnoutDf.select_dtypes(include='number').corr()

# Data Visualization

In [None]:
# Plot a heat map of the correlation matrix of the numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on string columns since pandas 2.0.
Corr=burnoutDf.select_dtypes(include='number').corr()
# Applies the seaborn theme / default figure size to later matplotlib plots;
# it has no effect on the plotly figure created below.
sns.set(rc={'figure.figsize':(14,12)})
fig=px.imshow(Corr, text_auto=True, aspect="auto")
fig.show()

In [None]:
# Bar chart of how many rows fall into each "Gender" category.
plt.figure(figsize=(10, 8))
gender_ax = sns.countplot(data=burnoutDf, x="Gender", palette="magma")
gender_ax.set_title("Plot Distribution of Gender")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "Company Type" category.
plt.figure(figsize=(10, 8))
company_ax = sns.countplot(data=burnoutDf, x="Company Type", palette="Spectral")
company_ax.set_title("Plot Distribution of Company Type")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "WFH Setup Available" category.
plt.figure(figsize=(10, 8))
wfh_ax = sns.countplot(data=burnoutDf, x="WFH Setup Available", palette="dark:salmon_r")
wfh_ax.set_title("Plot Distribution of WFH Setup Available")
plt.show()

In [None]:
# Histogram of every int/float attribute located between the
# "Date of Joining" and "Burn Rate" columns (inclusive label slice).
burn_st = burnoutDf.loc[:, 'Date of Joining':'Burn Rate'].select_dtypes([int, float])
for numeric_col in burn_st.columns:
  fig = px.histogram(
      burn_st,
      x=numeric_col,
      title="plot Distribution of "+numeric_col,
      color_discrete_sequence=['indianred'],
  )
  fig.update_layout(bargap=0.2)
  fig.show()

In [None]:
# Line plot of "Burn Rate" (y) against the row index, one coloured trace per "Designation".
fig = px.line(
    burnoutDf,
    y="Burn Rate",
    color="Designation",
    title="Burn Rate on the basis of Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)  # update_layout returns the same figure object
fig.show()

In [None]:
# Line plot of "Burn Rate" (y) against the row index, one coloured trace per "Gender".
fig = px.line(
    burnoutDf,
    y="Burn Rate",
    color="Gender",
    title="Burn Rate on the basis of Gender",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)  # update_layout returns the same figure object
fig.show()

In [None]:
# Line plot of "Mental Fatigue Score" (y) against the row index, one coloured trace per "Designation".
fig = px.line(
    burnoutDf,
    y="Mental Fatigue Score",
    color="Designation",
    title="Mental Fatigue Score vs Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)  # update_layout returns the same figure object
fig.show()

In [None]:
# Scatter of "Designation" vs "Mental Fatigue Score", one panel per company type;
# marker size encodes burn rate and marker style encodes gender.
sns.relplot(
    data=burnoutDf,
    x="Designation",
    y="Mental Fatigue Score",
    col="Company Type",
    hue="Company Type",
    size="Burn Rate",
    style="Gender",
    palette=["g", "r"],
    sizes=(50, 200),
)

# Label Encoding

In [None]:
# Create one LabelEncoder instance; the next cell re-fits it on each categorical
# column in turn to produce integer codes.
from sklearn import preprocessing
Label_encode=preprocessing.LabelEncoder()

In [None]:
# Add an integer-coded companion column for each categorical column.
# The shared encoder is re-fitted per column, so afterwards it only remembers
# the classes of the last column processed.
encoded_from = {
    'GenderLabel': 'Gender',
    'Company_TypeLabel': 'Company Type',
    'WFH_Setup_AvailableLabel': 'WFH Setup Available',
}
for label_col, source_col in encoded_from.items():
  burnoutDf[label_col] = Label_encode.fit_transform(burnoutDf[source_col].values)

In [None]:
# Show which integer code was assigned to each "Gender" value.
gn = burnoutDf.groupby('Gender')['GenderLabel']
gn.first()

In [None]:
# Show which integer code was assigned to each "Company Type" value.
ct = burnoutDf.groupby('Company Type')['Company_TypeLabel']
ct.first()

In [None]:
# Show which integer code was assigned to each "WFH Setup Available" value.
wsa = burnoutDf.groupby('WFH Setup Available')['WFH_Setup_AvailableLabel']
wsa.first()

In [None]:
# Show the last 10 rows, including the newly added *Label columns
burnoutDf.tail(10)

# Feature Selection

In [None]:
# Feature matrix: numeric inputs plus the label-encoded categoricals; target: burn rate.
Columns = [
    'Designation',
    'Resource Allocation',
    'Mental Fatigue Score',
    'GenderLabel',
    'Company_TypeLabel',
    'WFH_Setup_AvailableLabel',
]
x = burnoutDf.loc[:, Columns]
y = burnoutDf.loc[:, 'Burn Rate']

In [None]:
# Plain-text dump of the feature matrix selected above
print(x)

In [None]:
# Plain-text dump of the target series ('Burn Rate')
print(y)

# Implementing PCA

In [None]:
# Principal component analysis
from sklearn.decomposition import PCA

# A float n_components asks PCA to keep the smallest number of components whose
# cumulative explained variance reaches that fraction (here 95%).
pca=PCA(0.95)
# NOTE(review): PCA is fitted on the full feature matrix before the train/test
# split performed later, so the test rows influence the projection (data leakage).
# NOTE(review): the features are not standardised first; columns with a larger
# numeric range will presumably dominate the components - confirm this is intended.
x_pca=pca.fit_transform(x)

# explained_variance_ratio_ holds the variance share of each retained component
# (not of the original input features).
print("PCA shape of x is:",x_pca.shape,"and original shape is:",x.shape)
print("% of importance of selected features is:",pca.explained_variance_ratio_)
print("The number of features selected through PCA is:",pca.n_components_)

# Data Splitting

In [None]:
# Hold out 25% of the PCA-transformed rows for testing; the seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
x_train_pca, x_test, y_train, y_test = train_test_split(
    x_pca,
    y,
    test_size=0.25,
    random_state=10,
)

In [None]:
# Print the shapes of the four split parts on one line: train X, test X, train y, test y.
print(*(part.shape for part in (x_train_pca, x_test, y_train, y_test)))

# Model Implementation
Random Forest Regressor

In [None]:
from sklearn.metrics import r2_score

In [49]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the reported scores, reproducible across runs. The value matches
# the seed used for the train/test split.
rf_model=RandomForestRegressor(random_state=10)
rf_model.fit(x_train_pca,y_train)

# Score the fitted model on the data it was trained on and on the held-out data.
train_pred_rf=rf_model.predict(x_train_pca)
train_r2=r2_score(y_train, train_pred_rf)
test_pred_rf=rf_model.predict(x_test)
test_r2=r2_score(y_test,test_pred_rf)
# The printed "Accuracy score" is the R^2 coefficient of determination expressed
# as a percentage; this is a regression task, so it is not a classification accuracy.
print("Accuracy score of train data:"+str(round(100*train_r2,4))+"%")
print("Accuracy score of test data:"+str(round(100*test_r2,4))+"%")

Accuracy score of train data:91.1904%
Accuracy score of test data:83.8905%



# AdaBoost Regressor

In [50]:
#AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor

# A fixed random_state makes the boosting run, and therefore the reported
# scores, reproducible across runs. The value matches the seed used for the
# train/test split.
abr_model=AdaBoostRegressor(random_state=10)
abr_model.fit(x_train_pca,y_train)

# Score the fitted model on the data it was trained on and on the held-out data.
train_pred_adboost=abr_model.predict(x_train_pca)
train_r2=r2_score(y_train, train_pred_adboost)
test_pred_adaboost=abr_model.predict(x_test)
test_r2=r2_score(y_test,test_pred_adaboost)

# The printed "Accuracy score" is the R^2 coefficient of determination expressed
# as a percentage; this is a regression task, so it is not a classification accuracy.
print("Accuracy score of train data:"+str(round(100*train_r2,4))+"%")
print("Accuracy score of test data:"+str(round(100*test_r2,4))+"%")

Accuracy score of train data:78.6014%
Accuracy score of test data:78.0643%
