### HR Data Analytics

In [134]:
#Importing relevant libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [135]:
#Loading the dataset from its CSV file and previewing the first three rows
DATA_FILE = "10000 HRA Records.csv"
df = pd.read_csv(DATA_FILE)
df.head(3)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,24,No,Travel_Rarely,1084,Support,31,5,Medical,1,1,3,Male,66,4,1,Developer,2,Single,48603,583236,2,Y,Yes,15,4,3,80,3,39,3,3,23,6,14,6
1,22,No,Travel_Frequently,537,Sales,47,5,Life Sciences,1,2,2,Female,32,4,2,Sales Representative,2,Single,19157,249041,2,Y,No,16,1,3,80,1,4,1,2,1,1,1,1
2,33,Yes,Travel_Rarely,418,Human Resources,45,5,Other,1,3,3,Male,45,1,4,Human Resources,3,Single,42054,1261620,6,Y,No,48,3,1,80,4,21,5,4,4,1,2,2


In [136]:
#Recoding the values of some columns for ease of analysis. The columns are: 1.Education,
#2.Environment Satisfaction, 3.Job Involvement, 4.Job Satisfaction,
#5.Performance Rating, 6.Relationship Satisfaction, 7.Work Life Balance.

#Label set shared by the satisfaction / involvement scales.
SATISFACTION_LABELS = {1:"Low", 2:"Medium", 3:"High", 4:"Very High"}

#Numeric code -> text label, for every column that gets recoded.
RECODE_MAPS = {
    "Education": {1:"Below College",2:"College",3:"Bachelor",4:"Master",5:"PhD"},
    "EnvironmentSatisfaction": SATISFACTION_LABELS,
    "JobInvolvement": SATISFACTION_LABELS,
    "JobSatisfaction": SATISFACTION_LABELS,
    "PerformanceRating": {1:"Low", 2:"Good", 3:"Excellent", 4:"Outstanding"},
    "RelationshipSatisfaction": SATISFACTION_LABELS,
    "WorkLifeBalance": {1:"Bad", 2:"Good", 3:"Better", 4:"Best"},
}

for column, labels in RECODE_MAPS.items():
    #Series.map turns every value that is not a key of the mapping into NaN, so
    #mapping a column that already holds the text labels would blank it out.
    #Recoding only while the column is still numeric makes this cell safe to
    #run more than once.
    if pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(labels)


In [137]:
#Dropping some columns:
#Rebinding df with errors = "ignore" (instead of inplace = True) keeps this cell
#re-runnable: a second run no longer raises KeyError for columns already removed.
df = df.drop(columns = ["EmployeeCount","Over18"], errors = "ignore")

In [133]:
#Examining the data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       10000 non-null  int64 
 1   Attrition                 10000 non-null  object
 2   BusinessTravel            10000 non-null  object
 3   DailyRate                 10000 non-null  int64 
 4   Department                10000 non-null  object
 5   DistanceFromHome          10000 non-null  int64 
 6   Education                 10000 non-null  object
 7   EducationField            10000 non-null  object
 8   EmployeeNumber            10000 non-null  int64 
 9   EnvironmentSatisfaction   10000 non-null  object
 10  Gender                    10000 non-null  object
 11  HourlyRate                10000 non-null  int64 
 12  JobInvolvement            10000 non-null  object
 13  JobLevel                  10000 non-null  int64 
 14  JobRole                

In [138]:
#Number of missing values per column (only the first five columns are shown)
null_counts = df.isna().sum()
null_counts.head(5)

Age               0
Attrition         0
BusinessTravel    0
DailyRate         0
Department        0
dtype: int64

In [139]:
#Finding the number of rows and columns in the DataFrame, returned as (rows, columns)
df.shape

(10000, 33)

In [18]:
#Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,EmployeeNumber,HourlyRate,JobLevel,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,39.0364,806.5456,25.3221,5000.5,114.7674,3.0068,25806.6773,400344.3,4.0038,24.6162,80.0,2.4977,20.5519,3.4888,10.7008,5.8152,5.8973,5.7857
std,12.489318,405.132923,14.454454,2886.89568,49.516602,1.402266,14424.452197,339076.6,2.591227,14.439804,0.0,1.11764,11.563212,1.707386,8.826226,5.907786,6.03072,5.915342
min,18.0,100.0,1.0,1.0,30.0,1.0,1001.0,1270.0,0.0,0.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,28.0,455.0,13.0,2500.75,72.0,2.0,13366.25,121683.2,2.0,12.0,80.0,2.0,11.0,2.0,3.0,1.0,2.0,1.0
50%,39.0,807.5,25.0,5000.5,115.0,3.0,25438.5,307298.0,4.0,25.0,80.0,2.0,21.0,4.0,8.0,4.0,4.0,4.0
75%,50.0,1160.0,38.0,7500.25,158.0,4.0,38385.0,598405.2,6.0,37.0,80.0,4.0,31.0,5.0,16.0,8.0,8.0,8.0
max,60.0,1500.0,50.0,10000.0,200.0,5.0,50996.0,1523280.0,8.0,49.0,80.0,4.0,40.0,6.0,40.0,39.0,38.0,38.0


In [140]:
# Function to group age into 4 age ranges:
def group_age(age):
    """Return the age-range label for a single age.

    Args:
        age: Age in years (int or float), between 18 and 60 inclusive.

    Returns:
        One of "18-29", "30-39", "40-49" or "50-60".

    Raises:
        ValueError: If ``age`` is below 18, above 60, or NaN. Such values used
            to be labelled "50-60" by a catch-all ``else`` branch.
    """
    # The negated chained comparison is also true for NaN, so missing ages
    # are rejected here instead of being given a label.
    if not 18 <= age <= 60:
        raise ValueError(f"age must be between 18 and 60, got {age!r}")
    # Exclusive upper bounds keep the brackets contiguous, so a fractional
    # age such as 29.5 stays in "18-29" rather than falling between ranges.
    if age < 30:
        return "18-29"
    if age < 40:
        return "30-39"
    if age < 50:
        return "40-49"
    return "50-60"

# Add an 'AgeGroup' column holding the age-range label of every row
df['AgeGroup'] = df['Age'].map(group_age)

In [141]:
# Function to group PercentSalaryHike into 3 groups:
def group_salhyke(salhike):
    """Return the salary-hike bracket label for a single percentage.

    Args:
        salhike: Percent salary hike (int or float), between 0 and 49 inclusive.

    Returns:
        One of "0-15", "16-35" or "36-49".

    Raises:
        ValueError: If ``salhike`` is negative, above 49, or NaN. Such values
            used to be labelled "36-49" by a catch-all ``else`` branch.
    """
    # The negated chained comparison is also true for NaN, so missing values
    # are rejected here instead of being given a label.
    if not 0 <= salhike <= 49:
        raise ValueError(f"salary hike must be between 0 and 49, got {salhike!r}")
    # Exclusive upper bounds keep the brackets contiguous, so a fractional
    # value such as 15.5 stays in "0-15" rather than falling between ranges.
    if salhike < 16:
        return "0-15"
    if salhike < 36:
        return "16-35"
    return "36-49"

# Add a '%%_SalaryHikeGroup' column holding the salary-hike bracket of every row
df['%%_SalaryHikeGroup'] = df['PercentSalaryHike'].map(group_salhyke)

In [142]:
#Renaming the MonthlyIncome column to Salary
df = df.rename(columns = {"MonthlyIncome":"Salary"})

#### Employee Demographics
#What is the demographic composition of the workforce (age, gender)?
#Are there any significant demographic trends or patterns?


In [144]:
#Number of rows in each age group, most common group first
age_group_counts = df["AgeGroup"].value_counts()
age_group_counts.sort_values(ascending = False)

18-29    2837
50-60    2594
40-49    2296
30-39    2273
Name: AgeGroup, dtype: int64

In [145]:
#Number of rows for each gender, most common first
gender_counts = df["Gender"].value_counts()
gender_counts.sort_values(ascending = False)

Male      5019
Female    4981
Name: Gender, dtype: int64

In [146]:
#Total salary per age group, largest total first
salary_by_age_group = df.pivot_table(index = "AgeGroup", values = "Salary", aggfunc = "sum")
salary_by_age_group.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
AgeGroup,Unnamed: 1_level_1
18-29,73637086
50-60,67491185
40-49,58846678
30-39,58091824


In [147]:
#Total salary per gender, largest total first
salary_by_gender = df.pivot_table(index = "Gender", values = "Salary", aggfunc = "sum")
salary_by_gender.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
Gender,Unnamed: 1_level_1
Male,129853638
Female,128213135


In [148]:
#Total salary for each gender (rows) broken down by age group (columns)
salary_by_gender_and_age_group = df.pivot_table(
    values = "Salary",
    index = "Gender",
    columns = ["AgeGroup"],
    aggfunc = "sum",
)
salary_by_gender_and_age_group

AgeGroup,18-29,30-39,40-49,50-60
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,36877508,28507817,29082667,33745143
Male,36759578,29584007,29764011,33746042


#### Employee Turnover
#Are certain departments experiencing higher turnover than others?

In [149]:
#Number of rows for each Attrition value
df["Attrition"].value_counts()

No     5127
Yes    4873
Name: Attrition, dtype: int64

In [150]:
# Count the rows for every (Department, Attrition) pair ...
department_attrition_sizes = df.groupby(['Department', 'Attrition']).size()

# ... then spread Attrition into columns, using 0 where a pair never occurs
attrition_count_by_department = department_attrition_sizes.unstack(fill_value=0)

# Display the result
print(attrition_count_by_department)

Attrition                No  Yes
Department                      
Hardware                898  794
Human Resources         832  789
Research & Development  852  829
Sales                   826  794
Software                858  807
Support                 861  860


In [151]:
#Mean salary for each gender (rows) by education level (columns)
mean_salary_by_gender_and_education = df.pivot_table(
    values = "Salary",
    index = "Gender",
    columns = ["Education"],
    aggfunc = "mean",
)
mean_salary_by_gender_and_education

Education,Bachelor,Below College,College,Master,PhD
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Female,25359.670224,25762.033948,25421.724708,26415.203,25727.645685
Male,25505.179688,25258.594412,26125.433735,26302.922613,26216.949275


In [152]:
#Mean salary for each gender (rows) by education field (columns)
mean_salary_by_gender_and_field = df.pivot_table(
    values = "Salary",
    index = "Gender",
    columns = ["EducationField"],
    aggfunc = "mean",
)
mean_salary_by_gender_and_field

EducationField,Human Resources,Life Sciences,Marketing,Medical,Other,Technical Degree
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,26242.409653,25742.504785,25988.707955,25464.86625,25629.00237,25357.564576
Male,25750.0625,25952.34012,26508.979689,25850.444075,25623.235152,25526.360864


In [153]:
#Mean salary for each department (rows) by education field (columns)
mean_salary_by_department_and_field = df.pivot_table(
    values = "Salary",
    index = "Department",
    columns = ["EducationField"],
    aggfunc = "mean",
)
mean_salary_by_department_and_field

EducationField,Human Resources,Life Sciences,Marketing,Medical,Other,Technical Degree
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Hardware,26394.841912,25654.173333,26203.399293,25794.864469,25465.448763,24813.487544
Human Resources,24777.035211,25395.638989,26500.531365,24622.413793,25916.302491,26389.979757
Research & Development,26485.520295,26734.644,26655.856115,25488.019608,26291.835088,26767.43299
Sales,26558.615079,24891.536398,25041.753205,26768.391791,25512.523256,25377.178439
Software,26534.081784,26035.255172,25900.645283,24956.010169,26211.204301,25200.988764
Support,25355.578767,26380.607509,27187.814935,26370.366667,24355.173145,23957.481633


#### Employee Performance / Training and Development Impact Analysis

In [154]:
#Total TrainingTimesLastYear per job-involvement level, largest total first
#NOTE(review): a total also reflects how many rows fall in each level;
#aggfunc = "mean" may be the fairer comparison - confirm the intent.
training_by_job_involvement = df.pivot_table(
    index = "JobInvolvement",
    values = "TrainingTimesLastYear",
    aggfunc = "sum",
)
training_by_job_involvement.sort_values(by = "TrainingTimesLastYear", ascending = False)

Unnamed: 0_level_0,TrainingTimesLastYear
JobInvolvement,Unnamed: 1_level_1
Very High,8839
Medium,8729
High,8696
Low,8624


In [155]:
#Total TrainingTimesLastYear per job-satisfaction level, largest total first
#NOTE(review): a total also reflects how many rows fall in each level;
#aggfunc = "mean" may be the fairer comparison - confirm the intent.
training_by_job_satisfaction = df.pivot_table(
    index = "JobSatisfaction",
    values = "TrainingTimesLastYear",
    aggfunc = "sum",
)
training_by_job_satisfaction.sort_values(by = "TrainingTimesLastYear", ascending = False)

Unnamed: 0_level_0,TrainingTimesLastYear
JobSatisfaction,Unnamed: 1_level_1
Medium,8964
Low,8720
Very High,8641
High,8563


In [156]:
#Total TrainingTimesLastYear per performance rating, largest total first
#NOTE(review): a total also reflects how many rows fall in each rating;
#aggfunc = "mean" may be the fairer comparison - confirm the intent.
training_by_performance_rating = df.pivot_table(
    index = "PerformanceRating",
    values = "TrainingTimesLastYear",
    aggfunc = "sum",
)
training_by_performance_rating.sort_values(by = "TrainingTimesLastYear", ascending = False)

Unnamed: 0_level_0,TrainingTimesLastYear
PerformanceRating,Unnamed: 1_level_1
Low,9058
Outstanding,8830
Excellent,8697
Good,8303


#### Compensation Analysis:
#Is there pay equity within the organization across genders?
#How does compensation correlate with employee performance and satisfaction?


In [157]:
#Is there pay equity within the organization across genders?
#Shown here as the total salary per gender, largest total first.
#NOTE(review): a total depends on the headcount of each gender as well as on
#pay levels; a mean or median per gender may answer the equity question better.
total_salary_by_gender = df.pivot_table(index = "Gender", values = "Salary", aggfunc = "sum")
total_salary_by_gender.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
Gender,Unnamed: 1_level_1
Male,129853638
Female,128213135


In [158]:
#How does compensation correlate with employee performance and satisfaction?
#Mean salary for each performance rating, highest mean first.
mean_salary_by_performance_rating = df.pivot_table(
    index = "PerformanceRating",
    values = "Salary",
    aggfunc = "mean",
)
mean_salary_by_performance_rating.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
PerformanceRating,Unnamed: 1_level_1
Good,26244.903894
Outstanding,25746.858893
Excellent,25697.052295
Low,25558.959624


In [159]:
#Number of rows in each BusinessTravel category
df["BusinessTravel"].value_counts()

Travel_Rarely        3370
Non-Travel           3343
Travel_Frequently    3287
Name: BusinessTravel, dtype: int64

In [160]:
#Total DailyRate per department, largest total first
total_daily_rate_by_department = df.pivot_table(
    index = "Department",
    values = "DailyRate",
    aggfunc = "sum",
)
total_daily_rate_by_department.sort_values(by = "DailyRate", ascending = False)

Unnamed: 0_level_0,DailyRate
Department,Unnamed: 1_level_1
Support,1369321
Hardware,1359119
Research & Development,1349440
Software,1345069
Sales,1322229
Human Resources,1320278


In [161]:
#Mean DailyRate per department, highest mean first
mean_daily_rate_by_department = df.pivot_table(
    index = "Department",
    values = "DailyRate",
    aggfunc = "mean",
)
mean_daily_rate_by_department.sort_values(by = "DailyRate", ascending = False)

Unnamed: 0_level_0,DailyRate
Department,Unnamed: 1_level_1
Sales,816.190741
Human Resources,814.483652
Software,807.849249
Hardware,803.26182
Research & Development,802.760262
Support,795.654271


In [177]:
#Number of rows for each EnvironmentSatisfaction level
df["EnvironmentSatisfaction"].value_counts()

Low          2520
Medium       2512
High         2494
Very High    2474
Name: EnvironmentSatisfaction, dtype: int64

In [163]:
#Mean salary for each job role, highest mean first
mean_salary_by_job_role = df.pivot_table(index = "JobRole", values = "Salary", aggfunc = "mean")
mean_salary_by_job_role.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
JobRole,Unnamed: 1_level_1
Human Resources,26258.753535
Research Scientist,26180.554852
Manufacturing Director,26170.268173
Sales Representative,26021.834532
Laboratory Technician,25913.159478
Manager,25800.716528
Healthcare Representative,25632.864542
Developer,25439.436735
Sales Executive,25433.596562
Research Director,25239.092338


In [164]:
#Number of rows for each NumCompaniesWorked value, most common first
num_companies_counts = df["NumCompaniesWorked"].value_counts()
num_companies_counts.sort_values(ascending = False)

2    1145
7    1132
8    1132
1    1118
3    1111
6    1108
4    1101
0    1100
5    1053
Name: NumCompaniesWorked, dtype: int64

In [165]:
#Number of rows for each RelationshipSatisfaction level, most common first
relationship_satisfaction_counts = df["RelationshipSatisfaction"].value_counts()
relationship_satisfaction_counts.sort_values(ascending = False)

Medium       2553
Low          2500
High         2486
Very High    2461
Name: RelationshipSatisfaction, dtype: int64

In [166]:
#Number of rows for each WorkLifeBalance level, most common first
work_life_balance_counts = df["WorkLifeBalance"].value_counts()
work_life_balance_counts.sort_values(ascending = False)

Good      2539
Bad       2525
Best      2475
Better    2461
Name: WorkLifeBalance, dtype: int64

In [167]:
#The ten most common YearsAtCompany values and how many rows have each
years_at_company_counts = df["YearsAtCompany"].value_counts().sort_values(ascending = False)
years_at_company_counts.head(10)

1     1061
2      798
3      740
4      612
5      535
6      485
7      461
8      415
9      388
10     377
Name: YearsAtCompany, dtype: int64

In [168]:
#Number of rows in each salary-hike bracket, most common first
salary_hike_group_counts = df["%%_SalaryHikeGroup"].value_counts()
salary_hike_group_counts.sort_values(ascending = False)

16-35    3974
0-15     3197
36-49    2829
Name: %%_SalaryHikeGroup, dtype: int64

In [169]:
#The ten most common YearsSinceLastPromotion values and how many rows have each
years_since_promotion_counts = df["YearsSinceLastPromotion"].value_counts().sort_values(ascending = False)
years_since_promotion_counts.head(10)

1     2499
2     1399
3      997
4      806
5      619
6      496
7      441
8      362
9      342
10     264
Name: YearsSinceLastPromotion, dtype: int64

In [170]:
#Mean YearsSinceLastPromotion for each job-satisfaction level, highest mean first
promotion_gap_by_job_satisfaction = df.pivot_table(
    index = "JobSatisfaction",
    values = "YearsSinceLastPromotion",
    aggfunc = "mean",
)
promotion_gap_by_job_satisfaction.sort_values(by = "YearsSinceLastPromotion", ascending = False)

Unnamed: 0_level_0,YearsSinceLastPromotion
JobSatisfaction,Unnamed: 1_level_1
Low,5.953982
Very High,5.934748
Medium,5.853354
High,5.847068


In [171]:
#Mean YearsSinceLastPromotion for each performance rating, highest mean first
promotion_gap_by_performance_rating = df.pivot_table(
    index = "PerformanceRating",
    values = "YearsSinceLastPromotion",
    aggfunc = "mean",
)
promotion_gap_by_performance_rating.sort_values(by = "YearsSinceLastPromotion", ascending = False)

Unnamed: 0_level_0,YearsSinceLastPromotion
PerformanceRating,Unnamed: 1_level_1
Excellent,5.968463
Low,5.967856
Good,5.891881
Outstanding,5.76087


In [172]:
#Mean salary for each OverTime value, highest mean first
mean_salary_by_overtime = df.pivot_table(index = "OverTime", values = "Salary", aggfunc = "mean")
mean_salary_by_overtime.sort_values(by = "Salary", ascending = False)

Unnamed: 0_level_0,Salary
OverTime,Unnamed: 1_level_1
Yes,25821.27508
No,25792.172648


In [173]:
#Mean salary for each OverTime value (rows) by relationship-satisfaction level (columns)
mean_salary_by_overtime_and_relationship = df.pivot_table(
    values = "Salary",
    index = "OverTime",
    columns = ["RelationshipSatisfaction"],
    aggfunc = "mean",
)
mean_salary_by_overtime_and_relationship

RelationshipSatisfaction,High,Low,Medium,Very High
OverTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,25320.159843,25960.560065,26000.305901,25893.255302
Yes,25765.730263,26097.850946,25741.901976,25673.300405


In [174]:
#Mean DistanceFromHome for each job-satisfaction level, highest mean first
distance_by_job_satisfaction = df.pivot_table(
    index = "JobSatisfaction",
    values = "DistanceFromHome",
    aggfunc = "mean",
)
distance_by_job_satisfaction.sort_values(by = "DistanceFromHome", ascending = False)

Unnamed: 0_level_0,DistanceFromHome
JobSatisfaction,Unnamed: 1_level_1
Low,25.883153
High,25.319803
Medium,25.123635
Very High,24.966773


In [175]:
#Total salary for each age group (rows) by job-satisfaction level (columns)
salary_by_age_group_and_job_satisfaction = df.pivot_table(
    values = "Salary",
    index = "AgeGroup",
    columns = ["JobSatisfaction"],
    aggfunc = "sum",
)
salary_by_age_group_and_job_satisfaction

JobSatisfaction,High,Low,Medium,Very High
AgeGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18-29,20031598,17730763,17584659,18290066
30-39,13248969,15867019,15141464,13834372
40-49,14525341,13784742,15533947,15002648
50-60,15677476,16627948,17431593,17754168


In [179]:
#Exporting the file as a Comma Separated File:
#index = False keeps the 0..n-1 row index out of the file; otherwise it is
#written as an extra unnamed first column that carries no information.
df.to_csv("hrdata.csv", index = False)