In [0]:
# importing all required libraries

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# create a SparkSession
emp_spark = SparkSession.builder.appName("HR Employee Attrition").getOrCreate()
print(emp_spark)

<pyspark.sql.session.SparkSession object at 0x7f09762a64f0>


In [0]:
# create a dataframe - /FileStore/tables/HR_Employee_Attrition.csv (file location)

emp_df = emp_spark.read.csv("/FileStore/tables/HR_Employee_Attrition.csv", inferSchema=True, header=True)
emp_df.show(5) # pls check dataset if your not able to see output of snippet
emp_df.printSchema()

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalanc

In [0]:
# let's drop some unwanted columns
emp_df_DC = emp_df.drop("BusinessTravel", "DailyRate", "DistanceFromHome", "JobInvolvement", "MonthlyRate", "RelationshipSatisfaction",
                        "StandardHours", "StockOptionLevel", "TrainingTimesLastYear", )

emp_df_DC.show(5) # pls check dataset if your not able to see output of snippet
emp_df_DC.printSchema()

+---+---------+--------------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------+--------------------+---------------+-------------+-------------+------------------+------+--------+-----------------+-----------------+-----------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|          Department|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|TotalWorkingYears|WorkLifeBalance|YearsAtCompany|YearsInCurrentRole|YearsSinceLastPromotion|YearsWithCurrManager|
+---+---------+--------------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------+--------------------+---------------+-------------+-------------+------------

In [0]:
# we are using SparkSQL here 
# so first of all we need create a temp view
emp_df_DC.createOrReplaceTempView("emp_data")

In [0]:
%sql select * from emp_data limit 10;

Age,Attrition,Department,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
41,Yes,Sales,2,Life Sciences,1,1,2,Female,94,2,Sales Executive,4,Single,5993,8,Y,Yes,11,3,8,1,6,4,0,5
49,No,Research & Development,1,Life Sciences,1,2,3,Male,61,2,Research Scientist,2,Married,5130,1,Y,No,23,4,10,3,10,7,1,7
37,Yes,Research & Development,2,Other,1,4,4,Male,92,1,Laboratory Technician,3,Single,2090,6,Y,Yes,15,3,7,3,0,0,0,0
33,No,Research & Development,4,Life Sciences,1,5,4,Female,56,1,Research Scientist,3,Married,2909,1,Y,Yes,11,3,8,3,8,7,3,0
27,No,Research & Development,1,Medical,1,7,1,Male,40,1,Laboratory Technician,2,Married,3468,9,Y,No,12,3,6,3,2,2,2,2
32,No,Research & Development,2,Life Sciences,1,8,4,Male,79,1,Laboratory Technician,4,Single,3068,0,Y,No,13,3,8,2,7,7,3,6
59,No,Research & Development,3,Medical,1,10,3,Female,81,1,Laboratory Technician,1,Married,2670,4,Y,Yes,20,4,12,2,1,0,0,0
30,No,Research & Development,1,Life Sciences,1,11,4,Male,67,1,Laboratory Technician,3,Divorced,2693,1,Y,No,22,4,1,3,1,0,0,0
38,No,Research & Development,3,Life Sciences,1,12,4,Male,44,3,Manufacturing Director,3,Single,9526,0,Y,No,21,4,10,3,9,7,1,8
36,No,Research & Development,3,Medical,1,13,3,Male,94,2,Healthcare Representative,3,Married,5237,6,Y,No,13,3,17,2,7,7,7,7


In [0]:
# total employee count in company
emp_spark.sql("""
                    select count(EmployeeCount) as emp_count from emp_data;
              
            """).show()

+---------+
|emp_count|
+---------+
|     1470|
+---------+



In [0]:
# check how many employee's are leave and how many still there
emp_spark.sql("""
                    select count(EmployeeCount) as emp_attrition_count, Attrition from emp_data
                    group by Attrition;
            """).show()

+-------------------+---------+
|emp_attrition_count|Attrition|
+-------------------+---------+
|               1233|       No|
|                237|      Yes|
+-------------------+---------+



In [0]:
# find out how many employee leave the org. by age count
emp_spark.sql("""
                   select min(Age), max(Age) from emp_data;
            """).show(5)

+--------+--------+
|min(Age)|max(Age)|
+--------+--------+
|      18|      60|
+--------+--------+



In [0]:
# find out high attrition by particular age group
emp_spark.sql("""
                    select sum(EmployeeCount) as emp_attrition_count, 
                        case
                            when age between 18 and 25 then "18-25"
                            when age between 25 and 32 then "25-30"
                            when age between 33 and 40 then "38-48"
                            else "40+"
                        end as age_groups
                    from emp_data
                    where Attrition == "Yes"
                    group by age_groups
                    order by age_groups asc;
            """).show(100)

# here we observe that most people leave the org. they are belog from age group 25-30 

+-------------------+----------+
|emp_attrition_count|age_groups|
+-------------------+----------+
|                 44|     18-25|
|                 85|     25-30|
|                 56|     38-48|
|                 52|       40+|
+-------------------+----------+



In [0]:
%sql    select sum(EmployeeCount) as emp_attrition_count, 
            case
                when age between 18 and 25 then "18-25"
                when age between 25 and 32 then "25-30"
                when age between 33 and 40 then "38-48"
                else "40+"
            end as age_groups
        from emp_data
                where Attrition == "Yes"
                group by age_groups
                order by age_groups asc;

emp_attrition_count,age_groups
44,18-25
85,25-30
56,38-48
52,40+


Databricks visualization. Run in Databricks to view.

In [0]:
# let's find attrition count by department wise
emp_spark.sql("""
                    select sum(EmployeeCount), Department from emp_data
                    where Attrition == "Yes"
                    group by Department
                    order by sum(EmployeeCount) desc;

            """).show(truncate=False)

# most people leave the org. belog from Research & Development department

+------------------+----------------------+
|sum(EmployeeCount)|Department            |
+------------------+----------------------+
|133               |Research & Development|
|92                |Sales                 |
|12                |Human Resources       |
+------------------+----------------------+



In [0]:
%sql                select sum(EmployeeCount), Department from emp_data
                    where Attrition == "Yes"
                    group by Department
                    order by sum(EmployeeCount) desc;

sum(EmployeeCount),Department
133,Research & Development
92,Sales
12,Human Resources


Databricks visualization. Run in Databricks to view.

In [0]:
# Attrition by Educational fields
emp_spark.sql("""
                select sum(EmployeeCount), EducationField from emp_data
                where Attrition == "Yes"
                group by EducationField
                order by sum(EmployeeCount) desc;
            """).show(truncate=False)

# most people leave the org.belog from "Life Sciences" EducationField

+------------------+----------------+
|sum(EmployeeCount)|EducationField  |
+------------------+----------------+
|89                |Life Sciences   |
|63                |Medical         |
|35                |Marketing       |
|32                |Technical Degree|
|11                |Other           |
|7                 |Human Resources |
+------------------+----------------+



In [0]:
%sql            select sum(EmployeeCount), EducationField from emp_data
                where Attrition == "Yes"
                group by EducationField
                order by sum(EmployeeCount) desc;

sum(EmployeeCount),EducationField
89,Life Sciences
63,Medical
35,Marketing
32,Technical Degree
11,Other
7,Human Resources


Databricks visualization. Run in Databricks to view.

In [0]:
# Attrition by Enviormental satisfaction (1-Low, 2-Medium, 3-High, 4-Satisfied)
emp_spark.sql("""
                    select sum(EmployeeCount),
                        case
                            when EnvironmentSatisfaction == 1 then "Low"
                            when EnvironmentSatisfaction == 2 then "Medium"
                            when EnvironmentSatisfaction == 3 then "High"
                            else "Satisfied"
                        end as Enviormental_Rating
                    from emp_data
                    where Attrition == "Yes"
                    group by EnvironmentSatisfaction
                    order by sum(EmployeeCount) desc;

            """).show()

# so there are 72 people who leave the org. they are not satisfied by org. services

+------------------+-------------------+
|sum(EmployeeCount)|Enviormental_Rating|
+------------------+-------------------+
|                72|                Low|
|                62|               High|
|                60|          Satisfied|
|                43|             Medium|
+------------------+-------------------+



In [0]:
%sql                select sum(EmployeeCount),
                        case
                            when EnvironmentSatisfaction == 1 then "Low"
                            when EnvironmentSatisfaction == 2 then "Medium"
                            when EnvironmentSatisfaction == 3 then "High"
                            else "Satisfied"
                        end as Enviormental_Rating
                    from emp_data
                    where Attrition == "Yes"
                    group by EnvironmentSatisfaction
                    order by sum(EmployeeCount) desc;

sum(EmployeeCount),Enviormental_Rating
72,Low
62,High
60,Satisfied
43,Medium


Databricks visualization. Run in Databricks to view.

In [0]:
# how many people leave the org. accordingly Job Title
emp_spark.sql("""
                    select sum(EmployeeCount), JobRole from emp_data
                    where Attrition == "Yes"
                    group by JobRole
                    sort by sum(EmployeeCount) desc;
              
            """).show(truncate=False)

# here we observe that "Laboratory Technician" and "Sales Executive" Job Role people leave the org.

+------------------+-------------------------+
|sum(EmployeeCount)|JobRole                  |
+------------------+-------------------------+
|62                |Laboratory Technician    |
|57                |Sales Executive          |
|47                |Research Scientist       |
|33                |Sales Representative     |
|12                |Human Resources          |
|10                |Manufacturing Director   |
|9                 |Healthcare Representative|
|5                 |Manager                  |
|2                 |Research Director        |
+------------------+-------------------------+



In [0]:
%sql                select sum(EmployeeCount), JobRole from emp_data
                    where Attrition == "Yes"
                    group by JobRole
                    sort by sum(EmployeeCount) desc;

sum(EmployeeCount),JobRole
62,Laboratory Technician
57,Sales Executive
47,Research Scientist
33,Sales Representative
12,Human Resources
10,Manufacturing Director
9,Healthcare Representative
5,Manager
2,Research Director


Databricks visualization. Run in Databricks to view.

The insights we get from this data are - Most People leave the org.
1. age group -> 25-30
2. department -> Research & Development department
3. EducationField -> "Life Sciences"
4. Job Role -> "Laboratory Technician" and "Sales Executive"
5. 72 people who leave the org. they are not satisfied by org. services