In [1]:
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("HR_Analytics").getOrCreate()



In [2]:
# Load the three HR source files into Spark DataFrames.
# Both CSV files are read with a header row and inferred column types.
csv_read_opts = {"header": True, "inferSchema": True}
employees_df = spark.read.csv("employees.csv", **csv_read_opts)
attendance_df = spark.read.csv("attendance.csv", **csv_read_opts)
bonuses_df = spark.read.json("bonuses.json")

# For each DataFrame, print its schema followed by a preview of its rows.
for loaded_df in (employees_df, attendance_df, bonuses_df):
    loaded_df.printSchema()
    loaded_df.show()

# Number of distinct departments (left as the last expression so the
# notebook displays it as the cell result).
distinct_departments = employees_df.select("Department").distinct()
distinct_departments.count()

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-

3

In [3]:
# Add TenureYears column
# Spark's `round` is imported under an alias so that Python's built-in
# `round` is not shadowed for the rest of the notebook.
from pyspark.sql.functions import datediff, current_date, col
from pyspark.sql.functions import round as spark_round

# Unrounded tenure in years; a year is approximated as 365 days.
tenure_years_exact = datediff(current_date(), col("JoinDate")) / 365
employees_df = employees_df.withColumn(
    "TenureYears",
    spark_round(tenure_years_exact, 1)  # one decimal place, for display
)

# Calculate TotalCompensation (Salary + Bonus)
from pyspark.sql.functions import coalesce, lit
# The left join keeps every employee row; a missing Bonus after the join is
# treated as 0 so TotalCompensation is never null because of it.
employees_with_bonus = employees_df.join(bonuses_df, "EmpID", "left")
employees_with_bonus = employees_with_bonus.withColumn(
    "TotalCompensation",
    col("Salary") + coalesce(col("Bonus"), lit(0))
)

# Filter employees with >2 years tenure
# Compare the unrounded tenure: the displayed TenureYears is rounded to one
# decimal, so e.g. 2.04 years would round to 2.0 and be wrongly excluded.
employees_df.filter(tenure_years_exact > 2).show()

# Employees with managers
employees_df.filter(col("ManagerID").isNotNull()).show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|
+-----+------+-----------+----------+------+---------+-----------+

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       

In [5]:
# Average salary per department
from pyspark.sql.functions import avg  # aggregate function used below
avg_salary_by_dept = employees_df.groupBy("Department").agg(
    avg("Salary").alias("AvgSalary")
)
avg_salary_by_dept.show()

# Employees per manager (employees without a manager fall in the NULL group)
headcount_by_manager = employees_df.groupBy("ManagerID").count()
headcount_by_manager.show()

# Absences per employee: keep only "Absent" rows, then count them per EmpID
absent_rows = attendance_df.filter(col("Status") == "Absent")
absent_rows.groupBy("EmpID").count().show()

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|Engineering|  77500.0|
|         HR|  52500.0|
|  Marketing|  60000.0|
+-----------+---------+

+---------+-----+
|ManagerID|count|
+---------+-----+
|     NULL|    1|
|        1|    4|
+---------+-----+

+-----+-----+
|EmpID|count|
+-----+-----+
|    4|    2|
|    2|    1|
+-----+-----+



In [7]:
# Attendance percentage
# Spark's `sum` is imported under an alias so that Python's built-in `sum`
# is not shadowed for the rest of the notebook. `col` is re-imported here so
# the cell does not rely on an earlier cell's import.
from pyspark.sql.functions import col, count, when
from pyspark.sql.functions import sum as spark_sum

# One row per employee: number of attendance records, number of "Present"
# records, and the share of present days as a percentage.
attendance_summary = attendance_df.groupBy("EmpID").agg(
    count("*").alias("TotalDays"),
    # Each "Present" row contributes 1, every other status contributes 0.
    spark_sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
).withColumn("AttendancePercentage", (col("PresentDays") / col("TotalDays")) * 100)

# Left join so employees without any attendance rows are still listed
# (their attendance columns are null).
employee_attendance = employees_df.join(attendance_summary, "EmpID", "left")
employee_attendance.show()

# Top 3 by TotalCompensation
employees_with_bonus.orderBy(col("TotalCompensation").desc()).limit(3).show()

+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|TotalDays|PresentDays|AttendancePercentage|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|        2|          2|               100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|        2|          1|                50.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|        2|          2|               100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|        2|          0|                 0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|        2|          2|               100.0|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+--------------------+

+-----+--

In [8]:
# Create the `hr` database if needed and make it the current database.
for setup_statement in ("CREATE DATABASE IF NOT EXISTS hr", "USE hr"):
    spark.sql(setup_statement)

# Persist each DataFrame as a managed table, replacing any existing table.
tables_to_save = (
    (employees_df, "hr.employees"),
    (attendance_df, "hr.attendance"),
    (bonuses_df, "hr.bonuses"),
)
for table_df, table_name in tables_to_save:
    table_df.write.mode("overwrite").saveAsTable(table_name)

# Query the saved employees table with SQL: everyone earning above 50000.
high_earners = spark.sql("SELECT Department, Name, Salary FROM employees WHERE Salary > 50000")
high_earners.show()

+-----------+------+------+
| Department|  Name|Salary|
+-----------+------+------+
|         HR| Anita| 55000|
|Engineering|   Raj| 80000|
|Engineering|Simran| 75000|
|  Marketing| Aamir| 60000|
+-----------+------+------+



In [9]:
# UDF for Tech/Non-Tech classification
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def dept_classifier(dept):
    """Classify a department name as "Tech" or "Non-Tech".

    Only the exact string "Engineering" is classified as "Tech"; every other
    value (including None) is classified as "Non-Tech".
    """
    if dept == "Engineering":
        return "Tech"
    return "Non-Tech"

# Wrap the Python classifier as a Spark UDF that returns a string column,
# then show the employees with the derived DeptType column.
dept_udf = udf(dept_classifier, StringType())
employees_df.withColumn("DeptType", dept_udf(col("Department"))).show()

# Save as Parquet
# mode("overwrite") makes the cell re-runnable: the writer's default save
# mode raises an error when the output path already exists, so a second run
# of the notebook would fail here. This matches the table writes above.
(
    employee_attendance.write
    .mode("overwrite")
    .partitionBy("Department")
    .parquet("employee_attendance.parquet")
)

+-----+------+-----------+----------+------+---------+-----------+--------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|DeptType|
+-----+------+-----------+----------+------+---------+-----------+--------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|Non-Tech|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    Tech|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    Tech|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|Non-Tech|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|Non-Tech|
+-----+------+-----------+----------+------+---------+-----------+--------+



In [10]:
# Stop the Spark session created in the first cell. Cells that use `spark`
# cannot be re-run after this until the session is created again.
spark.stop()