In [None]:
1. Ingestion & Exploration

Read all 3 files (CSV + JSON) using PySpark.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HRAnalyticsIngestion") \
    .getOrCreate()

employees_path = "/content/drive/MyDrive/employees.csv"
attendance_path = "/content/drive/MyDrive/attendance.csv"
bonuses_path = "/content/drive/MyDrive/bonuses.json"

employees_df = spark.read.option("header", True).option("inferSchema", True).csv(employees_path)
attendance_df = spark.read.option("header", True).option("inferSchema", True).csv(attendance_path)

bonuses_df = spark.read.option("multiline", True).json(bonuses_path)

Show schemas and sample records.

In [4]:
employees_df.printSchema()
attendance_df.printSchema()
bonuses_df.printSchema()

employees_df.show()
attendance_df.show()
bonuses_df.show()

root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

+-----+----------+-------+
|EmpID|   

Count distinct departments.

In [5]:
distinct_departments = employees_df.select("Department").distinct().count()
print(f"Total distinct departments: {distinct_departments}")

Total distinct departments: 3


2. DataFrame Operations

Add a column TenureYears using datediff() and round()

In [6]:
from pyspark.sql.functions import datediff, current_date, round, col

employees_df = employees_df.withColumn(
    "TenureYears",
    round(datediff(current_date(), col("JoinDate")) / 365, 2)
)

employees_df.select("EmpID", "Name", "JoinDate", "TenureYears").show()

+-----+------+----------+-----------+
|EmpID|  Name|  JoinDate|TenureYears|
+-----+------+----------+-----------+
|    1| Anita|2021-05-01|       4.11|
|    2|   Raj|2020-03-15|       5.24|
|    3|Simran|2022-07-10|       2.92|
|    4| Aamir|2019-11-20|       5.56|
|    5| Nisha|2023-01-05|       2.43|
+-----+------+----------+-----------+



Calculate TotalCompensation = Salary + Bonus

In [7]:
emp_bonus_df = employees_df.join(bonuses_df, on="EmpID", how="left") \
    .withColumn("TotalCompensation", col("Salary") + col("Bonus"))

emp_bonus_df.select("EmpID", "Name", "Salary", "Bonus", "TotalCompensation").show()

+-----+------+------+-----+-----------------+
|EmpID|  Name|Salary|Bonus|TotalCompensation|
+-----+------+------+-----+-----------------+
|    1| Anita| 55000| 5000|            60000|
|    2|   Raj| 80000| 7000|            87000|
|    3|Simran| 75000| 6500|            81500|
|    4| Aamir| 60000| 6000|            66000|
|    5| Nisha| 50000| 4000|            54000|
+-----+------+------+-----+-----------------+



Filter employees with more than 2 years in the company.

In [8]:
emp_bonus_df.filter(col("TenureYears") > 2).select("EmpID", "Name", "TenureYears").show()

+-----+------+-----------+
|EmpID|  Name|TenureYears|
+-----+------+-----------+
|    1| Anita|       4.11|
|    2|   Raj|       5.24|
|    3|Simran|       2.92|
|    4| Aamir|       5.56|
|    5| Nisha|       2.43|
+-----+------+-----------+



Show employees who report to a manager

In [9]:
employees_df.filter(col("ManagerID").isNotNull()).select("EmpID", "Name", "ManagerID").show()

+-----+------+---------+
|EmpID|  Name|ManagerID|
+-----+------+---------+
|    2|   Raj|        1|
|    3|Simran|        1|
|    4| Aamir|        1|
|    5| Nisha|        1|
+-----+------+---------+



3. Aggregation

Average salary per department

In [10]:
from pyspark.sql.functions import avg

employees_df.groupBy("Department") \
    .agg(avg("Salary").alias("AverageSalary")) \
    .show()

+-----------+-------------+
| Department|AverageSalary|
+-----------+-------------+
|Engineering|      77500.0|
|         HR|      52500.0|
|  Marketing|      60000.0|
+-----------+-------------+



Number of employees under each manager.

In [11]:
from pyspark.sql.functions import count

employees_df.groupBy("ManagerID") \
    .agg(count("EmpID").alias("NumEmployees")) \
    .filter(col("ManagerID").isNotNull()) \
    .show()

+---------+------------+
|ManagerID|NumEmployees|
+---------+------------+
|        1|           4|
+---------+------------+



Count of absences per employee

In [12]:
attendance_df.filter(col("Status") == "Absent") \
    .groupBy("EmpID") \
    .agg(count("Date").alias("AbsenceCount")) \
    .show()

+-----+------------+
|EmpID|AbsenceCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



4. Joins

Join employees and attendance

In [13]:
from pyspark.sql.functions import when, count, col

total_days_df = attendance_df.groupBy("EmpID").agg(count("Date").alias("TotalDays"))

present_days_df = attendance_df.filter(col("Status") == "Present") \
    .groupBy("EmpID") \
    .agg(count("Date").alias("PresentDays"))

attendance_rate_df = total_days_df.join(present_days_df, "EmpID", "left") \
    .withColumn("AttendancePercent", (col("PresentDays") / col("TotalDays")) * 100)

attendance_rate_df.show()

+-----+---------+-----------+-----------------+
|EmpID|TotalDays|PresentDays|AttendancePercent|
+-----+---------+-----------+-----------------+
|    1|        2|          2|            100.0|
|    3|        2|          2|            100.0|
|    5|        2|          2|            100.0|
|    4|        2|       NULL|             NULL|
|    2|        2|          1|             50.0|
+-----+---------+-----------+-----------------+



Join employees and bonuses

In [14]:
from pyspark.sql.functions import desc

top_3_employees_df = employees_df.join(bonuses_df, "EmpID") \
    .withColumn("TotalCompensation", col("Salary") + col("Bonus")) \
    .orderBy(desc("TotalCompensation")) \
    .limit(3)

top_3_employees_df.select("EmpID", "Name", "Department", "TotalCompensation").show()

+-----+------+-----------+-----------------+
|EmpID|  Name| Department|TotalCompensation|
+-----+------+-----------+-----------------+
|    2|   Raj|Engineering|            87000|
|    3|Simran|Engineering|            81500|
|    4| Aamir|  Marketing|            66000|
+-----+------+-----------+-----------------+



 Multi-level join

In [15]:
multi_join_df = employees_df.join(bonuses_df, "EmpID", "left") \
    .join(attendance_df, "EmpID", "left")

multi_join_df.select("EmpID", "Name", "Department", "Date", "Status", "Bonus").show()

+-----+------+-----------+----------+-------+-----+
|EmpID|  Name| Department|      Date| Status|Bonus|
+-----+------+-----------+----------+-------+-----+
|    1| Anita|         HR|2024-04-02|Present| 5000|
|    1| Anita|         HR|2024-04-01|Present| 5000|
|    2|   Raj|Engineering|2024-04-02|Present| 7000|
|    2|   Raj|Engineering|2024-04-01| Absent| 7000|
|    3|Simran|Engineering|2024-04-02|Present| 6500|
|    3|Simran|Engineering|2024-04-01|Present| 6500|
|    4| Aamir|  Marketing|2024-04-02| Absent| 6000|
|    4| Aamir|  Marketing|2024-04-01| Absent| 6000|
|    5| Nisha|         HR|2024-04-02|Present| 4000|
|    5| Nisha|         HR|2024-04-01|Present| 4000|
+-----+------+-----------+----------+-------+-----+



5. String & Date Functions

Extract year and month from JoinDate.

In [16]:
from pyspark.sql.functions import year, month

employees_df = employees_df \
    .withColumn("JoinYear", year(col("JoinDate"))) \
    .withColumn("JoinMonth", month(col("JoinDate")))

employees_df.select("EmpID", "Name", "JoinDate", "JoinYear", "JoinMonth").show()

+-----+------+----------+--------+---------+
|EmpID|  Name|  JoinDate|JoinYear|JoinMonth|
+-----+------+----------+--------+---------+
|    1| Anita|2021-05-01|    2021|        5|
|    2|   Raj|2020-03-15|    2020|        3|
|    3|Simran|2022-07-10|    2022|        7|
|    4| Aamir|2019-11-20|    2019|       11|
|    5| Nisha|2023-01-05|    2023|        1|
+-----+------+----------+--------+---------+



Mask employee names using regex.

In [17]:
from pyspark.sql.functions import regexp_replace

masked_names_df = employees_df.withColumn("MaskedName", regexp_replace("Name", r"(.)(.*)", r"$1****"))
masked_names_df.select("EmpID", "Name", "MaskedName").show()

+-----+------+----------+
|EmpID|  Name|MaskedName|
+-----+------+----------+
|    1| Anita|     A****|
|    2|   Raj|     R****|
|    3|Simran|     S****|
|    4| Aamir|     A****|
|    5| Nisha|     N****|
+-----+------+----------+



Use substring() to create EmpCode like "EMP001".

In [18]:
from pyspark.sql.functions import lpad, concat, lit

employees_df = employees_df.withColumn("EmpCode", concat(lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))
employees_df.select("EmpID", "EmpCode").show()

+-----+-------+
|EmpID|EmpCode|
+-----+-------+
|    1| EMP001|
|    2| EMP002|
|    3| EMP003|
|    4| EMP004|
|    5| EMP005|
+-----+-------+



6. Conditional & Null Handling

Use when/otherwise to label performance

In [19]:
from pyspark.sql.functions import when

performance_df = bonuses_df.withColumn(
    "Performance",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)

performance_df.select("EmpID", "Bonus", "Performance").show()

+-----+-----+-----------+
|EmpID|Bonus|Performance|
+-----+-----+-----------+
|    1| 5000|     Medium|
|    2| 7000|       High|
|    3| 6500|       High|
|    4| 6000|     Medium|
|    5| 4000|     Medium|
+-----+-----+-----------+



Handle missing ManagerID using fillna

In [20]:
employees_filled_df = employees_df.fillna({"ManagerID": "No Manager"})
employees_filled_df.select("EmpID", "Name", "ManagerID").show()

+-----+------+---------+
|EmpID|  Name|ManagerID|
+-----+------+---------+
|    1| Anita|     NULL|
|    2|   Raj|        1|
|    3|Simran|        1|
|    4| Aamir|        1|
|    5| Nisha|        1|
+-----+------+---------+



7. Spark SQL

Create and use database hr.

In [21]:
spark.sql("CREATE DATABASE IF NOT EXISTS hr")

spark.catalog.setCurrentDatabase("hr")

Save all DataFrames as tables

In [22]:
employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")

SQL – Top paid employee in each department

In [23]:
spark.sql("""
    SELECT Department, Name, Salary
    FROM (
        SELECT *, RANK() OVER (PARTITION BY Department ORDER BY Salary DESC) as rnk
        FROM employees
    ) ranked
    WHERE rnk = 1
""").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|Engineering|  Raj| 80000|
|         HR|Anita| 55000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+



SQL – Attendance rate by department

In [24]:
spark.sql("""
    SELECT e.Department,
           ROUND(100 * SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) / COUNT(*), 2) AS AttendanceRate
    FROM employees e
    JOIN attendance a ON e.EmpID = a.EmpID
    GROUP BY e.Department
""").show()

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          75.0|
|         HR|         100.0|
|  Marketing|           0.0|
+-----------+--------------+



SQL – Employees joined after 2021 with salary > 70,000

In [25]:
spark.sql("""
    SELECT EmpID, Name, JoinDate, Salary
    FROM employees
    WHERE JoinDate > '2021-12-31' AND Salary > 70000
""").show()

+-----+------+----------+------+
|EmpID|  Name|  JoinDate|Salary|
+-----+------+----------+------+
|    3|Simran|2022-07-10| 75000|
+-----+------+----------+------+



Task 8 – Advanced

Use a UDF to classify department as "Tech" vs "Non-Tech".

In [26]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify_dept(dept):
    tech_depts = ["Engineering"]
    return "Tech" if dept in tech_depts else "Non-Tech"

classify_dept_udf = udf(classify_dept, StringType())

employees_df = employees_df.withColumn("DeptType", classify_dept_udf(col("Department")))

employees_df.select("Department", "DeptType").show()

+-----------+--------+
| Department|DeptType|
+-----------+--------+
|         HR|Non-Tech|
|Engineering|    Tech|
|Engineering|    Tech|
|  Marketing|Non-Tech|
|         HR|Non-Tech|
+-----------+--------+



Create a view emp_attendance_summary .

In [27]:
attendance_summary_df = attendance_df.groupBy("EmpID") \
    .pivot("Status", ["Present", "Absent"]) \
    .count() \
    .na.fill(0)

emp_attendance_summary_df = employees_df.join(attendance_summary_df, "EmpID", "left")

emp_attendance_summary_df.createOrReplaceTempView("emp_attendance_summary")

spark.sql("SELECT * FROM emp_attendance_summary").show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+-------+--------+-------+------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|EmpCode|DeptType|Present|Absent|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+-------+--------+-------+------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|       4.11|    2021|        5| EMP001|Non-Tech|      2|     0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|       5.24|    2020|        3| EMP002|    Tech|      1|     1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|       2.92|    2022|        7| EMP003|    Tech|      2|     0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|       5.56|    2019|       11| EMP004|Non-Tech|      0|     2|
|    5| Nisha|         HR|2023-01-05| 50000|        1|       2.43|    2023|        1| EMP005|Non-Tech|      2|     0|
+-----+------+-----------+----------+------+---------+--

Save it as Parquet partitioned by Department .

In [28]:
emp_attendance_summary_df.write \
    .mode("overwrite") \
    .partitionBy("Department") \
    .parquet("/content/drive/MyDrive/pyspark_data/emp_attendance_summary_parquet")
