In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("PysparkAssessment2")\
.getOrCreate()

spark

In [2]:
# Ingestion & Exploration
# Read all 3 files (CSV + JSON) using PySpark.
from google.colab import drive
drive.mount('/content/drive')

employees_df = spark.read.csv('/content/drive/MyDrive/employees.csv',header= True,inferSchema=True)
attendance_df = spark.read.csv('/content/drive/MyDrive/attendance.csv',header= True,inferSchema=True)
bonuses_df = spark.read.option("multiline", True).json('/content/drive/MyDrive/bonuses.json')

employees_df.show(5)
attendance_df.show(5)
bonuses_df.show(5)

Mounted at /content/drive
+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

+-----+----------+-------+
|EmpID|      Date| Status|
+-----+----------+-------+
|    1|2024-04-01|Present|
|    1|2024-04-02|Present|
|    2|2024-04-01| Absent|
|    2|2024-04-02|Present|
|    3|2024-04-01|Present|
+-----+----------+-------+
only showing top 5 rows

+-----+-----+----+
|Bonus|EmpID|Year|
+-----+-----+----+
| 5000|    1|2023|
| 7000|    2|2023|
| 6500|    3|2023|
| 6000|    4|2023|
| 4000|    5|2023|
+-----+-----+----+



In [3]:
# Show schemas and sample records.

print("Employees Schema:")
employees_df.printSchema()

print("Attendance Schema:")
attendance_df.printSchema()

print("Bonuses Schema:")
bonuses_df.printSchema()

Employees Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

Attendance Schema:
root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

Bonuses Schema:
root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)



In [4]:
# DataFrame Operations
# Add a column TenureYears using datediff() and round() .

from pyspark.sql.functions import date_diff, current_date,round

employees_df = employees_df.withColumn("TenureYears",round(date_diff(current_date(),employees_df['JoinDate'])/365.0,1))
employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|
+-----+------+-----------+----------+------+---------+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|
+-----+------+-----------+----------+------+---------+-----------+



In [5]:
# Calculate TotalCompensation = Salary + Bonus .

from pyspark.sql.functions import col

emp_bonus = employees_df.join(bonuses_df,on = 'EmpID')

emp_bonus = emp_bonus.withColumn("TotalCompention",col("Salary") + col("Bonus"))
emp_bonus.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompention|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1| 5000|2023|          60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|          87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|          81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6| 6000|2023|          66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4| 4000|2023|          54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+



In [6]:
# Filter employees with more than 2 years in the company.
from pyspark.sql.functions import col

exp_emp = emp_bonus.filter(col("TenureYears") > 2)
exp_emp.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompention|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1| 5000|2023|          60000|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|          87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|          81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6| 6000|2023|          66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4| 4000|2023|          54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+



In [7]:
# Show employees who report to a manager ( ManagerID is not null ).

from pyspark.sql.functions import col

report_manager = emp_bonus.filter(col("ManagerID").isNotNull())
report_manager.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompention|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|          87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|          81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6| 6000|2023|          66000|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4| 4000|2023|          54000|
+-----+------+-----------+----------+------+---------+-----------+-----+----+---------------+



In [8]:
# Aggregation
# Average salary per department.

from pyspark.sql.functions import avg,col

avg_dept = employees_df.groupBy("Department").agg(avg(col("Salary")))
avg_dept.show()

+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    77500.0|
|         HR|    52500.0|
|  Marketing|    60000.0|
+-----------+-----------+



In [10]:
# Number of employees under each manager.

from pyspark.sql.functions import count

no_emp_manager = employees_df.groupBy("ManagerID").agg(count("EmpID"))
no_emp_manager.show()

+---------+------------+
|ManagerID|count(EmpID)|
+---------+------------+
|     NULL|           1|
|        1|           4|
+---------+------------+



In [11]:
# Count of absences per employee.

from pyspark.sql.functions import col

absences_df = attendance_df.filter(col("Status") == "Absent").groupBy("EmpID").agg(count("*").alias("AbsenceCount"))
absences_df.show()

+-----+------------+
|EmpID|AbsenceCount|
+-----+------------+
|    4|           2|
|    2|           1|
+-----+------------+



In [12]:
# Joins
# Join employees and attendance → Get attendance % (Present days / Total days).

from pyspark.sql.functions import count, when, col, round

total_days_df = attendance_df.groupBy("EmpID").agg(count("*").alias("TotalDays"))

present_days_df = attendance_df.filter(col("Status") == "Present").groupBy("EmpID").agg(count("*").alias("PresentDays"))

attendance_summary_df = employees_df.join(total_days_df, "EmpID").join(present_days_df, "EmpID", "left").fillna(0, subset=["PresentDays"]) \
    .withColumn("AttendancePct", round((col("PresentDays") / col("TotalDays")) * 100, 2))

attendance_summary_df.show()

+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|TotalDays|PresentDays|AttendancePct|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|        2|          2|        100.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|        2|          1|         50.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|        2|          2|        100.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|        2|          0|          0.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|        2|          2|        100.0|
+-----+------+-----------+----------+------+---------+-----------+---------+-----------+-------------+



In [13]:
# Join employees and bonuses → Show top 3 employees by TotalCompensation.

from pyspark.sql.functions import desc

comp_df = employees_df.join(bonuses_df, "EmpID", "left").withColumn("TotalCompensation", col("Salary") + col("Bonus"))

top_3_earners = comp_df.orderBy(desc("TotalCompensation")).select("EmpID", "Name", "Department", "TotalCompensation").limit(3)

top_3_earners.show()

+-----+------+-----------+-----------------+
|EmpID|  Name| Department|TotalCompensation|
+-----+------+-----------+-----------------+
|    2|   Raj|Engineering|            87000|
|    3|Simran|Engineering|            81500|
|    4| Aamir|  Marketing|            66000|
+-----+------+-----------+-----------------+



In [14]:
# Multi-level join: employees + bonuses + attendance .
# Join all three datasets
multi_join_df = employees_df .join(bonuses_df, "EmpID", "left").join(attendance_df, "EmpID", "left")

multi_join_df.show()

+-----+------+-----------+----------+------+---------+-----------+-----+----+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|      Date| Status|
+-----+------+-----------+----------+------+---------+-----------+-----+----+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1| 5000|2023|2024-04-02|Present|
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1| 5000|2023|2024-04-01|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|2024-04-02|Present|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|2024-04-01| Absent|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|2024-04-02|Present|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|2024-04-01|Present|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6| 6000|2023|2024-04-02| Absent|
|    4| Aamir|  Marketing|2019

In [15]:
# String & Date Functions
# Extract year and month from JoinDate

from pyspark.sql.functions import year, month

employees_df = employees_df.withColumn("JoinYear", year("JoinDate")).withColumn("JoinMonth", month("JoinDate"))

employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|    2021|        5|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    2020|        3|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|    2019|       11|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|    2023|        1|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+



In [16]:
# Mask employee names using regex.

from pyspark.sql.functions import regexp_replace

employees_df = employees_df.withColumn("MaskedName",
    regexp_replace("Name", "(?<=^.).", "*"))

employees_df.select("EmpID", "Name", "MaskedName").show()

+-----+------+----------+
|EmpID|  Name|MaskedName|
+-----+------+----------+
|    1| Anita|     A*ita|
|    2|   Raj|       R*j|
|    3|Simran|    S*mran|
|    4| Aamir|     A*mir|
|    5| Nisha|     N*sha|
+-----+------+----------+



In [17]:
# Use substring() to create EmpCode like "EMP001".

from pyspark.sql.functions import lpad, concat, lit

employees_df = employees_df.withColumn("EmpCode", concat(lit("EMP"), lpad(col("EmpID").cast("string"), 3, "0")))

employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|    2021|        5|     A*ita| EMP001|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    2020|        3|       R*j| EMP002|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    S*mran| EMP003|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|    2019|       11|     A*mir| EMP004|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|    2023|        1|     N*sha| EMP005|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



In [18]:
# Conditional & Null Handling
# Use when/otherwise to label performance:
# “High” if Bonus > 6000
# “Medium” if 4000–6000
# “Low” otherwise

from pyspark.sql.functions import when
employees_df = employees_df.join(bonuses_df, on="EmpID", how="left")

employees_df = employees_df.withColumn(
    "Performance",
    when(col("Bonus") > 6000, "High")
    .when((col("Bonus") >= 4000) & (col("Bonus") <= 6000), "Medium")
    .otherwise("Low")
)

employees_df.show()

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+-----+----+-----------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|Bonus|Year|Performance|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+-----+----+-----------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|    2021|        5|     A*ita| EMP001| 5000|2023|     Medium|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    2020|        3|       R*j| EMP002| 7000|2023|       High|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    S*mran| EMP003| 6500|2023|       High|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|    2019|       11|     A*mir| EMP004| 6000|2023|     Medium|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|    2023|        1|     N*sha| EMP005|

In [19]:
# Handle missing ManagerID using fillna("No Manager") .

from pyspark.sql.functions import col

employees_df = employees_df.withColumn("ManagerID", col("ManagerID").cast("string"))
employees_df = employees_df.fillna({"ManagerID": "No Manager"})
employees_df.select("EmpID", "Name", "ManagerID").show()

+-----+------+----------+
|EmpID|  Name| ManagerID|
+-----+------+----------+
|    1| Anita|No Manager|
|    2|   Raj|         1|
|    3|Simran|         1|
|    4| Aamir|         1|
|    5| Nisha|         1|
+-----+------+----------+



In [20]:
# Spark SQL
# Create and use database hr .
spark.sql("Create database hr")
spark.sql("use hr")

# Save all DataFrames as tables: employees , attendance , bonuses .
employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")

In [23]:
# Write SQL queries:
# Top paid employee in each department.
spark.sql("""
SELECT e.Department, e.Name, e.Salary
FROM employees e
JOIN (
    SELECT Department, MAX(Salary) as MaxSalary
    FROM employees
    GROUP BY Department
) max_salaries
ON e.Department = max_salaries.Department AND e.Salary = max_salaries.MaxSalary
""").show()

# Attendance rate by department.
spark.sql("DROP VIEW IF EXISTS attendance_summary")

spark.sql("""
    CREATE OR REPLACE TEMP VIEW attendance_summary AS
    SELECT a.EmpID,
           e.Department,
           COUNT(*) AS TotalDays,
           SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) AS PresentDays
    FROM attendance a
    JOIN employees e ON a.EmpID = e.EmpID
    GROUP BY a.EmpID, e.Department
""")

spark.sql("""
    SELECT Department,
           ROUND(SUM(PresentDays)*100.0 / SUM(TotalDays), 2) AS AttendanceRatePct
    FROM attendance_summary
    GROUP BY Department
""").show()

# Employees joined after 2021 with salary > 70,000.
spark.sql("""
SELECT EmpID, Name, Department, JoinDate, Salary
FROM employees
WHERE JoinDate > '2021-12-31' AND Salary > 70000
""").show()

+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

+-----------+-----------------+
| Department|AttendanceRatePct|
+-----------+-----------------+
|Engineering|            75.00|
|         HR|           100.00|
|  Marketing|             0.00|
+-----------+-----------------+

+-----+------+-----------+----------+------+
|EmpID|  Name| Department|  JoinDate|Salary|
+-----+------+-----------+----------+------+
|    3|Simran|Engineering|2022-07-10| 75000|
+-----+------+-----------+----------+------+



In [24]:
# Advanced (Optional)
# Use a UDF to classify department as "Tech" vs "Non-Tech".

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify_department(dept):
    return "Tech" if dept.lower() in ["engineering", "it", "development"] else "Non-Tech"

classify_udf = udf(classify_department, StringType())
employees_df = employees_df.withColumn("DeptType", classify_udf("Department"))
employees_df.show()

+-----+------+-----------+----------+------+----------+-----------+--------+---------+----------+-------+-----+----+-----------+--------+
|EmpID|  Name| Department|  JoinDate|Salary| ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|Bonus|Year|Performance|DeptType|
+-----+------+-----------+----------+------+----------+-----------+--------+---------+----------+-------+-----+----+-----------+--------+
|    1| Anita|         HR|2021-05-01| 55000|No Manager|        4.1|    2021|        5|     A*ita| EMP001| 5000|2023|     Medium|Non-Tech|
|    2|   Raj|Engineering|2020-03-15| 80000|         1|        5.2|    2020|        3|       R*j| EMP002| 7000|2023|       High|    Tech|
|    3|Simran|Engineering|2022-07-10| 75000|         1|        2.9|    2022|        7|    S*mran| EMP003| 6500|2023|       High|    Tech|
|    4| Aamir|  Marketing|2019-11-20| 60000|         1|        5.6|    2019|       11|     A*mir| EMP004| 6000|2023|     Medium|Non-Tech|
|    5| Nisha|         HR|2023-01-

In [25]:
# Create a view emp_attendance_summary .
from pyspark.sql.functions import count, sum, when

# Create attendance summary DataFrame
emp_attendance_summary = attendance_df.join(employees_df, "EmpID") \
    .groupBy("EmpID", "Name", "Department") \
    .agg(
        count("*").alias("TotalDays"),
        sum(when(col("Status") == "Present", 1).otherwise(0)).alias("PresentDays")
    ) \
    .withColumn("AttendanceRate", round((col("PresentDays") / col("TotalDays")) * 100, 2))

emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")
spark.sql("SELECT * FROM emp_attendance_summary").show()

+-----+------+-----------+---------+-----------+--------------+
|EmpID|  Name| Department|TotalDays|PresentDays|AttendanceRate|
+-----+------+-----------+---------+-----------+--------------+
|    2|   Raj|Engineering|        2|          1|          50.0|
|    5| Nisha|         HR|        2|          2|         100.0|
|    1| Anita|         HR|        2|          2|         100.0|
|    4| Aamir|  Marketing|        2|          0|           0.0|
|    3|Simran|Engineering|        2|          2|         100.0|
+-----+------+-----------+---------+-----------+--------------+



In [26]:
# Save it as Parquet partitioned by Department .

emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet("/content/emp_attendance_summary_parquet")