In [4]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder \
    .appName("HR_Analytics") \
    .config("spark.sql.shuffle.partitions", "2") \
    .getOrCreate()




In [6]:
from google.colab import drive
drive.mount('/content/drive')

# Set the correct folder path
base_path = "/content/drive/MyDrive/HR_Analytics_Data/"  # Change this to your actual folder name

# Step 1: Initialize Spark
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("HR_Analytics") \
    .config("spark.sql.shuffle.partitions", "2") \
    .getOrCreate()

# Step 2: Read CSV and JSON
employees_df = spark.read.option("header", True).option("inferSchema", True).csv(base_path + "employees.csv")
attendance_df = spark.read.option("header", True).option("inferSchema", True).csv(base_path + "attendance.csv")
bonuses_df = spark.read.option("multiline", True).json(base_path + "bonuses.json")

# Step 3: Show schema and sample
employees_df.printSchema()
attendance_df.printSchema()
bonuses_df.printSchema()

employees_df.show(3)
attendance_df.show(3)
bonuses_df.show(3)

#Step 4: Count distinct departments
from pyspark.sql.functions import countDistinct
employees_df.select(countDistinct("Department")).show()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- EmpID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- JoinDate: date (nullable = true)
 |-- Salary: integer (nullable = true)
 |-- ManagerID: integer (nullable = true)

root
 |-- EmpID: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Status: string (nullable = true)

root
 |-- Bonus: long (nullable = true)
 |-- EmpID: long (nullable = true)
 |-- Year: long (nullable = true)

+-----+------+-----------+----------+------+---------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
+-----+------+-----------+----------+------+---------+
only showing top 3 

In [8]:
from pyspark.sql.functions import to_date, datediff, current_date, round, col

# Fix JoinDate if needed
employees_df = employees_df.withColumn("JoinDate", to_date(col("JoinDate"), "yyyy-MM-dd"))

# Tenure in years
employees_df = employees_df.withColumn("TenureYears", round(datediff(current_date(), col("JoinDate")) / 365, 1))

# Join bonus to compute TotalCompensation
emp_bonus_df = employees_df.join(bonuses_df, "EmpID", "left") \
                           .withColumn("TotalCompensation", col("Salary") + col("Bonus"))

# Show all
emp_bonus_df.select("EmpID", "Name", "JoinDate", "TenureYears", "ManagerID", "Salary", "Bonus", "TotalCompensation").show()

# Filter >2 years
emp_bonus_df.filter(col("TenureYears") > 2).show()

# Employees with manager
emp_bonus_df.filter(col("ManagerID").isNotNull()).show()



+-----+------+----------+-----------+---------+------+-----+-----------------+
|EmpID|  Name|  JoinDate|TenureYears|ManagerID|Salary|Bonus|TotalCompensation|
+-----+------+----------+-----------+---------+------+-----+-----------------+
|    1| Anita|2021-05-01|        4.1|     NULL| 55000| 5000|            60000|
|    2|   Raj|2020-03-15|        5.2|        1| 80000| 7000|            87000|
|    3|Simran|2022-07-10|        2.9|        1| 75000| 6500|            81500|
|    4| Aamir|2019-11-20|        5.6|        1| 60000| 6000|            66000|
|    5| Nisha|2023-01-05|        2.4|        1| 50000| 4000|            54000|
+-----+------+----------+-----------+---------+------+-----+-----------------+

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+


In [9]:
# Avg salary per department
employees_df.groupBy("Department").avg("Salary").show()

# Count of employees per manager
employees_df.groupBy("ManagerID").count().show()

# Count of absences
attendance_df.filter(col("Status") == "Absent").groupBy("EmpID").count().show()


+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    77500.0|
|         HR|    52500.0|
|  Marketing|    60000.0|
+-----------+-----------+

+---------+-----+
|ManagerID|count|
+---------+-----+
|     NULL|    1|
|        1|    4|
+---------+-----+

+-----+-----+
|EmpID|count|
+-----+-----+
|    2|    1|
|    4|    2|
+-----+-----+



In [10]:
# Attendance percentage
from pyspark.sql.functions import count, when

attendance_pct_df = attendance_df.groupBy("EmpID") \
    .agg(
        count("*").alias("TotalDays"),
        count(when(col("Status") == "Present", True)).alias("PresentDays")
    ).withColumn("AttendancePct", round(col("PresentDays") / col("TotalDays") * 100, 1))

attendance_pct_df.show()

# Top 3 by total compensation
emp_bonus_df.orderBy(col("TotalCompensation").desc()).show(3)

# Multi-level join
multi_join_df = employees_df.join(bonuses_df, "EmpID").join(attendance_pct_df, "EmpID")
multi_join_df.show()


+-----+---------+-----------+-------------+
|EmpID|TotalDays|PresentDays|AttendancePct|
+-----+---------+-----------+-------------+
|    2|        2|          1|         50.0|
|    4|        2|          0|          0.0|
|    5|        2|          2|        100.0|
|    1|        2|          2|        100.0|
|    3|        2|          2|        100.0|
+-----+---------+-----------+-------------+

+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|Bonus|Year|TotalCompensation|
+-----+------+-----------+----------+------+---------+-----------+-----+----+-----------------+
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2| 7000|2023|            87000|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9| 6500|2023|            81500|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6| 6000|2023|            66000|
+-----+------+-----------+-

In [11]:
from pyspark.sql.functions import year, month, regexp_replace, lpad, substring

# Extract year/month from JoinDate
employees_df = employees_df.withColumn("JoinYear", year("JoinDate")).withColumn("JoinMonth", month("JoinDate"))

# Mask name
employees_df = employees_df.withColumn("MaskedName", regexp_replace("Name", ".", "*"))

# Employee code
employees_df = employees_df.withColumn("EmpCode", lpad(col("EmpID").cast("string"), 3, "0"))
employees_df = employees_df.withColumn("EmpCode", col("EmpCode").substr(1, 3).alias("EmpCode"))
employees_df = employees_df.withColumn("EmpCode", col("EmpCode").cast("string"))
employees_df = employees_df.withColumn("EmpCode", col("EmpCode").alias("EMP") + col("EmpCode"))

employees_df.show()


+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|    2021|        5|     *****|    2.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    2020|        3|       ***|    4.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    ******|    6.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|    2019|       11|     *****|    8.0|
|    5| Nisha|         HR|2023-01-05| 50000|        1|        2.4|    2023|        1|     *****|   10.0|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



In [12]:
from pyspark.sql.functions import when

# Performance label
bonuses_df = bonuses_df.withColumn("Performance",
    when(col("Bonus") > 6000, "High")
    .when(col("Bonus").between(4000, 6000), "Medium")
    .otherwise("Low")
)

# Handle nulls
employees_df = employees_df.fillna({"ManagerID": "No Manager"})

bonuses_df.show()
employees_df.show()


+-----+-----+----+-----------+
|Bonus|EmpID|Year|Performance|
+-----+-----+----+-----------+
| 5000|    1|2023|     Medium|
| 7000|    2|2023|       High|
| 6500|    3|2023|       High|
| 6000|    4|2023|     Medium|
| 4000|    5|2023|     Medium|
+-----+-----+----+-----------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|        4.1|    2021|        5|     *****|    2.0|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|        5.2|    2020|        3|       ***|    4.0|
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    ******|    6.0|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|        5.6|    2019|       11|     

In [13]:
# Create database
spark.sql("CREATE DATABASE IF NOT EXISTS hr")
spark.catalog.setCurrentDatabase("hr")

# Register tables
employees_df.write.mode("overwrite").saveAsTable("employees")
attendance_df.write.mode("overwrite").saveAsTable("attendance")
bonuses_df.write.mode("overwrite").saveAsTable("bonuses")

# 1. Top paid employee per department
spark.sql("""
    SELECT Department, Name, Salary
    FROM employees
    WHERE (Department, Salary) IN (
        SELECT Department, MAX(Salary) FROM employees GROUP BY Department
    )
""").show()

# 2. Attendance rate by department
spark.sql("""
    SELECT e.Department,
           ROUND(SUM(CASE WHEN a.Status = 'Present' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 1) AS AttendanceRate
    FROM employees e
    JOIN attendance a ON e.EmpID = a.EmpID
    GROUP BY e.Department
""").show()

# 3. Joined after 2021 with salary > 70000
spark.sql("""
    SELECT * FROM employees
    WHERE YEAR(JoinDate) > 2021 AND Salary > 70000
""").show()


+-----------+-----+------+
| Department| Name|Salary|
+-----------+-----+------+
|         HR|Anita| 55000|
|Engineering|  Raj| 80000|
|  Marketing|Aamir| 60000|
+-----------+-----+------+

+-----------+--------------+
| Department|AttendanceRate|
+-----------+--------------+
|Engineering|          75.0|
|         HR|         100.0|
|  Marketing|           0.0|
+-----------+--------------+

+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|EmpID|  Name| Department|  JoinDate|Salary|ManagerID|TenureYears|JoinYear|JoinMonth|MaskedName|EmpCode|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+
|    3|Simran|Engineering|2022-07-10| 75000|        1|        2.9|    2022|        7|    ******|    6.0|
+-----+------+-----------+----------+------+---------+-----------+--------+---------+----------+-------+



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, count, when, round
from pyspark.sql.types import StringType

# Spark session
spark = SparkSession.builder.appName("HR_Analytics").getOrCreate()

# ✅ 1. Create UDF to classify department
def classify_dept(dept):
    return "Tech" if dept in ["Engineering", "IT"] else "Non-Tech"

classify_udf = udf(classify_dept, StringType())

# ✅ 2. Add column DeptType to employees
employees_df = employees_df.withColumn("DeptType", classify_udf(col("Department")))
employees_df.select("EmpID", "Department", "DeptType").show()

#  3. Compute attendance % for each employee
from pyspark.sql.functions import countDistinct

attendance_pct_df = attendance_df.groupBy("EmpID") \
    .agg(
        (count(when(col("Status") == "Present", True)) / count("*") * 100).alias("AttendancePct")
    )

attendance_pct_df = attendance_pct_df.withColumn("AttendancePct", round(col("AttendancePct"), 2))
attendance_pct_df.show()

#  4. Join employees with attendance summary
emp_attendance_summary = employees_df.join(attendance_pct_df, "EmpID", "left")
emp_attendance_summary.select("EmpID", "Name", "Department", "DeptType", "AttendancePct").show()

#  5. Create temporary view
emp_attendance_summary.createOrReplaceTempView("emp_attendance_summary")

#  6. Query the view to verify
spark.sql("SELECT EmpID, Name, DeptType, AttendancePct FROM emp_attendance_summary ORDER BY EmpID").show()

# 7. Save as Parquet partitioned by Department
save_path = base_path + "emp_attendance_summary/"  # example: "/content/drive/MyDrive/HR_Analytics_Data/emp_attendance_summary/"
emp_attendance_summary.write.mode("overwrite").partitionBy("Department").parquet(save_path)

# 8. Read back saved Parquet to confirm
parquet_df = spark.read.parquet(save_path)
parquet_df.show()


+-----+-----------+--------+
|EmpID| Department|DeptType|
+-----+-----------+--------+
|    1|         HR|Non-Tech|
|    2|Engineering|    Tech|
|    3|Engineering|    Tech|
|    4|  Marketing|Non-Tech|
|    5|         HR|Non-Tech|
+-----+-----------+--------+

+-----+-------------+
|EmpID|AttendancePct|
+-----+-------------+
|    2|         50.0|
|    4|          0.0|
|    5|        100.0|
|    1|        100.0|
|    3|        100.0|
+-----+-------------+

+-----+------+-----------+--------+-------------+
|EmpID|  Name| Department|DeptType|AttendancePct|
+-----+------+-----------+--------+-------------+
|    1| Anita|         HR|Non-Tech|        100.0|
|    2|   Raj|Engineering|    Tech|         50.0|
|    3|Simran|Engineering|    Tech|        100.0|
|    4| Aamir|  Marketing|Non-Tech|          0.0|
|    5| Nisha|         HR|Non-Tech|        100.0|
+-----+------+-----------+--------+-------------+

+-----+------+--------+-------------+
|EmpID|  Name|DeptType|AttendancePct|
+-----+-----