In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    avg,
    col,
    count,
    hour,
    minute,
    sum,
    to_timestamp,
    trim,
    when,
)

# Step 1: Obtain the Spark session for this notebook.
# getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("EmployeeAttendanceProductivity")
    .getOrCreate()
)

In [2]:
# Step 2: Read the attendance log from CSV.
# The first row is used as column names and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("attendance_clean.csv")
)

In [3]:
# Strip leading/trailing whitespace from `status` so later comparisons
# (e.g. against the empty string) are not defeated by stray spaces.
trimmed_status = trim(col("status"))
df = df.withColumn("status", trimmed_status)

In [4]:
# Parse the clock-in / clock-out columns as timestamps.
df = (
    df.withColumn("clockin", to_timestamp(col("clockin")))
      .withColumn("clockout", to_timestamp(col("clockout")))
)

# Flag late logins: 1 when the clock-in time of day is past 09:30, else 0.
# The minute is read with minute() rather than by slicing the timestamp's
# string rendering, so the check does not depend on how the timestamp is
# formatted when cast to a string.
# The comparison is at minute granularity: any clock-in within the 09:30
# minute (e.g. 09:30:45) is NOT counted as late; 09:31:00 onwards is.
# Rows with a null clock-in fall through to otherwise() and get 0.
late_cutoff_minutes = 9 * 60 + 30  # 09:30 expressed as minutes since midnight
clockin_minute_of_day = hour("clockin") * 60 + minute("clockin")

df = df.withColumn(
    "is_late",
    when(clockin_minute_of_day > late_cutoff_minutes, 1).otherwise(0)
)

In [5]:
# Flag absences: 1 when `status` is missing (null or empty string) or when it
# contains the word "absent" in any letter case, else 0.
# Note the regex is unanchored, so any status value that merely contains
# "absent" as a substring is also flagged.
status = col("status")
status_is_missing = status.isNull() | (status == "")
status_mentions_absent = status.rlike("(?i)absent")

df = df.withColumn(
    "is_absent",
    when(status_is_missing | status_mentions_absent, 1).otherwise(0)
)

In [6]:
# Work hours as a decimal number of hours between clock-in and clock-out.
# Rows flagged absent (is_absent == 1, computed in the previous cell) are set
# to null explicitly: when() without otherwise() yields null for non-matching
# rows. Previously the value was only null when a clock time happened to be
# null, so an absent row that still carried clock times would have been
# included in the department averages.
# For present rows the result is still null if either clock time is null.
elapsed_seconds = col("clockout").cast("long") - col("clockin").cast("long")

df = df.withColumn(
    "workhours",
    when(col("is_absent") == 0, elapsed_seconds / 3600)
)

In [7]:
# Per-department metrics. The *_ratio columns are the mean of the 0/1 flags,
# i.e. the fraction of rows in the department that are late / absent.
department_aggregates = [
    avg("workhours").alias("avg_workhours"),
    avg("taskscompleted").alias("avg_taskscompleted"),
    sum("is_late").alias("total_late_logins"),
    sum("is_absent").alias("total_absences"),
    avg("is_late").alias("late_login_ratio"),
    avg("is_absent").alias("absent_ratio"),
]

dept_metrics = df.groupBy("department").agg(*department_aggregates)

In [8]:
# Full per-department summary table.
print("Department-level Attendance and Productivity Summary:")
dept_metrics.show(truncate=False)

# Departments where more than 10% of rows are late logins or absences.
print("Departments with High Lateness or Absence:")
high_lateness = col("late_login_ratio") > 0.1
high_absence = col("absent_ratio") > 0.1
flagged_departments = dept_metrics.where(high_lateness | high_absence)
flagged_departments.show(truncate=False)

Department-level Attendance and Productivity Summary:
+----------+-----------------+------------------+-----------------+--------------+----------------+------------+
|department|avg_workhours    |avg_taskscompleted|total_late_logins|total_absences|late_login_ratio|absent_ratio|
+----------+-----------------+------------------+-----------------+--------------+----------------+------------+
|HR        |8.583333333333332|4.5               |0                |0             |0.0             |0.0         |
|Finance   |8.0              |2.0               |0                |1             |0.0             |0.5         |
|Marketing |8.166666666666666|2.0               |0                |1             |0.0             |0.5         |
|IT        |8.611111111111112|4.0               |0                |0             |0.0             |0.0         |
+----------+-----------------+------------------+-----------------+--------------+----------------+------------+

Departments with High Lateness or Absence

In [9]:
# Stop the Spark session now that the summaries above have been displayed.
# Any cell that uses `spark` or the DataFrames after this point must first
# re-run the session-creation cell at the top of the notebook.
spark.stop()