**Importing Liabraries**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

**Creating Spark Session**

In [2]:
spark = SparkSession.builder.appName("PySpark").getOrCreate()
spark

**Uploading required files**

In [3]:
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv
Saving employees.csv to employees.csv
Saving tasks.csv to tasks.csv


**Loading Files**

In [4]:
dfAtt = spark.read.csv(r"/content/attendance.csv", header=True, inferSchema=True)
dfEmp = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
dfTas = spark.read.csv(r"/content/tasks.csv", header=True, inferSchema=True)

**Printing Files**

In [5]:
dfAtt.show()

+------------+----------+----------+----------------+----------------+------+---------+
|attendanceID|employeeID|      date|         clockIN|        clockOUT|isLate|isAbscent|
+------------+----------+----------+----------------+----------------+------+---------+
|           1|         1|01-06-2024|01-06-2024 09:02|01-06-2024 17:00|     1|        0|
|           2|         2|01-06-2024|01-06-2024 08:55|01-06-2024 17:10|     0|        0|
|           3|         3|01-06-2024|01-06-2024 09:10|01-06-2024 17:05|     1|        0|
|           4|         4|01-06-2024|01-06-2024 08:48|01-06-2024 17:15|     0|        0|
|           5|         5|01-06-2024|            NULL|            NULL|     0|        1|
|           6|         1|02-06-2024|02-06-2024 08:50|02-06-2024 17:00|     0|        0|
|           7|         2|02-06-2024|02-06-2024 09:20|02-06-2024 17:10|     1|        0|
|           8|         3|02-06-2024|            NULL|            NULL|     0|        1|
|           9|         4|02-06-2

In [6]:
dfEmp.show()

+----------+-------------+-----------+------------------+--------------------+----------+--------+
|employeeID|         name| department|              role|               email|  hireDate|  status|
+----------+-------------+-----------+------------------+--------------------+----------+--------+
|         1|     John Doe|Engineering|Software Developer|john.doe@example.com|2023-01-15|  Active|
|         2|   Jane Smith|  Marketing|Content Strategist|jane.smith@exampl...|2022-11-20|  Active|
|         3|Alice Johnson|         HR|        HR Manager|alice.johnson@exa...|2021-09-10|  Active|
|         4|    Bob Brown|Engineering|   DevOps Engineer|bob.brown@example...|2023-05-01|  Active|
|         5|    Eva Green|    Finance|        Accountant|eva.green@example...|2022-06-30|Resigned|
+----------+-------------+-----------+------------------+--------------------+----------+--------+



In [7]:
dfTas.show()

+------+----------+--------------------+----------+---------------+
|taskID|employeeID|            taskName|  taskDate|tasksCompeleted|
+------+----------+--------------------+----------+---------------+
|     1|         1|     API Integration|2024-06-01|              5|
|     2|         2|Content Calendar ...|2024-06-01|              3|
|     3|         3|       Policy Review|2024-06-01|              0|
|     4|         4|         CI/CD Setup|2024-06-01|              4|
|     5|         5|    Invoice Auditing|2024-06-01|              6|
|     6|         1|    Backend Refactor|2024-06-02|              4|
|     7|         2|Email Campaign De...|2024-06-02|              2|
|     8|         3|Employee Feedback...|2024-06-02|              0|
|     9|         4|        Docker Setup|2024-06-02|              5|
|    10|         5|Expense Sheet Val...|2024-06-02|              3|
|    11|         1|    API Docs Writing|2024-06-03|              5|
|    12|         2|Marketing Funnel ...|2024-06-

**Printing Schemas**

In [8]:
dfAtt.printSchema()

root
 |-- attendanceID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- clockIN: string (nullable = true)
 |-- clockOUT: string (nullable = true)
 |-- isLate: integer (nullable = true)
 |-- isAbscent: integer (nullable = true)



In [9]:
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- email: string (nullable = true)
 |-- hireDate: date (nullable = true)
 |-- status: string (nullable = true)



In [10]:
dfTas.printSchema()

root
 |-- taskID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- taskName: string (nullable = true)
 |-- taskDate: date (nullable = true)
 |-- tasksCompeleted: integer (nullable = true)



**Filtering late login and abscences**

In [11]:
dfAtt.filter((dfAtt.isLate == 1) | (dfAtt.isAbscent == 1)) \
  .join(dfEmp.select(["name", "employeeID"]), on="employeeID", how="inner") \
  .withColumn("Attendance", F.when(F.col("islate") == 1, "Late Login").otherwise("Abscent")) \
  .select(["name", "Attendance", "date"]) \
  .show()

+-------------+----------+----------+
|         name|Attendance|      date|
+-------------+----------+----------+
|     John Doe|Late Login|01-06-2024|
|Alice Johnson|Late Login|01-06-2024|
|    Eva Green|   Abscent|01-06-2024|
|   Jane Smith|Late Login|02-06-2024|
|Alice Johnson|   Abscent|02-06-2024|
|    Eva Green|Late Login|02-06-2024|
|   Jane Smith|   Abscent|03-06-2024|
|    Bob Brown|Late Login|03-06-2024|
|     John Doe|Late Login|04-06-2024|
|Alice Johnson|Late Login|04-06-2024|
|    Bob Brown|   Abscent|04-06-2024|
|   Jane Smith|Late Login|05-06-2024|
|Alice Johnson|   Abscent|05-06-2024|
|    Eva Green|Late Login|05-06-2024|
|   Jane Smith|Late Login|06-06-2024|
|    Bob Brown|Late Login|06-06-2024|
|    Eva Green|   Abscent|06-06-2024|
+-------------+----------+----------+



**Average work hours and productivity**

In [12]:
dfAtt_cleaned = dfAtt.filter(F.col("clockIN") != "NULL")

In [15]:
dfJoined = dfAtt_cleaned.join(dfEmp, on="employeeID", how="inner").join(dfTas, on="employeeID", how="inner")
dfJoined = dfJoined \
    .withColumn(
     "workHours",
    F.round(
        (F.unix_timestamp(F.col("clockOUT"), "dd-MM-yyyy HH:mm") - F.unix_timestamp(F.col("clockIN"), "dd-MM-yyyy HH:mm")) / 3600,
        2
    )) \
    .withColumn("productivityScore", F.round(F.col("tasksCompeleted") / F.col("workHours"), 4))

In [16]:
dfJoined.groupBy("department").agg(
    F.round(F.mean("workHours"), 2).alias("averageWorkHours"),
    F.round(F.mean("productivityScore"), 2).alias("averageProductivityScore")
).show()

+-----------+----------------+------------------------+
| department|averageWorkHours|averageProductivityScore|
+-----------+----------------+------------------------+
|Engineering|            8.02|                    0.52|
|         HR|            8.08|                    0.06|
|    Finance|            7.87|                    0.38|
|  Marketing|            8.07|                    0.31|
+-----------+----------------+------------------------+



**Delieverables**
- PySpark script with filtering and group aggregations
- Output showing attendance issues by department


In [18]:
#1. pyspark script has attached in .ipynb format in git repository

In [17]:
# 2. attendance issues by department
dfJoined_2 = dfAtt.join(dfEmp, on="employeeID", how="inner")

dfJoined_2.select(["department", "isLate", "isAbscent"]) \
  .filter((F.col("isLate") == 1) | (F.col("isAbscent") == 1)) \
  .groupby("department") \
  .agg(
      F.sum("isLate").alias("lateCount"),
      F.sum("isAbscent").alias("abscentCount")
  ) \
  .withColumn("issuesCount", F.col("lateCount") + F.col("abscentCount")) \
  .show()

+-----------+---------+------------+-----------+
| department|lateCount|abscentCount|issuesCount|
+-----------+---------+------------+-----------+
|Engineering|        4|           1|          5|
|         HR|        2|           2|          4|
|    Finance|        2|           2|          4|
|  Marketing|        3|           1|          4|
+-----------+---------+------------+-----------+

