**Importing Libraries**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

**Creating Spark Session**

In [2]:
spark = SparkSession.builder.appName("DevOps").getOrCreate()
spark

**Uploading required files**

In [3]:
from google.colab import files
uploaded = files.upload()

Saving attendance.csv to attendance.csv
Saving employees.csv to employees.csv
Saving tasks.csv to tasks.csv


**Loading Files**

In [4]:
dfAtt = spark.read.csv(r"/content/attendance.csv", header=True, inferSchema=True)
dfEmp = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
dfTas = spark.read.csv(r"/content/tasks.csv", header=True, inferSchema=True)

**Printing Files**

In [5]:
dfAtt.show()

+------------+----------+----------+----------------+----------------+------+---------+
|attendanceID|employeeID|      date|         clockIN|        clockOUT|isLate|isAbscent|
+------------+----------+----------+----------------+----------------+------+---------+
|           1|         1|01-06-2024|01-06-2024 09:02|01-06-2024 17:00|     1|        0|
|           2|         2|01-06-2024|01-06-2024 08:55|01-06-2024 17:10|     0|        0|
|           3|         3|01-06-2024|01-06-2024 09:10|01-06-2024 17:05|     1|        0|
|           4|         4|01-06-2024|01-06-2024 08:48|01-06-2024 17:15|     0|        0|
|           5|         5|01-06-2024|            NULL|            NULL|     0|        1|
|           6|         1|02-06-2024|02-06-2024 08:50|02-06-2024 17:00|     0|        0|
|           7|         2|02-06-2024|02-06-2024 09:20|02-06-2024 17:10|     1|        0|
|           8|         3|02-06-2024|            NULL|            NULL|     0|        1|
|           9|         4|02-06-2

In [6]:
dfEmp.show()

+----------+-------------+-----------+------------------+--------------------+----------+--------+
|employeeID|         name| department|              role|               email|  hireDate|  status|
+----------+-------------+-----------+------------------+--------------------+----------+--------+
|         1|     John Doe|Engineering|Software Developer|john.doe@example.com|2023-01-15|  Active|
|         2|   Jane Smith|  Marketing|Content Strategist|jane.smith@exampl...|2022-11-20|  Active|
|         3|Alice Johnson|         HR|        HR Manager|alice.johnson@exa...|2021-09-10|  Active|
|         4|    Bob Brown|Engineering|   DevOps Engineer|bob.brown@example...|2023-05-01|  Active|
|         5|    Eva Green|    Finance|        Accountant|eva.green@example...|2022-06-30|Resigned|
+----------+-------------+-----------+------------------+--------------------+----------+--------+



In [7]:
dfTas.show()

+------+----------+--------------------+----------+---------------+
|taskID|employeeID|            taskName|  taskDate|tasksCompeleted|
+------+----------+--------------------+----------+---------------+
|     1|         1|     API Integration|2024-06-01|              5|
|     2|         2|Content Calendar ...|2024-06-01|              3|
|     3|         3|       Policy Review|2024-06-01|              0|
|     4|         4|         CI/CD Setup|2024-06-01|              4|
|     5|         5|    Invoice Auditing|2024-06-01|              6|
|     6|         1|    Backend Refactor|2024-06-02|              4|
|     7|         2|Email Campaign De...|2024-06-02|              2|
|     8|         3|Employee Feedback...|2024-06-02|              0|
|     9|         4|        Docker Setup|2024-06-02|              5|
|    10|         5|Expense Sheet Val...|2024-06-02|              3|
|    11|         1|    API Docs Writing|2024-06-03|              5|
|    12|         2|Marketing Funnel ...|2024-06-

**Printing Schemas**

In [8]:
dfAtt.printSchema()

root
 |-- attendanceID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- clockIN: string (nullable = true)
 |-- clockOUT: string (nullable = true)
 |-- isLate: integer (nullable = true)
 |-- isAbscent: integer (nullable = true)



In [9]:
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- email: string (nullable = true)
 |-- hireDate: date (nullable = true)
 |-- status: string (nullable = true)



In [10]:
dfTas.printSchema()

root
 |-- taskID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- taskName: string (nullable = true)
 |-- taskDate: date (nullable = true)
 |-- tasksCompeleted: integer (nullable = true)



**Cleaning Null values**

In [11]:
dfEmp = dfEmp.dropna()
dfTas = dfTas.dropna()

**Top 5 abscentees**

In [18]:
Top5Absentees = dfEmp.join(dfAtt, on="employeeID", how="inner").groupBy("employeeID").agg(
                 F.sum("isAbscent").alias("AbscentCount")
             ).join(dfEmp.select(["employeeID", "name"]), how="inner", on="employeeID").sort("AbscentCount", ascending=False).limit(5).select(["employeeID", "name", "AbscentCount"])

Top5Absentees.write.mode("overwrite").csv("abscentees_top_5")

Top5Absentees.show()

+----------+-------------+------------+
|employeeID|         name|AbscentCount|
+----------+-------------+------------+
|         3|Alice Johnson|           2|
|         5|    Eva Green|           2|
|         4|    Bob Brown|           1|
|         2|   Jane Smith|           1|
|         1|     John Doe|           0|
+----------+-------------+------------+



**Lowest performing departments**

In [19]:
LowestPerformingDept = dfEmp.join(dfAtt, on="employeeID", how="inner").join(dfTas, on="employeeID", how="inner").groupBy("department").agg(
                    F.sum("tasksCompeleted").alias("TasksProductivityScore")
                ).sort("TasksProductivityScore", ascending=True).limit(2)

LowestPerformingDept.write.mode("overwrite").csv("lowest_performing_departments")

LowestPerformingDept.show()

+----------+----------------------+
|department|TasksProductivityScore|
+----------+----------------------+
|        HR|                    18|
| Marketing|                    90|
+----------+----------------------+



**Deliverables**

In [20]:
# Report with top 5 absentees/lowest performing departments

Top5Absentees.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/Top5Absentees")

LowestPerformingDept.coalesce(1) \
    .write.mode("overwrite").option("header", True) \
    .csv("/content/LowestPerformingDept")

In [21]:
import shutil, glob

Abs_file = glob.glob("/content/Top5Absentees/part-*.csv")[0]
shutil.move(Abs_file, "/content/Top5Absentees.csv")

low_file = glob.glob("/content/LowestPerformingDept/part-*.csv")[0]
shutil.move(low_file, "/content/LowestPerformingDept.csv")

'/content/LowestPerforming.csv'

In [22]:
from google.colab import files

files.download("/content/Top5Absentees.csv")
files.download("/content/LowestPerformingDept.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>