In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Get (or create) the Spark session for this notebook under a descriptive app name.
spark = (
    SparkSession.builder
    .appName("EmployeeTimesheet")
    .getOrCreate()
)

Data Ingestion & Schema Handling

1. Load the CSV using inferred schema.

In [0]:
# Read the timesheet CSV with the first row as the header and let Spark infer
# each column's type from the data.
df_inferred = spark.read.csv(
    "file:///Workspace/Shared/employee_timesheet.csv",
    header=True,
    inferSchema=True,
)


2. Load the same file with schema explicitly defined.

In [0]:
# Explicit schema for the timesheet file: every column is nullable, WorkHours is
# an integer and WorkDate is parsed as a date; everything else is a string.
schema = (
    StructType()
    .add("EmployeeID", StringType(), True)
    .add("Name", StringType(), True)
    .add("Department", StringType(), True)
    .add("Project", StringType(), True)
    .add("WorkHours", IntegerType(), True)
    .add("WorkDate", DateType(), True)
    .add("Location", StringType(), True)
    .add("Mode", StringType(), True)
)

# Re-read the same file, this time applying the schema above instead of inferring it.
df_explicit = spark.read.csv(
    "file:///Workspace/Shared/employee_timesheet.csv",
    schema=schema,
    header=True,
)

3. Add a new column Weekday extracted from WorkDate.

In [0]:
# "EEEE" formats the date as the full day name (e.g. "Saturday").
df = df_explicit.withColumn(
    "Weekday",
    date_format(col("WorkDate"), "EEEE"),
)

Aggregations & Grouping

4. Calculate total work hours by employee.

In [0]:
# Total hours per employee; Name is part of the grouping key so it is shown next to the ID.
(
    df.groupBy("EmployeeID", "Name")
    .agg(sum(col("WorkHours")).alias("TotalHours"))
    .show()
)

+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+



5. Calculate average work hours per department.

In [0]:
# Mean hours per timesheet entry within each department.
(
    df.groupBy("Department")
    .agg(avg(col("WorkHours")).alias("AvgHours"))
    .show()
)

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+



6. Get top 2 employees by total hours using window function.

In [0]:
# Aggregate to one row per employee first, then rank those rows by total hours.
df_total = (
    df.groupBy("EmployeeID", "Name")
    .agg(sum("WorkHours").alias("TotalHours"))
)

# No partitionBy: the ranking is global across all employees, highest total first.
window_spec = Window.orderBy(desc("TotalHours"))

(
    df_total
    .withColumn("Rank", rank().over(window_spec))
    .filter(col("Rank") <= 2)
    .show()
)

+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



Date Operations

7. Filter entries where WorkDate falls on a weekend.

In [0]:
# Keep only entries whose derived day name is a weekend day.
df.where(col("Weekday").isin("Saturday", "Sunday")).show()

+----------+----+----------+-------+---------+----------+--------+------+--------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode| Weekday|
+----------+----+----------+-------+---------+----------+--------+------+--------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|Saturday|
+----------+----+----------+-------+---------+----------+--------+------+--------+



8. Calculate running total of hours per employee using window.

In [0]:
# Per-employee window ordered by date, spanning from the first row up to the current one.
window_emp = (
    Window.partitionBy("EmployeeID")
    .orderBy("WorkDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Summing over that frame gives the cumulative hours so far for each employee.
df.withColumn(
    "RunningTotal",
    sum(col("WorkHours")).over(window_emp),
).show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|           7|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|          15|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|           5|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|           6|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+



Joining DataFrames

10. Join with timesheet data and list all employees with their DeptHead.

In [0]:
# Department lookup table; no schema is given, so the columns are read as strings.
dept_df = spark.read.csv(
    "file:/Workspace/Shared/department_location.csv",
    header=True,
)

# Left join keeps every timesheet row even when its department has no lookup entry.
df_joined = df.join(dept_df, "Department", "left")

df_joined.select(
    "EmployeeID",
    "Name",
    "Department",
    "DeptHead",
).show()


+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



Pivot & Unpivot

11. Pivot table: total hours per employee per project.

In [0]:
# One row per employee, one column per project; cells hold the summed hours
# (null where the employee logged nothing on that project).
(
    df.groupBy("EmployeeID")
    .pivot("Project")
    .agg(sum(col("WorkHours")))
    .show()
)

+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



12. Unpivot example: Convert mode-specific hours into rows.

In [0]:
# Narrow the frame to the three columns needed for the per-mode breakdown.
df_mode = df.select(
    "EmployeeID",
    "WorkHours",
    "Mode",
)

# Long-form result: one row per (employee, mode) pair with the hours logged in that mode.
(
    df_mode.groupBy("EmployeeID", "Mode")
    .agg(sum(col("WorkHours")).alias("ModeHours"))
    .show()
)

+----------+------+---------+
|EmployeeID|  Mode|ModeHours|
+----------+------+---------+
|      E104|Onsite|        6|
|      E102|Remote|        8|
|      E101|Remote|       17|
|      E102|Onsite|        7|
|      E103|Remote|        5|
+----------+------+---------+



UDF & Conditional Logic

13. Create a UDF to classify work hours

In [0]:
from pyspark.sql.types import StringType

def workload_tag(hours):
    """Classify a number of logged hours into a workload band.

    Args:
        hours: Hours worked for one timesheet entry, or None when the value
            is missing.

    Returns:
        "Full" for 8 hours or more, "Partial" for 4 up to (but not including)
        8 hours, "Light" for anything below 4, and None when ``hours`` is None.
    """
    # WorkHours is a nullable column and Spark hands SQL NULL to a Python UDF
    # as None; comparing None with an int would raise TypeError and fail the job.
    if hours is None:
        return None
    if hours >= 8:
        return "Full"
    if hours >= 4:
        return "Partial"
    return "Light"

# Wrap the Python classifier as a Spark UDF that returns a string column.
workload_udf = udf(f=workload_tag, returnType=StringType())

14. Add a column WorkloadCategory using this UDF.

In [0]:
# Tag every entry with its workload band by applying the UDF to WorkHours.
df = df.withColumn(
    "WorkloadCategory",
    workload_udf(df["WorkHours"]),
)

Nulls and Cleanup

15. Introduce some nulls in Mode column.

In [0]:
# Blank out Mode for employee E104 only; every other row keeps its original value.
df_null = df.withColumn(
    "Mode",
    when(col("EmployeeID") == "E104", lit(None)).otherwise(col("Mode")),
)

df_null.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|  NULL|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+-----

16. Fill nulls with "Not Provided".

In [0]:
# Replace missing Mode values with a placeholder; other columns are left untouched.
df_filled = df_null.na.fill("Not Provided", subset=["Mode"])

df_filled.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|      Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Not Provided|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|
+----------+-----+-

17. Drop rows where WorkHours < 4.

In [0]:
# Keep entries with at least 4 hours logged (i.e. drop those with fewer than 4).
df_cleaned = df_filled.where("WorkHours >= 4")

df_cleaned.show()

+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|      Remote|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|      Onsite|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Not Provided|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|
+----------+-----+-

Advanced Conditions

18. Use when-otherwise to mark employees as "Remote Worker" if more than 80% of their entries are Remote.

In [0]:
# Share of each employee's entries that were Remote: count of Remote rows
# (each flagged as 1, everything else 0) divided by the employee's row count.
remote_ratio = df.groupBy("EmployeeID").agg(
    (
        sum(when(col("Mode") == "Remote", 1).otherwise(0))
        / count("*")
    ).alias("RemoteRatio")
)

# Attach the ratio to every timesheet row and label the employee by the 80% threshold.
df_flagged = (
    df.join(remote_ratio, "EmployeeID")
    .withColumn(
        "WorkerType",
        when(col("RemoteRatio") > 0.8, "Remote Worker").otherwise("Mixed"),
    )
)

df_flagged.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+-----------+-------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|RemoteRatio|   WorkerType|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+-----------+-------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|        1.0|Remote Worker|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|        0.5|        Mixed|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|        1.0|Remote Worker|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|        1.0|Remote Worker|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Par

19. Add a new column ExtraHours where hours > 8.

In [0]:
# Hours beyond an 8-hour day; entries at or under 8 hours get 0.
df_extra = df.withColumn(
    "ExtraHours",
    when(col("WorkHours") > 8, col("WorkHours") - 8).otherwise(0),
)

df_extra.show()

+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|ExtraHours|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+----------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|            Full|         0|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|         Partial|         0|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|         0|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|         1|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|         0|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|   

Union + Duplicate Handling

20. Append a dummy timesheet for new interns using unionByName()

In [0]:
# Placeholder timesheet entry for a new intern, keyed by column name so each
# value is tied to its column rather than to a position in a tuple.
# WorkloadCategory is included because `df` carries that column once the UDF
# step has run; without it the row is one value short of `df.columns`.
# 6 hours falls in the "Partial" band of workload_tag.
intern_record = {
    "EmployeeID": "E999",
    "Name": "Intern",
    "Department": "IT",
    "Project": "Delta",
    "WorkHours": 6,
    "WorkDate": "2024-05-05",
    "Location": "Remote",
    "Mode": "Remote",
    "Weekday": "Sunday",
    "WorkloadCategory": "Partial",
}

columns = df.columns

# Emit the values in df's own column order so the tuple width always matches
# `columns`; a column with no entry above fails fast with a KeyError.
intern_data = [tuple(intern_record[name] for name in columns)]

# Cast every column to the type df uses (e.g. the WorkDate string to a date)
# so the appended row has the same column types as df instead of relying on
# implicit coercion during the union.
df_intern = spark.createDataFrame(intern_data, columns).select(
    [col(field.name).cast(field.dataType) for field in df.schema.fields]
)

df_combined = df.unionByName(df_intern)
df_combined.show()

21. Remove duplicate rows based on all columns.

In [0]:
# Keep one copy of each row that is identical across all columns.
df_dedup = df_combined.distinct()