In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("EmployeeTimesheetAnalysis")
    .getOrCreate()
)

# Make Google Drive visible inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the timesheet CSV on the mounted drive.
file_path = "/content/drive/MyDrive/employee_timesheet.csv"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the timesheet CSV with a header row and let Spark infer the column types.
df_inferred = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(file_path)
)
df_inferred.show()


+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+



In [6]:
# Explicit layout of the timesheet CSV as (column name, Spark type) pairs.
timesheet_columns = [
    ("EmployeeID", StringType()),
    ("Name", StringType()),
    ("Department", StringType()),
    ("Project", StringType()),
    ("WorkHours", IntegerType()),
    ("WorkDate", DateType()),
    ("Location", StringType()),
    ("Mode", StringType()),
]

# Every column is declared nullable.
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in timesheet_columns]
)

# Read the same file again, this time enforcing the schema instead of inferring it.
df_explicit = spark.read.schema(schema).option("header", True).csv(file_path)
df_explicit.show()


+----------+-----+----------+-------+---------+----------+---------+------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|
+----------+-----+----------+-------+---------+----------+---------+------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote|
+----------+-----+----------+-------+---------+----------+---------+------+



In [7]:
# Append the full day name (e.g. "Wednesday") derived from WorkDate.
weekday_name = date_format(col("WorkDate"), "EEEE").alias("Weekday")
df = df_explicit.select("*", weekday_name)
df.show()


+----------+-----+----------+-------+---------+----------+---------+------+---------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|
+----------+-----+----------+-------+---------+----------+---------+------+---------+



In [8]:
# NOTE: `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not Python's builtin.

# Total hours logged by each employee.
hours_per_employee = (
    df.groupBy("EmployeeID", "Name")
    .agg(sum(col("WorkHours")).alias("TotalHours"))
)
hours_per_employee.show()

# Mean hours per timesheet entry within each department.
avg_hours_per_dept = (
    df.groupBy("Department")
    .agg(avg(col("WorkHours")).alias("AvgHours"))
)
avg_hours_per_dept.show()


+----------+-----+----------+
|EmployeeID| Name|TotalHours|
+----------+-----+----------+
|      E103| John|         5|
|      E104|Meena|         6|
|      E102|  Raj|        15|
|      E101|Anita|        17|
+----------+-----+----------+

+----------+-----------------+
|Department|         AvgHours|
+----------+-----------------+
|        HR|              7.5|
|   Finance|              5.0|
|        IT|7.666666666666667|
+----------+-----------------+



In [9]:
# One row per employee with their total logged hours.
total_hours_df = (
    df.groupBy("EmployeeID", "Name")
    .agg(sum(col("WorkHours")).alias("TotalHours"))
)

# Global (unpartitioned) window with the highest totals first.
windowSpec = Window.orderBy(col("TotalHours").desc())

# dense_rank gives tied totals the same rank, so ties can yield more than two rows.
ranked = total_hours_df.withColumn("Rank", dense_rank().over(windowSpec))
ranked.where(col("Rank") <= 2).show()


+----------+-----+----------+----+
|EmployeeID| Name|TotalHours|Rank|
+----------+-----+----------+----+
|      E101|Anita|        17|   1|
|      E102|  Raj|        15|   2|
+----------+-----+----------+----+



In [10]:
# Show only the entries whose Weekday is Saturday or Sunday.
is_weekend = col("Weekday").isin("Saturday", "Sunday")
df.where(is_weekend).show()


+----------+----+----------+-------+---------+----------+--------+------+--------+
|EmployeeID|Name|Department|Project|WorkHours|  WorkDate|Location|  Mode| Weekday|
+----------+----+----------+-------+---------+----------+--------+------+--------+
|      E102| Raj|        HR|   Beta|        8|2024-05-04|  Mumbai|Remote|Saturday|
+----------+----+----------+-------+---------+----------+--------+------+--------+



In [11]:
# Per-employee cumulative frame: every row from the start of the partition
# up to and including the current row, ordered by WorkDate.
windowEmp = (
    Window.partitionBy("EmployeeID")
    .orderBy("WorkDate")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
running_total = sum(col("WorkHours")).over(windowEmp)
df.withColumn("RunningTotal", running_total).show()


+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|RunningTotal|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Remote|Wednesday|           8|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|          17|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Onsite|Wednesday|           7|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|          15|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|           5|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|           6|
+----------+-----+----------+-------+---------+----------+---------+------+---------+------------+



In [12]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("EmployeeTimesheetAnalysis")
    .getOrCreate()
)

# Department lookup CSV on the mounted Google Drive; column types are inferred.
dept_path = "/content/drive/MyDrive/department_location.csv"
dept_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(dept_path)
)

dept_df.show()



+----------+--------+
|Department|DeptHead|
+----------+--------+
|        IT|   Anand|
|        HR|  Shruti|
|   Finance|   Kamal|
+----------+--------+



In [13]:
# Left join keeps every timesheet row, even when its department has no lookup entry.
joined_df = df.join(dept_df, "Department", "left")
head_columns = ["EmployeeID", "Name", "Department", "DeptHead"]
joined_df.select(*head_columns).show()


+----------+-----+----------+--------+
|EmployeeID| Name|Department|DeptHead|
+----------+-----+----------+--------+
|      E101|Anita|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
|      E103| John|   Finance|   Kamal|
|      E101|Anita|        IT|   Anand|
|      E104|Meena|        IT|   Anand|
|      E102|  Raj|        HR|  Shruti|
+----------+-----+----------+--------+



In [14]:
# Employee-by-project matrix of total hours; combinations with no entries show as NULL.
hours_by_project = (
    df.groupBy("EmployeeID")
    .pivot("Project")
    .agg(sum(col("WorkHours")))
)
hours_by_project.show()


+----------+-----+----+-----+
|EmployeeID|Alpha|Beta|Gamma|
+----------+-----+----+-----+
|      E103|    5|NULL| NULL|
|      E104| NULL|NULL|    6|
|      E101|   17|NULL| NULL|
|      E102| NULL|  15| NULL|
+----------+-----+----+-----+



In [15]:
# Hours each employee logged under each work mode.
hours_column = sum(col("WorkHours")).alias("Hours")
mode_df = df.groupBy("EmployeeID", "Mode").agg(hours_column)
mode_df.show()


+----------+------+-----+
|EmployeeID|  Mode|Hours|
+----------+------+-----+
|      E104|Onsite|    6|
|      E102|Remote|    8|
|      E101|Remote|   17|
|      E102|Onsite|    7|
|      E103|Remote|    5|
+----------+------+-----+



In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def workload_tag(hours):
    """Classify a number of logged hours into a workload category.

    Args:
        hours: Hours worked, or None when the value is missing.

    Returns:
        "Full" for 8 or more hours, "Partial" for at least 4 but fewer
        than 8 hours, "Light" for fewer than 4 hours, and None when
        `hours` is None.
    """
    # A null column value reaches a Python UDF as None, and comparing None
    # with an int raises TypeError, so propagate the null instead.
    if hours is None:
        return None
    if hours >= 8:
        return "Full"
    if hours >= 4:
        return "Partial"
    return "Light"

# Wrap the Python classifier as a Spark UDF that yields a string column.
workload_udf = udf(workload_tag, returnType=StringType())


In [17]:
# Tag every entry with the workload category the UDF derives from WorkHours.
workload_category = workload_udf(col("WorkHours"))
df = df.withColumn("WorkloadCategory", workload_category)
df.select(["EmployeeID", "WorkHours", "WorkloadCategory"]).show()


+----------+---------+----------------+
|EmployeeID|WorkHours|WorkloadCategory|
+----------+---------+----------------+
|      E101|        8|            Full|
|      E102|        7|         Partial|
|      E103|        5|         Partial|
|      E101|        9|            Full|
|      E104|        6|         Partial|
|      E102|        8|            Full|
+----------+---------+----------------+



In [18]:
from pyspark.sql.functions import rand

# Blank out Mode on roughly 30% of rows (uniform draw > 0.7) to simulate
# missing data for the null-handling steps that follow.
# The fixed seed keeps the same rows nulled across runs, provided the input
# partitioning does not change; without it every downstream result
# (fill, RemoteRatio, WorkerType) would differ on each execution.
df_null = df.withColumn(
    "Mode",
    when(rand(seed=42) > 0.7, None).otherwise(col("Mode")),
)
df_null.show()


+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|  Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|  NULL|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|  NULL|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|Remote| Saturday|            Full|
+----------+-----+----------+-------+---------+----------+---------+-----

In [19]:
# Substitute a placeholder label for missing values in the Mode column only.
df_filled = df_null.na.fill("Not Provided", subset=["Mode"])
df_filled.show()


+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Not Provided|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|
+----------+-----+-

In [20]:
# Keep only the entries with at least 4 logged hours.
df_cleaned = df_filled.where("WorkHours >= 4")
df_cleaned.show()


+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|EmployeeID| Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|
+----------+-----+----------+-------+---------+----------+---------+------------+---------+----------------+
|      E101|Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Not Provided|Wednesday|            Full|
|      E102|  Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|
|      E103| John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|
|      E101|Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|
|      E104|Meena|        IT|  Gamma|        6|2024-05-03|Hyderabad|      Onsite|   Friday|         Partial|
|      E102|  Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|
+----------+-----+-

In [21]:
# Fraction of each employee's entries whose Mode is exactly "Remote".
is_remote = when(col("Mode") == "Remote", 1).otherwise(0)
remote_share = (sum(is_remote) / count("*")).alias("RemoteRatio")
remote_ratio = df_cleaned.groupBy("EmployeeID").agg(remote_share)

# Employees whose remote fraction exceeds 0.8 are labelled "Remote Worker";
# everyone else is "Mixed".
worker_type = when(col("RemoteRatio") > 0.8, "Remote Worker").otherwise("Mixed")
df_flagged = (
    df_cleaned
    .join(remote_ratio, "EmployeeID", "left")
    .withColumn("WorkerType", worker_type)
)

# One line per employee with their classification.
df_flagged.select("EmployeeID", "Name", "WorkerType").distinct().show()


+----------+-----+-------------+
|EmployeeID| Name|   WorkerType|
+----------+-----+-------------+
|      E102|  Raj|        Mixed|
|      E101|Anita|        Mixed|
|      E104|Meena|        Mixed|
|      E103| John|Remote Worker|
+----------+-----+-------------+



In [22]:
# Hours beyond 8 for an entry; entries at or under 8 hours get 0.
work_hours = col("WorkHours")
extra_hours = when(work_hours > 8, work_hours - 8).otherwise(0)
df_extra = df_flagged.withColumn("ExtraHours", extra_hours)
df_extra.select("EmployeeID", "WorkHours", "ExtraHours").show()


+----------+---------+----------+
|EmployeeID|WorkHours|ExtraHours|
+----------+---------+----------+
|      E101|        8|         0|
|      E102|        7|         0|
|      E103|        5|         0|
|      E101|        9|         1|
|      E104|        6|         0|
|      E102|        8|         0|
+----------+---------+----------+



In [24]:
from pyspark.sql import Row
from pyspark.sql.types import *

# Local import: the intern's WorkDate is supplied as a real date object.
from datetime import date

# Step 1: Define schema
# WorkDate is declared as DateType so it matches the timesheet's WorkDate
# column; a string column here would make the union depend on implicit type
# coercion and could change the column's type in the combined frame.
intern_schema = StructType([
    StructField("EmployeeID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Department", StringType(), True),
    StructField("Project", StringType(), True),
    StructField("WorkHours", IntegerType(), True),
    StructField("WorkDate", DateType(), True),
    StructField("Location", StringType(), True),
    StructField("Mode", StringType(), True),
    StructField("Weekday", StringType(), True),
    StructField("WorkloadCategory", StringType(), True),
    StructField("RemoteRatio", DoubleType(), True),
    StructField("WorkerType", StringType(), True),
    StructField("ExtraHours", IntegerType(), True),
])

# Step 2: Define intern data
# NOTE(review): RemoteRatio is left null while WorkerType is set by hand —
# confirm that is intended.
intern_data = [("E999", "Intern1", "IT", "Delta", 5, date(2024, 5, 5), "Chennai", "Remote", "Sunday",
                "Partial", None, "Remote Worker", 0)]

# Step 3: Create DataFrame with schema
intern_df = spark.createDataFrame(data=intern_data, schema=intern_schema)

# Step 4: Combine with main DataFrame
# unionByName matches columns by name rather than by position.
combined_df = df_extra.unionByName(intern_df)

# Show result
combined_df.show()



+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+-----------+-------------+----------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|RemoteRatio|   WorkerType|ExtraHours|
+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+-----------+-------------+----------+
|      E101|  Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Not Provided|Wednesday|            Full|        0.5|        Mixed|         0|
|      E102|    Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|        0.5|        Mixed|         0|
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|        1.0|Remote Worker|         0|
|      E101|  Anita|        IT|  Alpha|        9|2024-05-03|Bangalore|      Remote|   Friday|            Full|  

In [25]:
# Remove rows that are identical across every column.
final_df = combined_df.distinct()
final_df.show()


+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+-----------+-------------+----------+
|EmployeeID|   Name|Department|Project|WorkHours|  WorkDate| Location|        Mode|  Weekday|WorkloadCategory|RemoteRatio|   WorkerType|ExtraHours|
+----------+-------+----------+-------+---------+----------+---------+------------+---------+----------------+-----------+-------------+----------+
|      E103|   John|   Finance|  Alpha|        5|2024-05-02|    Delhi|      Remote| Thursday|         Partial|        1.0|Remote Worker|         0|
|      E102|    Raj|        HR|   Beta|        7|2024-05-01|   Mumbai|Not Provided|Wednesday|         Partial|        0.5|        Mixed|         0|
|      E101|  Anita|        IT|  Alpha|        8|2024-05-01|Bangalore|Not Provided|Wednesday|            Full|        0.5|        Mixed|         0|
|      E102|    Raj|        HR|   Beta|        8|2024-05-04|   Mumbai|      Remote| Saturday|            Full|  