In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

# Initialize Spark
spark = SparkSession.builder.appName("EmployeeDataAnalysis").getOrCreate()

# Employee Data
employee_data = [
    ("Ananya", "HR", 50000),
    ("Rahul", "Engineering", 60000),
    ("Priya", "Engineering", 55000),
    ("Zoya", "Marketing", 45000),
    ("Karan", "HR", 52000),
    ("Naveen", "Engineering", 65000),
    ("Fatima", "Marketing", 48000)
]
columns_emp = ["Name", "Department", "Salary"]
df_emp = spark.createDataFrame(employee_data, columns_emp)

# Performance Data
performance_data = [
    ("Ananya", 4.8),
    ("Rahul", 4.5),
    ("Priya", 4.2),
    ("Zoya", 3.9),
    ("Karan", 4.6),
    ("Naveen", 4.9),
    ("Fatima", 4.0)
]
columns_perf = ["Name", "Rating"]
df_perf = spark.createDataFrame(performance_data, columns_perf)

# Project Data
project_data = [
    ("Ananya", "HR Portal", 129),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "Hoursworked"]
df_proj = spark.createDataFrame(project_data, columns_proj)

display(df_emp)
display(df_perf)
display(df_proj)

Name,Department,Salary
Ananya,HR,50000
Rahul,Engineering,60000
Priya,Engineering,55000
Zoya,Marketing,45000
Karan,HR,52000
Naveen,Engineering,65000
Fatima,Marketing,48000


Name,Rating
Ananya,4.8
Rahul,4.5
Priya,4.2
Zoya,3.9
Karan,4.6
Naveen,4.9
Fatima,4.0


Name,Project,Hoursworked
Ananya,HR Portal,129
Rahul,Data Platform,200
Priya,Data Platform,180
Zoya,Campaign Tracker,100
Karan,HR Portal,130
Naveen,ML Pipeline,220
Fatima,Campaign Tracker,90


In [0]:
# 1. Join all three DataFrames
df_combined = df_emp.join(df_perf, "Name").join(df_proj, "Name")
display(df_combined)

# 2. Total hours worked per department
hours_per_dept = df_combined.groupBy("Department").agg(sum("Hoursworked").alias("TotalHours"))
display(hours_per_dept)

# 3. Average rating per project
avg_rating_per_project = df_combined.groupBy("Project").agg(avg("Rating").alias("AvgRating"))
display(avg_rating_per_project)

Name,Department,Salary,Rating,Project,Hoursworked
Ananya,HR,50000,4.8,HR Portal,129
Priya,Engineering,55000,4.2,Data Platform,180
Rahul,Engineering,60000,4.5,Data Platform,200
Zoya,Marketing,45000,3.9,Campaign Tracker,100
Karan,HR,52000,4.6,HR Portal,130
Naveen,Engineering,65000,4.9,ML Pipeline,220
Fatima,Marketing,48000,4.0,Campaign Tracker,90


Department,TotalHours
HR,259
Engineering,600
Marketing,190


Project,AvgRating
HR Portal,4.699999999999999
Data Platform,4.35
Campaign Tracker,3.95
ML Pipeline,4.9


In [0]:
# 4. Add a row with None rating
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

perf_schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Rating", DoubleType(), True)
])

new_row = spark.createDataFrame([("Meena", None)], schema=perf_schema)

df_perf = df_perf.union(new_row)
display(df_perf)

# 5. Filter rows with null ratings
null_rows = df_perf.filter(col("Rating").isNull())
display(null_rows)

# 6. Replace nulls with department average
# First ensure we're using the fixed df_perf in our combined DF
df_combined = df_emp.join(df_perf, "Name").join(df_proj, "Name")

dept_avg = df_combined.groupBy("Department").agg(avg("Rating").alias("DeptAvgRating"))

df_perf_fixed = df_perf.join(df_emp, "Name", "left") \
                       .join(dept_avg, "Department", "left") \
                       .withColumn("Rating", 
                                  when(col("Rating").isNull(), col("DeptAvgRating")) \
                                  .otherwise(col("Rating"))) \
                       .select("Name", "Rating")

display(df_perf_fixed)

Name,Rating
Ananya,4.8
Rahul,4.5
Priya,4.2
Zoya,3.9
Karan,4.6
Naveen,4.9
Fatima,4.0
Meena,


Name,Rating
Meena,


Name,Rating
Ananya,4.8
Rahul,4.5
Priya,4.2
Zoya,3.9
Karan,4.6
Naveen,4.9
Fatima,4.0
Meena,


In [0]:
# 7. Create PerformanceCategory column
df_combined = df_combined.withColumn("PerformanceCategory",
    when(col("Rating") >= 4.7, "Excellent") \
    .when((col("Rating") >= 4.0) & (col("Rating") < 4.7), "Good") \
    .otherwise("Average"))
display(df_combined)

# 8. UDF for bonus calculation
def assign_bonus(hours):
    return 110000 if hours > 200 else 15000

bonus_udf = udf(assign_bonus, IntegerType())
df_combined = df_combined.withColumn("Bonus", bonus_udf(col("Hoursworked")))
display(df_combined)

Name,Department,Salary,Rating,Project,Hoursworked,PerformanceCategory
Ananya,HR,50000,4.8,HR Portal,129,Excellent
Priya,Engineering,55000,4.2,Data Platform,180,Good
Rahul,Engineering,60000,4.5,Data Platform,200,Good
Zoya,Marketing,45000,3.9,Campaign Tracker,100,Average
Karan,HR,52000,4.6,HR Portal,130,Good
Naveen,Engineering,65000,4.9,ML Pipeline,220,Excellent
Fatima,Marketing,48000,4.0,Campaign Tracker,90,Good


Name,Department,Salary,Rating,Project,Hoursworked,PerformanceCategory,Bonus
Ananya,HR,50000,4.8,HR Portal,129,Excellent,15000
Priya,Engineering,55000,4.2,Data Platform,180,Good,15000
Rahul,Engineering,60000,4.5,Data Platform,200,Good,15000
Zoya,Marketing,45000,3.9,Campaign Tracker,100,Average,15000
Karan,HR,52000,4.6,HR Portal,130,Good,15000
Naveen,Engineering,65000,4.9,ML Pipeline,220,Excellent,110000
Fatima,Marketing,48000,4.0,Campaign Tracker,90,Good,15000


In [0]:
# 9. Add JoinDate and MonthsWorked
df_combined = df_combined.withColumn("JoinDate", lit("2021-06-01").cast("date"))
today = datetime.now().date()
df_combined = df_combined.withColumn("Monthsworked", 
                                   months_between(lit(today), col("JoinDate")))
display(df_combined)

# 10. Employees joined before 2022
employees_before_2022 = df_combined.filter(year(col("JoinDate")) < 2022)
print(f"Employees joined before 2022: {employees_before_2022.count()}")

# 11. Union with extra employees
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000)
]
df_extra = spark.createDataFrame(extra_employees, columns_emp)
df_emp = df_emp.union(df_extra)
display(df_emp)

Name,Department,Salary,Rating,Project,Hoursworked,PerformanceCategory,Bonus,JoinDate,Monthsworked
Ananya,HR,50000,4.8,HR Portal,129,Excellent,15000,2021-06-01,48.32258065
Priya,Engineering,55000,4.2,Data Platform,180,Good,15000,2021-06-01,48.32258065
Rahul,Engineering,60000,4.5,Data Platform,200,Good,15000,2021-06-01,48.32258065
Zoya,Marketing,45000,3.9,Campaign Tracker,100,Average,15000,2021-06-01,48.32258065
Karan,HR,52000,4.6,HR Portal,130,Good,15000,2021-06-01,48.32258065
Naveen,Engineering,65000,4.9,ML Pipeline,220,Excellent,110000,2021-06-01,48.32258065
Fatima,Marketing,48000,4.0,Campaign Tracker,90,Good,15000,2021-06-01,48.32258065


Employees joined before 2022: 7


Name,Department,Salary
Ananya,HR,50000
Rahul,Engineering,60000
Priya,Engineering,55000
Zoya,Marketing,45000
Karan,HR,52000
Naveen,Engineering,65000
Fatima,Marketing,48000
Meena,HR,48000
Raj,Marketing,51000


In [0]:
# 12. Save as partitioned Parquet 
df_combined_final = df_emp.join(df_perf_fixed, "Name", "left").join(df_proj, "Name", "left")

df_combined_final.write \
    .partitionBy("Department") \
    .mode("overwrite") \
    .parquet("/mnt/data/employee_data_partitioned.parquet")

print("Final dataset saved as partitioned Parquet!")

Final dataset saved as partitioned Parquet!


In [0]:
df_reloaded = spark.read.parquet("/mnt/data/employee_data_partitioned.parquet")
display(df_reloaded)

Name,Salary,Rating,Project,Hoursworked,Department
Rahul,60000,4.5,Data Platform,200.0,Engineering
Priya,55000,4.2,Data Platform,180.0,Engineering
Fatima,48000,4.0,Campaign Tracker,90.0,Marketing
Zoya,45000,3.9,Campaign Tracker,100.0,Marketing
Naveen,65000,4.9,ML Pipeline,220.0,Engineering
Ananya,50000,4.8,HR Portal,129.0,HR
Karan,52000,4.6,HR Portal,130.0,HR
Meena,48000,,,,HR
Raj,51000,,,,Marketing
