In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, max, min, when, current_date, datediff, lit
from pyspark.sql.functions import rand, expr, monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum

# Reuse the active Spark session if there is one; otherwise start a new one
# named for this notebook.
spark = (
    SparkSession.builder
    .appName("AdvancedEmployeeAnalysis")
    .getOrCreate()
)

# Dataset 1: employee_data — one (Name, Department, Salary) row per employee.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
# Only column names are supplied, so Spark infers the column types from the
# Python values.
df_emp = spark.createDataFrame(data, schema=columns)

# Dataset 2: performance_data — one (Name, Year, Rating) row per employee.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
df_perf = spark.createDataFrame(performance, schema=columns_perf)

# Dataset 3: project_data — one (Name, Project, HoursWorked) row per employee.
columns_proj = ["Name", "Project", "HoursWorked"]
project_data = [
    ("Ananya", "HR Portal", 120),
    ("Rahul", "Data Platform", 200),
    ("Priya", "Data Platform", 180),
    ("Zoya", "Campaign Tracker", 100),
    ("Karan", "HR Portal", 130),
    ("Naveen", "ML Pipeline", 220),
    ("Fatima", "Campaign Tracker", 90),
]
df_proj = spark.createDataFrame(project_data, schema=columns_proj)


Joins and Advanced Aggregations

1. Join employee_data, performance_data, and project_data.

In [0]:
# Start from the employee table and left-join the other two on Name, so every
# employee row is kept even when it has no matching rating or project row.
df_all = (
    df_emp
    .join(df_perf, on="Name", how="left")
    .join(df_proj, on="Name", how="left")
)
df_all.show()


+------+-----------+------+----+------+----------------+-----------+
|  Name| Department|Salary|Year|Rating|         Project|HoursWorked|
+------+-----------+------+----+------+----------------+-----------+
|Ananya|         HR| 52000|2023|   4.5|       HR Portal|        120|
| Priya|Engineering| 60000|2023|   4.3|   Data Platform|        180|
| Rahul|Engineering| 65000|2023|   4.9|   Data Platform|        200|
|  Zoya|  Marketing| 48000|2023|   3.8|Campaign Tracker|        100|
| Karan|         HR| 53000|2023|   4.1|       HR Portal|        130|
|Naveen|Engineering| 70000|2023|   4.7|     ML Pipeline|        220|
|Fatima|  Marketing| 45000|2023|   3.9|Campaign Tracker|         90|
+------+-----------+------+----+------+----------------+-----------+



2. Compute total hours worked per department.

In [0]:
# Total project hours per department.
# The aggregate is aliased directly instead of renaming Spark's auto-generated
# "sum(HoursWorked)" column afterwards: withColumnRenamed is a silent no-op
# when the source name does not match, so a typo or a change in the generated
# name would leave the column unrenamed without any error.
df_all.groupBy("Department").agg(_sum("HoursWorked").alias("TotalHours")).show()


+-----------+----------+
| Department|TotalHours|
+-----------+----------+
|         HR|       250|
|Engineering|       600|
|  Marketing|       190|
+-----------+----------+



3. Compute average rating per project.

In [0]:
# Average rating per project.
# The aggregate is aliased directly instead of renaming Spark's auto-generated
# "avg(Rating)" column afterwards: withColumnRenamed is a silent no-op when the
# source name does not match, so the rename could fail without any error.
df_all.groupBy("Project").agg(avg("Rating").alias("AvgRating")).show()


+----------------+------------------+
|         Project|         AvgRating|
+----------------+------------------+
|   Data Platform|               4.6|
|       HR Portal|               4.3|
|     ML Pipeline|               4.7|
|Campaign Tracker|3.8499999999999996|
+----------------+------------------+



Handling Missing Data

4. Add a row to performance_data with a None rating.

In [0]:
from pyspark.sql import Row

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
# Reuse df_perf's own schema for the extra row. df_perf was built from Python
# int/float values with inferred types, so a hand-written IntegerType/FloatType
# schema does not match it: the union would then depend on implicit type
# widening, and a non-null single-precision rating would not round-trip
# exactly once widened to double.
schema = df_perf.schema
# A plain tuple is matched to the schema by position (Name, Year, Rating).
new_row = spark.createDataFrame([("John", 2023, None)], schema=schema)

# union() pairs columns by position; that is safe here because both frames
# share exactly the same schema.
df_perf_with_null = df_perf.union(new_row)
df_perf_with_null.show()


+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
|  John|2023|  NULL|
+------+----+------+



5. Filter rows with null values.

In [0]:
# Show only the rows that have no rating recorded.
missing_rating = col("Rating").isNull()
df_perf_with_null.where(missing_rating).show()


+----+----+------+
|Name|Year|Rating|
+----+----+------+
|John|2023|  NULL|
+----+----+------+



6. Replace null ratings with the department average.

In [0]:
# Attach each rated person's department; names that are not in df_emp get a
# null Department because this is a left join.
df_perf_dept = df_perf_with_null.join(df_emp, on="Name", how="left")

# Mean rating per department.
avg_rating_by_dept = df_perf_dept.groupBy("Department").agg(
    avg("Rating").alias("DeptAvgRating")
)

# Keep an existing rating; otherwise fall back to the department mean.
# NOTE(review): a row whose Department is null finds no match in the join
# below, so its FinalRating stays null (this is the case for "John").
final_rating = (
    when(col("Rating").isNotNull(), col("Rating"))
    .otherwise(col("DeptAvgRating"))
)
df_filled = (
    df_perf_dept
    .join(avg_rating_by_dept, on="Department", how="left")
    .withColumn("FinalRating", final_rating)
)

df_filled.select("Name", "Department", "FinalRating").show()


+------+-----------+-----------+
|  Name| Department|FinalRating|
+------+-----------+-----------+
|Ananya|         HR|        4.5|
| Rahul|Engineering|        4.9|
| Priya|Engineering|        4.3|
|  Zoya|  Marketing|        3.8|
| Karan|         HR|        4.1|
|Naveen|Engineering|        4.7|
|Fatima|  Marketing|        3.9|
|  John|       NULL|       NULL|
+------+-----------+-----------+



Built-In Functions and UDF

7. Create a column PerformanceCategory

In [0]:
# Bucket ratings from the top down: the first matching branch wins, so the
# ">= 4.0" test only ever sees ratings below 4.7.
rating = col("Rating")
performance_category = (
    when(rating >= 4.7, "Excellent")
    .when(rating >= 4.0, "Good")
    .otherwise("Average")
)
df_perf_cat = df_perf.withColumn("PerformanceCategory", performance_category)
df_perf_cat.show()


+------+----+------+-------------------+
|  Name|Year|Rating|PerformanceCategory|
+------+----+------+-------------------+
|Ananya|2023|   4.5|               Good|
| Rahul|2023|   4.9|          Excellent|
| Priya|2023|   4.3|               Good|
|  Zoya|2023|   3.8|            Average|
| Karan|2023|   4.1|               Good|
|Naveen|2023|   4.7|          Excellent|
|Fatima|2023|   3.9|            Average|
+------+----+------+-------------------+



8. Create a UDF to assign bonus

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def calculate_bonus(hours, threshold=200, high_bonus=10000, base_bonus=5000):
    """Return the bonus amount for a given number of hours worked.

    Args:
        hours: Hours worked, or None when the value is missing (a null
            column value reaches a Python UDF as None).
        threshold: Hours that must be strictly exceeded to earn the high bonus.
        high_bonus: Bonus paid when ``hours`` is greater than ``threshold``.
        base_bonus: Bonus paid otherwise.

    Returns:
        ``high_bonus`` or ``base_bonus``, or None when ``hours`` is None so
        that a missing input yields a null bonus instead of raising TypeError
        on the comparison.
    """
    if hours is None:
        return None
    return high_bonus if hours > threshold else base_bonus

# Wrap the Python function as a Spark UDF that yields an integer column.
bonus_udf = udf(calculate_bonus, returnType=IntegerType())

# Derive the Bonus column from each row's HoursWorked value.
hours_worked = col("HoursWorked")
df_with_bonus = df_proj.withColumn("Bonus", bonus_udf(hours_worked))
df_with_bonus.show()


+------+----------------+-----------+-----+
|  Name|         Project|HoursWorked|Bonus|
+------+----------------+-----------+-----+
|Ananya|       HR Portal|        120| 5000|
| Rahul|   Data Platform|        200| 5000|
| Priya|   Data Platform|        180| 5000|
|  Zoya|Campaign Tracker|        100| 5000|
| Karan|       HR Portal|        130| 5000|
|Naveen|     ML Pipeline|        220|10000|
|Fatima|Campaign Tracker|         90| 5000|
+------+----------------+-----------+-----+



Date and Time Functions

9. Add a column JoinDate

In [0]:
from pyspark.sql.functions import to_date, months_between

# Every employee is given the same fixed join date.
join_date = to_date(lit("2021-06-01"))

# MonthsWorked is the months between today and JoinDate, cast to int; it is
# computed against current_date(), so the value depends on when the cell runs.
months_worked = months_between(current_date(), col("JoinDate")).cast("int")

df_with_dates = (
    df_emp
    .withColumn("JoinDate", join_date)
    .withColumn("MonthsWorked", months_worked)
)

df_with_dates.show()


+------+-----------+------+----------+------------+
|  Name| Department|Salary|  JoinDate|MonthsWorked|
+------+-----------+------+----------+------------+
|Ananya|         HR| 52000|2021-06-01|          48|
| Rahul|Engineering| 65000|2021-06-01|          48|
| Priya|Engineering| 60000|2021-06-01|          48|
|  Zoya|  Marketing| 48000|2021-06-01|          48|
| Karan|         HR| 53000|2021-06-01|          48|
|Naveen|Engineering| 70000|2021-06-01|          48|
|Fatima|  Marketing| 45000|2021-06-01|          48|
+------+-----------+------+----------+------------+



10. Calculate how many employees joined before 2022.

In [0]:
# Count employees whose JoinDate falls strictly before 1 January 2022.
cutoff = to_date(lit("2022-01-01"))
df_with_dates.where(col("JoinDate") < cutoff).count()


7

Unions

11. Create another small team DataFrame and union() it with employee_data.

In [0]:
# A second, smaller team with the same (Name, Department, Salary) layout.
columns = ["Name", "Department", "Salary"]
extra_employees = [
    ("Meena", "HR", 48000),
    ("Raj", "Marketing", 51000),
]
df_extra = spark.createDataFrame(extra_employees, schema=columns)

# union() pairs columns by position rather than by name, so both frames must
# list their columns in the same order, as they do here.
df_union = df_emp.union(df_extra)
df_union.show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
| Meena|         HR| 48000|
|   Raj|  Marketing| 51000|
+------+-----------+------+



Saving Results

12. Save the final merged dataset (all three tables joined) as a Parquet file partitioned
by Department.

In [0]:
# Persist the joined dataset as Parquet, partitioned by the Department column;
# "overwrite" mode replaces any data already at the target path.
(
    df_all.write
    .mode("overwrite")
    .partitionBy("Department")
    .parquet("/tmp/employee_project_performance")
)
