In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("combining exsiting data").getOrCreate()
spark 

In [0]:
employee = [
    (1, "Ananya", "HR", "2021-05-01", 55000, None),
    (2, "Rahul", "Engineering", "2020-03-15", 80000, 1),
    (3, "Priya", "Engineering", "2022-07-10", 75000, 1),
    (4, "Zoya", "Marketing", "2019-11-20", 60000, 1),
    (5, "Karan", "HR", "2023-01-05", 50000, 1),
    (6, "Naveen", "Engineering", "2021-08-01", 82000, 1),
    (7, "Fatima", "Marketing", "2022-09-15", 57000, 1)
]
columns = ["EmpID", "Name", "Dept", "JoinDate", "Salary", "ManagerID"]
df_emp = spark.createDataFrame(employee, columns)


columns = ["EmpID", "Name", "Dept", "JoinDate", "Salary","ManagerID"]

df_emp = spark.createDataFrame(employee, columns)
df_emp.show()

performance = [
("Ananya", 2023, 4.5),
("Rahul", 2023, 4.9),
("Priya", 2023, 4.3),
("Zoya", 2023, 3.8),
("Karan", 2023, 4.1),
("Naveen", 2023, 4.7),
("Fatima", 2023, 3.9)
]
columns_perf = ["Name", "Year", "Rating"]
df_perf = spark.createDataFrame(performance, columns_perf)
df_perf.show()

project_data = [
("Ananya", "HR Portal", 120),
("Rahul", "Data Platform", 200),
("Priya", "Data Platform", 180),
("Zoya", "Campaign Tracker", 100),
("Karan", "HR Portal", 130),
("Naveen", "ML Pipeline", 220),
("Fatima", "Campaign Tracker", 90)
]
columns_proj = ["Name", "Project", "HoursWorked"]
df_proj = spark.createDataFrame(project_data, columns_proj)
df_proj.show()

+-----+------+-----------+----------+------+---------+
|EmpID|  Name|       Dept|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1|Ananya|         HR|2021-05-01| 55000|     NULL|
|    2| Rahul|Engineering|2020-03-15| 80000|        1|
|    3| Priya|Engineering|2022-07-10| 75000|        1|
|    4|  Zoya|  Marketing|2019-11-20| 60000|        1|
|    5| Karan|         HR|2023-01-05| 50000|        1|
|    6|Naveen|Engineering|2021-08-01| 82000|        1|
|    7|Fatima|  Marketing|2022-09-15| 57000|        1|
+-----+------+-----------+----------+------+---------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+

+------+----------------+-----------+
|  Name|         Project|HoursWorked|
+------+----------------+-----------+
|Ananya|       HR Portal|        120|
| Rahul|  

In [0]:
# Joins and Advanced Aggregations
# 1. Join employee_data , performance_data , and project_data .

df_joined = df_emp.join(df_perf, on="Name", how="inner").join(df_proj, on="Name", how="inner")
df_joined.show()

+------+-----+-----------+----------+------+---------+----+------+----------------+-----------+
|  Name|EmpID|       Dept|  JoinDate|Salary|ManagerID|Year|Rating|         Project|HoursWorked|
+------+-----+-----------+----------+------+---------+----+------+----------------+-----------+
|Ananya|    1|         HR|2021-05-01| 55000|     NULL|2023|   4.5|       HR Portal|        120|
| Priya|    3|Engineering|2022-07-10| 75000|        1|2023|   4.3|   Data Platform|        180|
| Rahul|    2|Engineering|2020-03-15| 80000|        1|2023|   4.9|   Data Platform|        200|
|  Zoya|    4|  Marketing|2019-11-20| 60000|        1|2023|   3.8|Campaign Tracker|        100|
| Karan|    5|         HR|2023-01-05| 50000|        1|2023|   4.1|       HR Portal|        130|
|Naveen|    6|Engineering|2021-08-01| 82000|        1|2023|   4.7|     ML Pipeline|        220|
|Fatima|    7|  Marketing|2022-09-15| 57000|        1|2023|   3.9|Campaign Tracker|         90|
+------+-----+-----------+----------+---

In [0]:
# 2. Compute total hours worked per department.

df_joined.groupBy("Dept").agg({"HoursWorked": "sum"}).show()

+-----------+----------------+
|       Dept|sum(HoursWorked)|
+-----------+----------------+
|         HR|             250|
|Engineering|             600|
|  Marketing|             190|
+-----------+----------------+



In [0]:
# 3. Compute average rating per project.

df_joined.groupBy("Project").agg({"Rating": "avg"}).show()

+----------------+------------------+
|         Project|       avg(Rating)|
+----------------+------------------+
|       HR Portal|               4.3|
|   Data Platform|               4.6|
|Campaign Tracker|3.8499999999999996|
|     ML Pipeline|               4.7|
+----------------+------------------+



In [0]:
# Handling Missing Data (introduce some manually)
# 4. Add a row to performance_data with a None rating.

from pyspark.sql import Row
new_row = [("Mariyam", 2023, None)]

new_perf = spark.createDataFrame(new_row, df_perf.schema)

df_perf_null = df_perf.union(new_perf)
df_perf_null.show()

+-------+----+------+
|   Name|Year|Rating|
+-------+----+------+
| Ananya|2023|   4.5|
|  Rahul|2023|   4.9|
|  Priya|2023|   4.3|
|   Zoya|2023|   3.8|
|  Karan|2023|   4.1|
| Naveen|2023|   4.7|
| Fatima|2023|   3.9|
|Mariyam|2023|  NULL|
+-------+----+------+



In [0]:
# 5. Filter rows with null values.

df_perf_null.filter(df_perf_null["Rating"].isNull()).show()

+-------+----+------+
|   Name|Year|Rating|
+-------+----+------+
|Mariyam|2023|  NULL|
+-------+----+------+



In [0]:
# 6. Replace null ratings with the department average.

from pyspark.sql.functions import avg, when, col

df_perf_with_dept = df_perf_null.join(df_emp.select("Name", "Dept"), on="Name", how="left")

dept_avg = df_perf_with_dept.filter(col("Rating").isNotNull()).groupBy("Dept").agg(avg("Rating").alias("DeptAvg"))

df_with_avg = df_perf_with_dept.join(dept_avg, on="Dept", how="left")

df_filled = df_with_avg.withColumn("Rating_Filled",when(col("Rating").isNull(), col("DeptAvg")).otherwise(col("Rating")))

df_filled.select("Name", "Dept", "Rating", "Rating_Filled").show()


+-------+-----------+------+-------------+
|   Name|       Dept|Rating|Rating_Filled|
+-------+-----------+------+-------------+
| Ananya|         HR|   4.5|          4.5|
|  Rahul|Engineering|   4.9|          4.9|
|  Priya|Engineering|   4.3|          4.3|
|   Zoya|  Marketing|   3.8|          3.8|
|  Karan|         HR|   4.1|          4.1|
| Naveen|Engineering|   4.7|          4.7|
| Fatima|  Marketing|   3.9|          3.9|
|Mariyam|       NULL|  NULL|         NULL|
+-------+-----------+------+-------------+



In [0]:
# Built-In Functions and UDF
# 7. Create a column PerformanceCategory :
# Excellent (>=4.7),
# Good (4.0–4.69),
# Average (<4.0)

from pyspark.sql.functions import when

df_with_category = df_filled.withColumn(
    "PerformanceCategory",
    when(col("Rating_Filled") >= 4.7, "Excellent")
    .when((col("Rating_Filled") >= 4.0) & (col("Rating_Filled") < 4.7), "Good")
    .otherwise("Average")
)

df_with_category.show()

+-----------+-------+----+------+------------------+-------------+-------------------+
|       Dept|   Name|Year|Rating|           DeptAvg|Rating_Filled|PerformanceCategory|
+-----------+-------+----+------+------------------+-------------+-------------------+
|         HR| Ananya|2023|   4.5|               4.3|          4.5|               Good|
|Engineering|  Rahul|2023|   4.9| 4.633333333333334|          4.9|          Excellent|
|Engineering|  Priya|2023|   4.3| 4.633333333333334|          4.3|               Good|
|  Marketing|   Zoya|2023|   3.8|3.8499999999999996|          3.8|            Average|
|         HR|  Karan|2023|   4.1|               4.3|          4.1|               Good|
|Engineering| Naveen|2023|   4.7| 4.633333333333334|          4.7|          Excellent|
|  Marketing| Fatima|2023|   3.9|3.8499999999999996|          3.9|            Average|
|       NULL|Mariyam|2023|  NULL|              NULL|         NULL|            Average|
+-----------+-------+----+------+----------

In [0]:
# 8. Create a UDF to assign bonus:
# If project hours > 200 → 10,000 Else → 5,000

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

df_final = df_with_category.join(df_proj, on="Name", how="left")

def assign_bonus(hours):
    if hours is None:
        return 0
    return 10000 if hours > 200 else 5000

bonus_udf = udf(assign_bonus, IntegerType())

df_final = df_final.withColumn("Bonus", bonus_udf(col("HoursWorked")))

df_final.show()

+-------+-----------+----+------+------------------+-------------+-------------------+----------------+-----------+-----+
|   Name|       Dept|Year|Rating|           DeptAvg|Rating_Filled|PerformanceCategory|         Project|HoursWorked|Bonus|
+-------+-----------+----+------+------------------+-------------+-------------------+----------------+-----------+-----+
| Ananya|         HR|2023|   4.5|               4.3|          4.5|               Good|       HR Portal|        120| 5000|
|  Rahul|Engineering|2023|   4.9| 4.633333333333334|          4.9|          Excellent|   Data Platform|        200| 5000|
|  Priya|Engineering|2023|   4.3| 4.633333333333334|          4.3|               Good|   Data Platform|        180| 5000|
|   Zoya|  Marketing|2023|   3.8|3.8499999999999996|          3.8|            Average|Campaign Tracker|        100| 5000|
|  Karan|         HR|2023|   4.1|               4.3|          4.1|               Good|       HR Portal|        130| 5000|
| Naveen|Engineering|202

In [0]:
# Date and Time Functions
# 9. Add a column JoinDate with 2021-06-01 for all, then add MonthsWorked as
# difference from today.

from pyspark.sql.functions import lit, to_date, current_date, months_between

df_with_dates = df_emp.withColumn("JoinDate", to_date(lit("2021-06-01")))

df_with_dates = df_with_dates.withColumn(
    "MonthsWorked",
    months_between(current_date(), col("JoinDate")).cast("int")
)

df_with_dates.show()

+-----+------+-----------+----------+------+---------+------------+
|EmpID|  Name|       Dept|  JoinDate|Salary|ManagerID|MonthsWorked|
+-----+------+-----------+----------+------+---------+------------+
|    1|Ananya|         HR|2021-06-01| 55000|     NULL|          48|
|    2| Rahul|Engineering|2021-06-01| 80000|        1|          48|
|    3| Priya|Engineering|2021-06-01| 75000|        1|          48|
|    4|  Zoya|  Marketing|2021-06-01| 60000|        1|          48|
|    5| Karan|         HR|2021-06-01| 50000|        1|          48|
|    6|Naveen|Engineering|2021-06-01| 82000|        1|          48|
|    7|Fatima|  Marketing|2021-06-01| 57000|        1|          48|
+-----+------+-----------+----------+------+---------+------------+



In [0]:
# 10. Calculate how many employees joined before 2022.

from pyspark.sql.functions import col

df_joined_before_2022 = df_with_dates.filter(col("JoinDate") < to_date(lit("2022-01-01")))

df_joined_before_2022.select("Name", "JoinDate").show()

print("Total joined before 2022:", df_joined_before_2022.count())

+------+----------+
|  Name|  JoinDate|
+------+----------+
|Ananya|2021-06-01|
| Rahul|2021-06-01|
| Priya|2021-06-01|
|  Zoya|2021-06-01|
| Karan|2021-06-01|
|Naveen|2021-06-01|
|Fatima|2021-06-01|
+------+----------+

Total joined before 2022: 7


In [0]:
# Unions
# 11. Create another small team DataFrame and union() it with employee_data .
# extra_employees = [
# ("Meena", "HR", 48000),
# ("Raj", "Marketing", 51000)
# ]

from pyspark.sql import Row
extra_employees = [
(8,"Meena", "HR", None,48000,None),
(9,"Raj", "Marketing",None, 51000,None)
]
new_emp_df = spark.createDataFrame(extra_employees, df_emp.schema)
df_emp = df_emp.union(new_emp_df)

df_emp.show()

+-----+------+-----------+----------+------+---------+
|EmpID|  Name|       Dept|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1|Ananya|         HR|2021-05-01| 55000|     NULL|
|    2| Rahul|Engineering|2020-03-15| 80000|        1|
|    3| Priya|Engineering|2022-07-10| 75000|        1|
|    4|  Zoya|  Marketing|2019-11-20| 60000|        1|
|    5| Karan|         HR|2023-01-05| 50000|        1|
|    6|Naveen|Engineering|2021-08-01| 82000|        1|
|    7|Fatima|  Marketing|2022-09-15| 57000|        1|
|    8| Meena|         HR|      NULL| 48000|     NULL|
|    9|   Raj|  Marketing|      NULL| 51000|     NULL|
+-----+------+-----------+----------+------+---------+



In [0]:
# Saving Results
# 12. Save the final merged dataset (all 3 joins) as a partitioned Parquet file based on Department .

df_final = df_emp.join(df_perf_null, on="Name", how="inner").join(df_proj, on="Name", how="inner")

df_final.write.partitionBy("Dept").mode("overwrite").parquet("/tmp/merged_data_partitioned_by_dept")