In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, max, min, count, when, current_date, datediff, expr, rand
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum
from pyspark.sql.types import DateType
import datetime
import random

# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("AdvancedEmployeePerformance")
    .getOrCreate()
)

# Dataset 1: employee_data -- one (Name, Department, Salary) record per employee.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
df_emp = spark.createDataFrame(data, schema=columns)

# Dataset 2: performance_data -- one (Name, Year, Rating) record per employee.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
df_perf = spark.createDataFrame(performance, schema=columns_perf)


In [0]:
# Group once by department; both per-department summaries below reuse it.
by_department = df_emp.groupBy("Department")

# 1. Average salary by department
avg_salary_by_dept = by_department.agg(avg("Salary").alias("AvgSalary"))
avg_salary_by_dept.show()

# 2. Count of employees per department
headcount_by_dept = by_department.agg(count("*").alias("EmpCount"))
headcount_by_dept.show()

# 3. Max and Min salary in Engineering
# `max` / `min` are the column aggregates imported from pyspark.sql.functions
# at the top of the notebook, not the Python builtins of the same name.
engineering_only = df_emp.where(col("Department") == "Engineering")
engineering_only.agg(
    max("Salary").alias("MaxSalary"),
    min("Salary").alias("MinSalary"),
).show()


+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|         HR|  52500.0|
|Engineering|  65000.0|
|  Marketing|  46500.0|
+-----------+---------+

+-----------+--------+
| Department|EmpCount|
+-----------+--------+
|         HR|       2|
|Engineering|       3|
|  Marketing|       2|
+-----------+--------+

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



In [0]:
# 4. Inner join on Name: only names present in both DataFrames are kept,
#    and the shared "Name" column appears once in the result.
df_joined = df_emp.join(df_perf, how="inner", on="Name")
df_joined.show()

# 5. Show employee salary and performance rating
salary_and_rating = df_joined.select("Name", "Salary", "Rating")
salary_and_rating.show()

# 6. Filter employees with rating > 4.5 and salary > 60000
#    (both comparisons are strict, so 4.5 and 60000 themselves are excluded).
is_top_rated = col("Rating") > 4.5
is_well_paid = col("Salary") > 60000
df_joined.where(is_top_rated & is_well_paid).show()


+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



In [0]:
from pyspark.sql.window import Window

# Shared window: rows are partitioned by department and ordered from the
# highest salary to the lowest within each partition.
windowSpec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Salary").desc())
)

# 7. Rank employees by salary within department
ranked_by_salary = df_emp.withColumn("SalaryRank", rank().over(windowSpec))
ranked_by_salary.show()

# 8. Cumulative salary in each department
#    The frame is row-based and runs from the first row of the partition
#    up to and including the current row.
running_frame = windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
df_emp.withColumn("CumulativeSalary", _sum("Salary").over(running_frame)).show()


+------+-----------+------+----------+
|  Name| Department|Salary|SalaryRank|
+------+-----------+------+----------+
|Naveen|Engineering| 70000|         1|
| Rahul|Engineering| 65000|         2|
| Priya|Engineering| 60000|         3|
| Karan|         HR| 53000|         1|
|Ananya|         HR| 52000|         2|
|  Zoya|  Marketing| 48000|         1|
|Fatima|  Marketing| 45000|         2|
+------+-----------+------+----------+

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
|Naveen|Engineering| 70000|           70000|
| Rahul|Engineering| 65000|          135000|
| Priya|Engineering| 60000|          195000|
| Karan|         HR| 53000|           53000|
|Ananya|         HR| 52000|          105000|
|  Zoya|  Marketing| 48000|           48000|
|Fatima|  Marketing| 45000|           93000|
+------+-----------+------+----------------+



In [0]:
from pyspark.sql.functions import to_date, lit

# 9. Add JoinDate (random dates from 2020 to 2023)
def random_date(start_date=None, end_date=None, rng=None):
    """Return a date drawn uniformly at random from an inclusive date range.

    Called with no arguments this behaves exactly as before: it picks a day
    between 2020-01-01 and 2023-12-31 using the module-level ``random``
    generator.

    Args:
        start_date: First eligible ``datetime.date``. Defaults to 2020-01-01.
        end_date: Last eligible ``datetime.date`` (inclusive). Defaults to
            2023-12-31.
        rng: Any object exposing ``randint(a, b)``, e.g. ``random.Random(42)``,
            so callers can get reproducible draws without touching global
            state. Defaults to the ``random`` module itself.

    Returns:
        A ``datetime.date`` with ``start_date <= result <= end_date``.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if start_date is None:
        start_date = datetime.date(2020, 1, 1)
    if end_date is None:
        end_date = datetime.date(2023, 12, 31)
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")

    source = random if rng is None else rng
    # randint is inclusive on both ends, so end_date itself can be returned.
    span_days = (end_date - start_date).days
    return start_date + datetime.timedelta(days=source.randint(0, span_days))

# Seed Python's RNG so a fresh "Restart & Run All" reproduces the same
# JoinDate values instead of a new random set on every run.
random.seed(42)

# `data` is the local list df_emp was built from, so its length gives the
# number of employees without launching a Spark job (previously df_emp.count()).
join_dates = [random_date() for _ in range(len(data))]
join_dates_df = spark.createDataFrame(
    [(name, joined_on) for (name, _, _), joined_on in zip(data, join_dates)],
    ["Name", "JoinDate"],
)

# Left join keeps every employee row; the cast pins JoinDate to DateType.
df_with_dates = (
    df_emp.join(join_dates_df, on="Name", how="left")
    .withColumn("JoinDate", col("JoinDate").cast(DateType()))
)
df_with_dates.show()

# 10. YearsWithCompany: completed calendar years between JoinDate and today.
# Dividing a day count by 365 ignores leap days, so the value could tick over
# shortly before the real anniversary; months_between / 12 follows the calendar.
df_with_tenure = df_with_dates.withColumn(
    "YearsWithCompany",
    expr("CAST(FLOOR(months_between(current_date(), JoinDate) / 12) AS INT)"),
)
df_with_tenure.select("Name", "JoinDate", "YearsWithCompany").show()


+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|         HR| 52000|2020-02-03|
| Rahul|Engineering| 65000|2022-04-04|
| Priya|Engineering| 60000|2021-11-04|
|  Zoya|  Marketing| 48000|2020-07-21|
| Karan|         HR| 53000|2023-06-23|
|Naveen|Engineering| 70000|2020-09-13|
|Fatima|  Marketing| 45000|2022-08-17|
+------+-----------+------+----------+

+------+----------+----------------+
|  Name|  JoinDate|YearsWithCompany|
+------+----------+----------------+
|Ananya|2020-02-03|               5|
| Rahul|2022-04-04|               3|
| Priya|2021-11-04|               3|
|  Zoya|2020-07-21|               4|
| Karan|2023-06-23|               1|
|Naveen|2020-09-13|               4|
|Fatima|2022-08-17|               2|
+------+----------+----------------+



In [0]:
# Read back the CSV, treating the first line of each file as the header row.
# NOTE(review): no cell visible here writes these paths -- presumably a write
# step elsewhere in the notebook produces them; confirm it runs before this cell.
df_csv = spark.read.csv("/path/to/output/employee_data_csv", header=True)
df_csv.show()

# Read back the Parquet
parquet_reader = spark.read
df_parquet = parquet_reader.parquet("/path/to/output/joined_data_parquet")
df_parquet.show()



+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Ananya|         HR| 52000|
+------+-----------+------+

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|         HR| 52000|2023|   4.5|
|Fatima|  Marketing| 45000|2023|   3.9|
| Karan|         HR| 53000|2023|   4.1|
|Naveen|Engineering| 70000|2023|   4.7|
| Priya|Engineering| 60000|2023|   4.3|
| Rahul|Engineering| 65000|2023|   4.9|
|  Zoya|  Marketing| 48000|2023|   3.8|
+------+-----------+------+----+------+

