In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, max, min, when, current_date, datediff, lit
from pyspark.sql.functions import rand, expr, monotonically_increasing_id
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, sum as _sum

# A single Spark session is shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("AdvancedEmployeeAnalysis")
    .getOrCreate()
)

# Dataset 1: employee_data -- one (Name, Department, Salary) tuple per employee.
columns = ["Name", "Department", "Salary"]
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
df_emp = spark.createDataFrame(data, schema=columns)

# Dataset 2: performance_data -- one (Name, Year, Rating) tuple per employee.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
df_perf = spark.createDataFrame(performance, schema=columns_perf)


GroupBy and Aggregations

1. Get the average salary by department.

In [0]:
# Mean salary for each department, exposed under the column name AvgSalary.
(
    df_emp
    .groupBy("Department")
    .agg(avg("Salary").alias("AvgSalary"))
    .show()
)

+-----------+---------+
| Department|AvgSalary|
+-----------+---------+
|         HR|  52500.0|
|Engineering|  65000.0|
|  Marketing|  46500.0|
+-----------+---------+



2. Count of employees per department.

In [0]:
# Number of employee rows in each department.
(
    df_emp
    .groupBy("Department")
    .agg(count("*").alias("EmployeeCount"))
    .show()
)

+-----------+-------------+
| Department|EmployeeCount|
+-----------+-------------+
|         HR|            2|
|Engineering|            3|
|  Marketing|            2|
+-----------+-------------+



3. Maximum and minimum salary in Engineering.

In [0]:
# Highest and lowest salary among Engineering rows only.
# `max` / `min` are the names imported from pyspark.sql.functions in the
# setup cell, not the Python built-ins.
(
    df_emp
    .where(col("Department") == "Engineering")
    .agg(max("Salary").alias("MaxSalary"), min("Salary").alias("MinSalary"))
    .show()
)

+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    70000|    60000|
+---------+---------+



Join and Combine Data

4. Perform an inner join between employee_data and performance_data on Name.

In [0]:
# Inner join on the shared Name column: only names present in both frames
# are kept, and Name appears once in the result.
df_joined = df_emp.join(df_perf, "Name", "inner")

5. Show each employee's salary and performance rating.

In [0]:
# Project the joined frame down to the three columns of interest.
(
    df_joined
    .select(col("Name"), col("Salary"), col("Rating"))
    .show()
)

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 52000|   4.5|
|Fatima| 45000|   3.9|
| Karan| 53000|   4.1|
|Naveen| 70000|   4.7|
| Priya| 60000|   4.3|
| Rahul| 65000|   4.9|
|  Zoya| 48000|   3.8|
+------+------+------+



6. Filter employees with rating > 4.5 and salary > 60000.

In [0]:
# Keep rows that satisfy both thresholds: Salary above 60000 and Rating above 4.5.
(
    df_joined
    .where((col("Salary") > 60000) & (col("Rating") > 4.5))
    .show()
)

+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 70000|2023|   4.7|
| Rahul|Engineering| 65000|2023|   4.9|
+------+-----------+------+----+------+



Window & Rank

7. Rank employees by salary department-wise.

In [0]:
# Per-department window, highest salary first; rank 1 is the top earner
# of each department.
windowSpec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Salary").desc())
)
df_ranked = df_emp.withColumn("SalaryRank", rank().over(windowSpec))
df_ranked.show()

+------+-----------+------+----------+
|  Name| Department|Salary|SalaryRank|
+------+-----------+------+----------+
|Naveen|Engineering| 70000|         1|
| Rahul|Engineering| 65000|         2|
| Priya|Engineering| 60000|         3|
| Karan|         HR| 53000|         1|
|Ananya|         HR| 52000|         2|
|  Zoya|  Marketing| 48000|         1|
|Fatima|  Marketing| 45000|         2|
+------+-----------+------+----------+



8. Calculate cumulative salary in each department.

In [0]:
# Running total of Salary inside each department. Reuses `windowSpec`
# (partition by Department, Salary descending) and restricts the frame to
# "first row of the partition up to the current row".
df_cumulative = df_emp.withColumn(
    "CumulativeSalary",
    _sum("Salary").over(
        windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow)
    ),
)
df_cumulative.show()

+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
|Naveen|Engineering| 70000|           70000|
| Rahul|Engineering| 65000|          135000|
| Priya|Engineering| 60000|          195000|
| Karan|         HR| 53000|           53000|
|Ananya|         HR| 52000|          105000|
|  Zoya|  Marketing| 48000|           48000|
|Fatima|  Marketing| 45000|           93000|
+------+-----------+------+----------------+



Date Operations

9. Add a new column JoinDate

In [0]:
from pyspark.sql.functions import to_date

# Synthetic JoinDate: 2020-01-01 plus a pseudo-random offset of 0..1459 days
# (1460 = 4 * 365, so generated dates fall between 2020-01-01 and 2023-12-30).
# The `to_date` inside the string is Spark SQL's function, evaluated by expr().
#
# rand() is given a fixed seed so that Restart Kernel -> Run All draws the same
# offsets instead of a fresh set every session.
# NOTE(review): seeded rand() is presumably only repeatable when df_emp is
# partitioned the same way on each run -- confirm on the target cluster.
df_dates = df_emp.withColumn(
    "JoinDate",
    expr("date_add(to_date('2020-01-01'), cast(rand(42) * 1460 as int))"),
)
df_dates.show()

+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|         HR| 52000|2020-10-04|
| Rahul|Engineering| 65000|2023-01-17|
| Priya|Engineering| 60000|2023-08-28|
|  Zoya|  Marketing| 48000|2021-08-05|
| Karan|         HR| 53000|2021-01-28|
|Naveen|Engineering| 70000|2021-10-18|
|Fatima|  Marketing| 45000|2020-11-08|
+------+-----------+------+----------+



10. Add column YearsWithCompany

In [0]:
from pyspark.sql.functions import to_date

# Completed years of service as of today.
# Dividing a day count by 365 ignores leap days and over-counts shortly before
# an anniversary (2020-01-01 -> 2024-12-31 is 1826 days, 1826 / 365 = 5.003,
# which truncates to 5 although only 4 full years have elapsed).
# months_between() is calendar-aware, so whole months / 12 gives the number of
# completed years; the cast to int truncates the fraction and keeps the
# column's integer type.
df_tenure = df_dates.withColumn(
    "YearsWithCompany",
    expr("cast(months_between(current_date(), JoinDate) / 12 as int)"),
)
df_tenure.show()

+------+-----------+------+----------+----------------+
|  Name| Department|Salary|  JoinDate|YearsWithCompany|
+------+-----------+------+----------+----------------+
|Ananya|         HR| 52000|2020-10-04|               4|
| Rahul|Engineering| 65000|2023-01-17|               2|
| Priya|Engineering| 60000|2023-08-28|               1|
|  Zoya|  Marketing| 48000|2021-08-05|               3|
| Karan|         HR| 53000|2021-01-28|               4|
|Naveen|Engineering| 70000|2021-10-18|               3|
|Fatima|  Marketing| 45000|2020-11-08|               4|
+------+-----------+------+----------+----------------+



Writing to Files

11. Write the full employee DataFrame to CSV with headers.

In [0]:
# Persist the employee frame as CSV with a header row, replacing any
# previous output at the same path.
df_emp.write.csv("/tmp/employee_data_csv", mode="overwrite", header=True)


12. Save the joined DataFrame to a Parquet file.

In [0]:
# Persist the joined employee/performance frame as Parquet, replacing any
# previous output at the same path.
df_joined.write.parquet("/tmp/employee_performance_parquet", mode="overwrite")