In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("agg and grouping")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

In [0]:
# --- Sample data -----------------------------------------------------------
# Employee master rows: (EmpID, Name, Dept, JoinDate, Salary, ManagerID).
# JoinDate is kept as a plain "yyyy-MM-dd" string; ManagerID is None for Anita.
columns = ["EmpID", "Name", "Dept", "JoinDate", "Salary", "ManagerID"]
employee = [
    (1, "Anita", "HR", "2021-05-01", 55000, None),
    (2, "Raj", "Engineering", "2020-03-15", 80000, 1),
    (3, "Simran", "Engineering", "2022-07-10", 75000, 1),
    (4, "Aamir", "Marketing", "2019-11-20", 60000, 1),
    (5, "Nisha", "HR", "2023-01-05", 50000, 1),
]

# Performance ratings: (Name, Year, Rating).
# Karan and Fatima have a rating but no matching employee row.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Anita", 2023, 4.5),
    ("Raj", 2023, 4.9),
    ("Simran", 2023, 4.3),
    ("Aamir", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Nisha", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]

# --- DataFrames ------------------------------------------------------------
# Column types are inferred by Spark from the Python values.
df_emp = spark.createDataFrame(data=employee, schema=columns)
df_perf = spark.createDataFrame(data=performance, schema=columns_perf)

# Display employees first, then ratings.
df_emp.show()
df_perf.show()

+-----+------+-----------+----------+------+---------+
|EmpID|  Name|       Dept|  JoinDate|Salary|ManagerID|
+-----+------+-----------+----------+------+---------+
|    1| Anita|         HR|2021-05-01| 55000|     NULL|
|    2|   Raj|Engineering|2020-03-15| 80000|        1|
|    3|Simran|Engineering|2022-07-10| 75000|        1|
|    4| Aamir|  Marketing|2019-11-20| 60000|        1|
|    5| Nisha|         HR|2023-01-05| 50000|        1|
+-----+------+-----------+----------+------+---------+

+------+----+------+
|  Name|Year|Rating|
+------+----+------+
| Anita|2023|   4.5|
|   Raj|2023|   4.9|
|Simran|2023|   4.3|
| Aamir|2023|   3.8|
| Karan|2023|   4.1|
| Nisha|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



In [0]:
# GroupBy and Aggregations
# 1. Get the average salary by department.

# Mean of Salary within each Dept; the result column is named avg(Salary).
(
    df_emp
    .groupBy("Dept")
    .avg("Salary")
    .show()
)

+-----------+-----------+
|       Dept|avg(Salary)|
+-----------+-----------+
|         HR|    52500.0|
|Engineering|    77500.0|
|  Marketing|    60000.0|
+-----------+-----------+



In [0]:
# 2. Count of employees per department.

# Number of non-null EmpID values in each Dept.
headcount_by_dept = df_emp.groupBy("Dept").agg({"EmpID": "count"})
headcount_by_dept.show()

+-----------+------------+
|       Dept|count(EmpID)|
+-----------+------------+
|         HR|           2|
|Engineering|           2|
|  Marketing|           1|
+-----------+------------+



In [0]:
# 3. Maximum and minimum salary in Engineering.

from pyspark.sql.functions import max, min

# Keep only Engineering rows, then collapse them into a single max/min row.
# `max` and `min` here are the Spark column functions imported above, not the builtins.
engineering_emp = df_emp.filter(df_emp["Dept"] == "Engineering")
engineering_emp.agg(
    max("Salary").alias("Max_Salary"),
    min("Salary").alias("Min_Salary"),
).show()

+----------+----------+
|Max_Salary|Min_Salary|
+----------+----------+
|     80000|     75000|
+----------+----------+



In [0]:
# Join and Combine Data
# 4. Perform an inner join between employee_data (df_emp) and performance_data (df_perf) on Name.

# Inner join: only names present in both DataFrames survive, and the shared
# Name column appears once in the result.
df_joined = df_emp.join(
    other=df_perf,
    on=["Name"],
    how="inner",
)
df_joined.show()

+------+-----+-----------+----------+------+---------+----+------+
|  Name|EmpID|       Dept|  JoinDate|Salary|ManagerID|Year|Rating|
+------+-----+-----------+----------+------+---------+----+------+
| Aamir|    4|  Marketing|2019-11-20| 60000|        1|2023|   3.8|
| Anita|    1|         HR|2021-05-01| 55000|     NULL|2023|   4.5|
| Nisha|    5|         HR|2023-01-05| 50000|        1|2023|   4.7|
|   Raj|    2|Engineering|2020-03-15| 80000|        1|2023|   4.9|
|Simran|    3|Engineering|2022-07-10| 75000|        1|2023|   4.3|
+------+-----+-----------+----------+------+---------+----+------+



In [0]:
# 5. Show each employee’s salary and performance rating.

# Project the joined data down to the three columns of interest.
salary_rating_cols = ["Name", "Salary", "Rating"]
df_joined.select(salary_rating_cols).show()

+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
| Aamir| 60000|   3.8|
| Anita| 55000|   4.5|
| Nisha| 50000|   4.7|
|   Raj| 80000|   4.9|
|Simran| 75000|   4.3|
+------+------+------+



In [0]:
# 6. Filter employees with rating > 4.5 and salary > 60000.

# Both conditions must hold; two chained filters are equivalent to one AND-ed filter.
(
    df_joined
    .filter(df_joined["Rating"] > 4.5)
    .filter(df_joined["Salary"] > 60000)
    .show()
)

+----+-----+-----------+----------+------+---------+----+------+
|Name|EmpID|       Dept|  JoinDate|Salary|ManagerID|Year|Rating|
+----+-----+-----------+----------+------+---------+----+------+
| Raj|    2|Engineering|2020-03-15| 80000|        1|2023|   4.9|
+----+-----+-----------+----------+------+---------+----+------+



In [0]:
# Window & Rank (Bonus Challenge)
# 7. Rank employees by salary department-wise.

from pyspark.sql.window import Window
from pyspark.sql.functions import rank,sum

# One window per department, highest salary first.
window_rank = (
    Window
    .partitionBy("Dept")
    .orderBy(df_emp.Salary.desc())
)

# rank() gives tied salaries the same rank and leaves a gap after them.
df_emp.select(
    "EmpID",
    "Name",
    "Dept",
    "Salary",
    rank().over(window_rank).alias("Salary_Rank"),
).show()

+-----+------+-----------+------+-----------+
|EmpID|  Name|       Dept|Salary|Salary_Rank|
+-----+------+-----------+------+-----------+
|    2|   Raj|Engineering| 80000|          1|
|    3|Simran|Engineering| 75000|          2|
|    1| Anita|         HR| 55000|          1|
|    5| Nisha|         HR| 50000|          2|
|    4| Aamir|  Marketing| 60000|          1|
+-----+------+-----------+------+-----------+



In [0]:
# 8. Calculate cumulative salary in each department.
# Per-department window ordered by ascending salary; the frame runs from the
# first row of the partition up to and including the current row.
window_cumsum = (
    Window
    .partitionBy("Dept")
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# `sum` is the Spark column function imported in the ranking cell, not the builtin.
df_emp.select(
    "EmpID",
    "Name",
    "Dept",
    "Salary",
    sum("Salary").over(window_cumsum).alias("Cumulative_Salary"),
).show()

+-----+------+-----------+------+-----------------+
|EmpID|  Name|       Dept|Salary|Cumulative_Salary|
+-----+------+-----------+------+-----------------+
|    3|Simran|Engineering| 75000|            75000|
|    2|   Raj|Engineering| 80000|           155000|
|    5| Nisha|         HR| 50000|            50000|
|    1| Anita|         HR| 55000|           105000|
|    4| Aamir|  Marketing| 60000|            60000|
+-----+------+-----------+------+-----------------+



In [0]:
# Date Operations
# 9. Add a new column JoinDate with random dates between 2020 and 2023.

import random
from datetime import date, timedelta
from pyspark.sql.functions import to_date, lit
from pyspark.sql.types import StringType

def random_date(
    start_date: date = date(2020, 1, 1),
    end_date: date = date(2023, 12, 31),
) -> str:
    """Return a uniformly random calendar date as an ISO ``YYYY-MM-DD`` string.

    Called with no arguments it picks a date between 2020-01-01 and
    2023-12-31, both inclusive.

    Args:
        start_date: Earliest date that may be returned (inclusive).
        end_date: Latest date that may be returned (inclusive).

    Returns:
        The chosen date formatted as ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if end_date < start_date:
        raise ValueError(
            f"end_date ({end_date}) must not be earlier than start_date ({start_date})"
        )
    # randint is inclusive on both ends, so end_date itself can be drawn.
    span_days = (end_date - start_date).days
    offset_days = random.randint(0, span_days)
    return (start_date + timedelta(days=offset_days)).isoformat()

# Draw one random date per employee on the driver and key it by EmpID, so each
# date is attached to its row through a real column value.
#
# The earlier approach joined two separately built DataFrames on
# monotonically_increasing_id(). Those IDs embed the partition index, so they
# only line up when both DataFrames happen to be partitioned identically; when
# they are not, the inner join silently drops every row whose ID has no match.
#
# NOTE: assumes EmpID is unique in df_emp (true for the rows created above);
# duplicate EmpIDs would multiply rows in the join.
emp_ids = [row["EmpID"] for row in df_emp.select("EmpID").collect()]
random_dates = [random_date() for _ in emp_ids]

random_date_df = spark.createDataFrame(
    list(zip(emp_ids, random_dates)),
    ["EmpID", "RandomJoinDate"],
)

# No longer needed by this cell; the import is kept in case a later cell
# still relies on the name being in scope.
from pyspark.sql.functions import monotonically_increasing_id

# Replace the original JoinDate with the random one and parse the string into
# a DateType column. Dropping first keeps JoinDate as the last column.
df_emp_random = (
    df_emp.join(random_date_df, "EmpID")
    .drop("JoinDate")
    .withColumnRenamed("RandomJoinDate", "JoinDate")
    .withColumn("JoinDate", to_date("JoinDate", "yyyy-MM-dd"))
)

df_emp_random.show()

+-----+------+-----------+------+---------+----------+
|EmpID|  Name|       Dept|Salary|ManagerID|  JoinDate|
+-----+------+-----------+------+---------+----------+
|    1| Anita|         HR| 55000|     NULL|2021-11-02|
|    2|   Raj|Engineering| 80000|        1|2021-08-25|
|    3|Simran|Engineering| 75000|        1|2020-05-13|
|    4| Aamir|  Marketing| 60000|        1|2020-03-14|
|    5| Nisha|         HR| 50000|        1|2020-04-22|
+-----+------+-----------+------+---------+----------+



In [0]:
# 10. Add column YearsWithCompany using current_date() and datediff().

from pyspark.sql.functions import current_date, datediff, round

# Tenure in years: whole days since JoinDate divided by a 365-day year,
# rounded to one decimal place. `round` is the Spark column function imported above.
days_since_join = datediff(current_date(), "JoinDate")
df_emp_years = df_emp_random.withColumn(
    "YearsWithCompany",
    round(days_since_join / 365, 1),
)

df_emp_years.select("Name", "JoinDate", "YearsWithCompany").show()

+------+----------+----------------+
|  Name|  JoinDate|YearsWithCompany|
+------+----------+----------------+
| Anita|2021-11-02|             3.6|
|   Raj|2021-08-25|             3.8|
|Simran|2020-05-13|             5.1|
| Aamir|2020-03-14|             5.2|
| Nisha|2020-04-22|             5.1|
+------+----------+----------------+



In [0]:
# Writing to Files
# 11. Write the full employee DataFrame to CSV with headers.
# Writes the employee data (with the randomised JoinDate) as CSV with a header
# row, replacing whatever already exists at the target path.
df_emp_random.write.csv(
    "/FileStore/tables/employee_data.csv",
    mode="overwrite",
    header=True,
)

In [0]:
# 12. Save the joined DataFrame to a Parquet file.
# Persist the employee/performance join as Parquet, replacing any previous output.
df_joined.write.parquet(
    "/FileStore/tables/employee_performance.parquet",
    mode="overwrite",
)

In [0]:
# Read the CSV back; the first row supplies column names and Spark infers the
# column types. Left as the last expression so the schema repr is displayed.
spark.read.csv(
    path="/FileStore/tables/employee_data.csv",
    header=True,
    inferSchema=True,
)

DataFrame[EmpID: int, Name: string, Dept: string, Salary: int, ManagerID: int, JoinDate: date]

In [0]:
# Read the Parquet output back; left as the last expression so the notebook
# displays the stored schema.
performance_parquet_path = "/FileStore/tables/employee_performance.parquet"
spark.read.parquet(performance_parquet_path)

DataFrame[Name: string, EmpID: bigint, Dept: string, JoinDate: string, Salary: bigint, ManagerID: bigint, Year: bigint, Rating: double]