In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the wildcard import below replaces Python built-ins such as max/min/sum
# with Spark column functions; prefer the explicit `F.` namespace in new code.
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Start (or reuse) the Spark session used by this notebook.
spark = (
    SparkSession.builder
    .appName("AggregationsAndJoins")
    .getOrCreate()
)

# Employee records: one (Name, Department, Salary) tuple per person.
# Only column names are supplied, so Spark infers the column types.
columns_emp = ["Name", "Department", "Salary"]
employee_data = [
    ("Ananya", "Engineering", 65000),
    ("Rahul", "Marketing", 58000),
    ("Priya", "Engineering", 72000),
    ("Zoya", "HR", 53000),
    ("Karan", "Marketing", 62000),
    ("Naveen", "Engineering", 68000),
    ("Fatima", "HR", 49000),
]
df_emp = spark.createDataFrame(employee_data, schema=columns_emp)

# Performance records: one (Name, Year, Rating) tuple per person.
columns_perf = ["Name", "Year", "Rating"]
performance = [
    ("Ananya", 2023, 4.5),
    ("Rahul", 2023, 4.9),
    ("Priya", 2023, 4.3),
    ("Zoya", 2023, 3.8),
    ("Karan", 2023, 4.1),
    ("Naveen", 2023, 4.7),
    ("Fatima", 2023, 3.9),
]
df_perf = spark.createDataFrame(performance, schema=columns_perf)

# Print each frame under its caption, employees first.
for caption, frame in (("Employee Data:", df_emp), ("Performance Data:", df_perf)):
    print(caption)
    frame.show()

Employee Data:
+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|Engineering| 65000|
| Rahul|  Marketing| 58000|
| Priya|Engineering| 72000|
|  Zoya|         HR| 53000|
| Karan|  Marketing| 62000|
|Naveen|Engineering| 68000|
|Fatima|         HR| 49000|
+------+-----------+------+

Performance Data:
+------+----+------+
|  Name|Year|Rating|
+------+----+------+
|Ananya|2023|   4.5|
| Rahul|2023|   4.9|
| Priya|2023|   4.3|
|  Zoya|2023|   3.8|
| Karan|2023|   4.1|
|Naveen|2023|   4.7|
|Fatima|2023|   3.9|
+------+----+------+



In [0]:
# Department-level salary aggregations on df_emp.
# Aggregate functions are called through the explicit `F` namespace so this
# cell does not depend on the wildcard import having replaced Python's
# built-in max()/min() with Spark's column functions.

# 1. Average salary by department
avg_salary = df_emp.groupBy("Department").agg(F.avg("Salary").alias("AvgSalary"))
print("Average Salary by Department:")
avg_salary.show()

# 2. Count employees per department
count_employees = df_emp.groupBy("Department").count()
print("Employee Count by Department:")
count_employees.show()

# 3. Max & Min Salary in Engineering
# Filter first, then aggregate over the whole remaining frame (no groupBy),
# which yields a single row with both statistics.
eng_stats = (
    df_emp
    .filter(F.col("Department") == "Engineering")
    .agg(
        F.max("Salary").alias("MaxSalary"),
        F.min("Salary").alias("MinSalary"),
    )
)
print("Engineering Salary Stats:")
eng_stats.show()

Average Salary by Department:
+-----------+-----------------+
| Department|        AvgSalary|
+-----------+-----------------+
|Engineering|68333.33333333333|
|  Marketing|          60000.0|
|         HR|          51000.0|
+-----------+-----------------+

Employee Count by Department:
+-----------+-----+
| Department|count|
+-----------+-----+
|Engineering|    3|
|  Marketing|    2|
|         HR|    2|
+-----------+-----+

Engineering Salary Stats:
+---------+---------+
|MaxSalary|MinSalary|
+---------+---------+
|    72000|    65000|
+---------+---------+



In [0]:
# 4. Inner join of the employee and performance frames on the shared Name column
joined_df = df_emp.join(df_perf, on="Name", how="inner")
print("Joined DataFrame (Employee + Performance):")
joined_df.show()

# 5. Project the joined frame down to name, salary and rating
salary_rating = joined_df.select(["Name", "Salary", "Rating"])
print("Salary & Rating:")
salary_rating.show()

# 6. Keep rows that satisfy both thresholds: Rating > 4.5 and Salary > 60000
high_performers = (
    joined_df
    .where(col("Rating") > 4.5)
    .where(col("Salary") > 60000)
)
print("High Performers (Rating > 4.5 & Salary > 60000):")
high_performers.show()

Joined DataFrame (Employee + Performance):
+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Ananya|Engineering| 65000|2023|   4.5|
|Fatima|         HR| 49000|2023|   3.9|
| Karan|  Marketing| 62000|2023|   4.1|
|Naveen|Engineering| 68000|2023|   4.7|
| Priya|Engineering| 72000|2023|   4.3|
| Rahul|  Marketing| 58000|2023|   4.9|
|  Zoya|         HR| 53000|2023|   3.8|
+------+-----------+------+----+------+

Salary & Rating:
+------+------+------+
|  Name|Salary|Rating|
+------+------+------+
|Ananya| 65000|   4.5|
|Fatima| 49000|   3.9|
| Karan| 62000|   4.1|
|Naveen| 68000|   4.7|
| Priya| 72000|   4.3|
| Rahul| 58000|   4.9|
|  Zoya| 53000|   3.8|
+------+------+------+

High Performers (Rating > 4.5 & Salary > 60000):
+------+-----------+------+----+------+
|  Name| Department|Salary|Year|Rating|
+------+-----------+------+----+------+
|Naveen|Engineering| 68000|2023|   4.7|
+------+-----------+------+----+------+

In [0]:
# Window-function examples on df_emp.
# Functions are referenced through `F` so that sum() here is unambiguously
# Spark's aggregate and not Python's built-in sum(), which the wildcard
# import would otherwise have to shadow for this cell to work.

# 7. Rank employees by salary within each department
# Highest salary first, so rank 1 is the top earner of each department.
window_rank = Window.partitionBy("Department").orderBy(F.col("Salary").desc())
ranked_df = df_emp.withColumn("Rank", F.rank().over(window_rank))
print("Employees Ranked by Salary (Department-wise):")
ranked_df.show()

# 8. Cumulative salary per department
# Running total in ascending salary order; the explicit row frame covers the
# partition start up to and including the current row.
window_cumulative = (
    Window.partitionBy("Department")
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
cumulative_df = df_emp.withColumn(
    "CumulativeSalary", F.sum("Salary").over(window_cumulative)
)
print("Cumulative Salary by Department:")
cumulative_df.show()

Employees Ranked by Salary (Department-wise):
+------+-----------+------+----+
|  Name| Department|Salary|Rank|
+------+-----------+------+----+
| Priya|Engineering| 72000|   1|
|Naveen|Engineering| 68000|   2|
|Ananya|Engineering| 65000|   3|
|  Zoya|         HR| 53000|   1|
|Fatima|         HR| 49000|   2|
| Karan|  Marketing| 62000|   1|
| Rahul|  Marketing| 58000|   2|
+------+-----------+------+----+

Cumulative Salary by Department:
+------+-----------+------+----------------+
|  Name| Department|Salary|CumulativeSalary|
+------+-----------+------+----------------+
|Ananya|Engineering| 65000|           65000|
|Naveen|Engineering| 68000|          133000|
| Priya|Engineering| 72000|          205000|
|Fatima|         HR| 49000|           49000|
|  Zoya|         HR| 53000|          102000|
| Rahul|  Marketing| 58000|           58000|
| Karan|  Marketing| 62000|          120000|
+------+-----------+------+----------------+



In [0]:
# 9. Add JoinDate (random dates between 2020-2023)
# Inclusive number of days in 2020-01-01 .. 2023-12-31 (2020 is a leap year).
# Offsets are drawn from 0..1460, so 2023-12-31 is reachable; a span of
# 365 * 4 would stop at 2023-12-30.
JOIN_WINDOW_DAYS = 1461
# Fixed seed so re-running the notebook draws the same offsets instead of a
# new set of dates each time. NOTE(review): the values can still differ if the
# rows are partitioned differently between runs.
JOIN_DATE_SEED = 42

df_with_dates = df_emp.withColumn(
    "JoinDate",
    # date_add already yields a date column, so only the literal start date
    # needs to be parsed explicitly.
    F.date_add(
        F.to_date(F.lit("2020-01-01")),
        (F.rand(seed=JOIN_DATE_SEED) * JOIN_WINDOW_DAYS).cast("int"),
    ),
)
print("Employee Data with Random Join Dates:")
df_with_dates.show()

# 10. Calculate Years with Company
# Approximate tenure: elapsed days divided by 365 (leap days are ignored).
df_with_years = df_with_dates.withColumn(
    "YearsWithCompany",
    F.datediff(F.current_date(), F.col("JoinDate")) / 365,
)
print("Employee Data with Years in Company:")
df_with_years.show()

Employee Data with Random Join Dates:
+------+-----------+------+----------+
|  Name| Department|Salary|  JoinDate|
+------+-----------+------+----------+
|Ananya|Engineering| 65000|2022-01-17|
| Rahul|  Marketing| 58000|2021-12-09|
| Priya|Engineering| 72000|2021-11-08|
|  Zoya|         HR| 53000|2022-10-28|
| Karan|  Marketing| 62000|2021-10-18|
|Naveen|Engineering| 68000|2021-03-23|
|Fatima|         HR| 49000|2020-07-20|
+------+-----------+------+----------+

Employee Data with Years in Company:
+------+-----------+------+----------+-----------------+
|  Name| Department|Salary|  JoinDate| YearsWithCompany|
+------+-----------+------+----------+-----------------+
|Ananya|Engineering| 65000|2022-01-17|              3.4|
| Rahul|  Marketing| 58000|2021-12-09|3.506849315068493|
| Priya|Engineering| 72000|2021-11-08|3.591780821917808|
|  Zoya|         HR| 53000|2022-10-28|2.621917808219178|
| Karan|  Marketing| 62000|2021-10-18|3.649315068493151|
|Naveen|Engineering| 68000|2021-03-23|4

In [0]:
# Write the employee frame as CSV with a header row, replacing any previous output
csv_writer = df_emp.write.option("header", "true").mode("overwrite")
csv_writer.csv("/FileStore/tables/employee_data.csv")
print("Saved employee_data.csv to Databricks FileStore.")

# Write the joined employee/performance frame as Parquet, replacing any previous output
joined_df.write.parquet(
    "/FileStore/tables/employee_performance.parquet",
    mode="overwrite",
)
print("Saved employee_performance.parquet to Databricks FileStore.")

Saved employee_data.csv to Databricks FileStore.
Saved employee_performance.parquet to Databricks FileStore.
