In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, lit, when

# Create (or reuse) the Spark session used by every cell in this notebook
spark = SparkSession.builder.appName("Quiz2").getOrCreate()
print("Spark Session Created Successfully!")

# Read the CSV file, treating the first row as the header.
# inferSchema=True makes Spark detect numeric columns; without it every column
# is loaded as a string, and sorting on such a column is lexicographic
# (e.g. "9000" would sort after "10000") rather than numeric.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("OfficeData.csv")
)
print("Basic DataFrame loaded:")
df.show()

Spark Session Created Successfully!
Basic DataFrame loaded:
+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [8]:
# Create a DataFrame sorted on bonus in ascending order.
# The sort key is cast to int so the ordering is numeric even when the CSV
# columns were loaded as strings (a string sort compares character by character,
# so "9000" would come after "10000").
sorted_df = df.sort(col("bonus").cast("int").asc())
print("DataFrame sorted by bonus in ascending order:")
sorted_df.show()

DataFrame sorted by bonus in ascending order:
+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+



In [9]:
# Create a DataFrame sorted on age (descending), then salary (ascending) as the tie-breaker.
# Both sort keys are cast to int so the ordering is numeric even when the CSV
# columns were loaded as strings (a string sort would rank "9" above "56").
sorted_age_salary_df = df.sort(
    col("age").cast("int").desc(),
    col("salary").cast("int").asc(),
)
print("DataFrame sorted by age (descending) and salary (ascending):")
sorted_age_salary_df.show()

DataFrame sorted by age (descending) and salary (ascending):
+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



In [10]:
# Create a DataFrame sorted on age (descending), then bonus (descending),
# then salary (ascending); each later key only breaks ties in the earlier ones.
# All sort keys are cast to int so the ordering is numeric even when the CSV
# columns were loaded as strings (string comparison is character by character).
sorted_age_bonus_salary_df = df.sort(
    col("age").cast("int").desc(),
    col("bonus").cast("int").desc(),
    col("salary").cast("int").asc(),
)
print("DataFrame sorted by age (descending), bonus (descending) and salary (ascending):")
sorted_age_bonus_salary_df.show()

DataFrame sorted by age (descending), bonus (descending) and salary (ascending):
+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



In [14]:
# Add an "increment" column whose value depends on the employee's state:
#   NY -> 10% of salary + 5% of bonus
#   CA -> 12% of salary + 3% of bonus
# Rows whose state is neither NY nor CA get an increment of 0.

# Per-state increment formulas, named so the rule below reads like the spec
ny_increment = col("salary") * 0.10 + col("bonus") * 0.05
ca_increment = col("salary") * 0.12 + col("bonus") * 0.03

increment_rule = (
    when(col("state") == "NY", ny_increment)
    .when(col("state") == "CA", ca_increment)
    .otherwise(lit(0))
)
increment_df = df.withColumn("increment", increment_rule)

print("DataFrame with increment column:")
increment_df.show()

DataFrame with increment column:
+-------------+----------+-----+------+---+-----+---------+
|employee_name|department|state|salary|age|bonus|increment|
+-------------+----------+-----+------+---+-----+---------+
|        James|     Sales|   NY| 90000| 34|10000|   9500.0|
|      Michael|     Sales|   NY| 86000| 56|20000|   9600.0|
|       Robert|     Sales|   CA| 81000| 30|23000|  10410.0|
|        Maria|   Finance|   CA| 90000| 24|23000|  11490.0|
|        Raman|   Finance|   CA| 99000| 40|24000|  12600.0|
|        Scott|   Finance|   NY| 83000| 36|19000|   9250.0|
|          Jen|   Finance|   NY| 79000| 53|15000|   8650.0|
|         Jeff| Marketing|   CA| 80000| 25|18000|  10140.0|
|        Kumar| Marketing|   NY| 91000| 50|21000|  10150.0|
+-------------+----------+-----+------+---+-----+---------+

