In [0]:
# Sample employee records, one tuple per person: (Name, Department, Salary).
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]

# Column labels, listed in the same order as the fields of each tuple.
columns = ["Name", "Department", "Salary"]

# Only column names are supplied, so the column types are inferred from the
# Python values. NOTE(review): `spark` is not defined in this notebook section;
# presumably the session object is provided by the runtime - confirm.
df = spark.createDataFrame(data, schema=columns)

In [0]:
# Exercise Set 1: Basics
# 1. Display all records in the DataFrame.
# show() prints the rows as a text table; the 7 sample rows all fit in one call.

df.show()

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [0]:
# 2. Print the schema of the DataFrame.
# Lists each column with its data type and nullability in a tree layout.

df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



In [0]:
# 3. Count total number of employees.
# Each row is one employee, so the row count is the head count. The value is
# the cell's last expression, so the notebook displays it as the cell output.

df.count()

7

In [0]:
# Exercise Set 2: Column Operations
# 4. Derive a Bonus column worth 15% of each employee's Salary.
from pyspark.sql.functions import col

# withColumn returns a new DataFrame; rebinding `df` makes the Bonus column
# available to the cells that follow.
df = df.withColumn(
    "Bonus",
    col("Salary") * 0.15,
)
df.show()

+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+



In [0]:
# 5. Derive NetPay as the sum of Salary and Bonus.
from pyspark.sql.functions import col

# Needs the Bonus column, which is created by the previous exercise's cell.
df = df.withColumn(
    "NetPay",
    col("Salary") + col("Bonus"),
)
df.show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
# Exercise Set 3: Filtering and Conditions
# 6. Show only the employees whose Department is "Engineering".
from pyspark.sql.functions import col

# where() is an alias of filter(); the result is displayed, not stored.
(
    df.where(col("Department") == "Engineering")
    .show()
)

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# 7. Show the employees earning strictly more than 60000.
from pyspark.sql.functions import col

# The comparison is strict, so a salary of exactly 60000 is excluded.
(
    df.where(col("Salary") > 60000)
    .show()
)

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
| Rahul|Engineering| 65000| 9750.0|74750.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# 8. Show every employee outside the "Marketing" department.
from pyspark.sql.functions import col

# where() is an alias of filter(); the result is displayed, not stored.
(
    df.where(col("Department") != "Marketing")
    .show()
)

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
+------+-----------+------+-------+-------+



In [0]:
# Exercise Set 4: Sorting and Limiting
# 9. Show the three highest-paid employees.
from pyspark.sql.functions import col

# Order by Salary from highest to lowest, then keep only the first three rows.
# sort() is an alias of orderBy().
(
    df.sort(col("Salary").desc())
    .limit(3)
    .show()
)

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
+------+-----------+------+-------+-------+



In [0]:
# 10. Sort the data by Department ascending and Salary descending.
# Import `col` in this cell, as every other cell that uses it does, so the cell
# does not rely on an earlier cell having already been executed.
from pyspark.sql.functions import col

# Department is the primary sort key (A-Z); within each department the rows
# are ordered from the highest Salary to the lowest.
df.orderBy(col("Department").asc(), col("Salary").desc()).show()

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Naveen|Engineering| 70000|10500.0|80500.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Ananya|         HR| 52000| 7800.0|59800.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
# Exercise Set 5: String and Case Logic
# 11. Add a new column Level:
#   "Senior" if salary > 60000
#   "Mid"    if salary is between 50000 and 60000 (both bounds inclusive)
#   "Junior" otherwise
from pyspark.sql.functions import col, when

# Branches are evaluated in order and the first match wins. between() is
# inclusive on both ends, so a salary of exactly 60000 is labelled "Mid".
df = df.withColumn(
    "Level",
    when(col("Salary") > 60000, "Senior")
    .when(col("Salary").between(50000, 60000), "Mid")
    .otherwise("Junior"),
)
df.show()

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|Ananya|         HR| 52000| 7800.0|59800.0|   Mid|
| Rahul|Engineering| 65000| 9750.0|74750.0|Senior|
| Priya|Engineering| 60000| 9000.0|69000.0|   Mid|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|Junior|
| Karan|         HR| 53000| 7950.0|60950.0|   Mid|
|Naveen|Engineering| 70000|10500.0|80500.0|Senior|
|Fatima|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+



In [0]:
# 12. Convert every employee name to uppercase.
from pyspark.sql.functions import col, upper

# Reusing the existing column name "Name" replaces that column in place
# instead of appending a new one.
df = df.withColumn(
    "Name",
    upper(col("Name")),
)
df.show()

+------+-----------+------+-------+-------+------+
|  Name| Department|Salary|  Bonus| NetPay| Level|
+------+-----------+------+-------+-------+------+
|ANANYA|         HR| 52000| 7800.0|59800.0|   Mid|
| RAHUL|Engineering| 65000| 9750.0|74750.0|Senior|
| PRIYA|Engineering| 60000| 9000.0|69000.0|   Mid|
|  ZOYA|  Marketing| 48000| 7200.0|55200.0|Junior|
| KARAN|         HR| 53000| 7950.0|60950.0|   Mid|
|NAVEEN|Engineering| 70000|10500.0|80500.0|Senior|
|FATIMA|  Marketing| 45000| 6750.0|51750.0|Junior|
+------+-----------+------+-------+-------+------+

