In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, upper

# Obtain the SparkSession for this notebook: getOrCreate() reuses an
# already-running session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("EmployeeDataAnalysis")
    .getOrCreate()
)

# In-memory sample rows, one tuple per employee: (Name, Department, Salary).
data = [
    ("Ananya", "HR", 52000),
    ("Rahul", "Engineering", 65000),
    ("Priya", "Engineering", 60000),
    ("Zoya", "Marketing", 48000),
    ("Karan", "HR", 53000),
    ("Naveen", "Engineering", 70000),
    ("Fatima", "Marketing", 45000),
]
# Column names, listed in the same positional order as the tuple fields above.
columns = ["Name", "Department", "Salary"]

# Build the DataFrame every later cell works from; only column names are
# supplied here, so the column types are inferred from the Python values.
df = spark.createDataFrame(data, schema=columns)


In [0]:
# 1. Display all records (show() prints at most 20 rows by default, which
#    covers the whole 7-row dataset)
df.show()

# 2. Print the schema: each column's name, type, and nullability
df.printSchema()

# 3. Count total number of employees; being the last expression in the cell,
#    the returned number is rendered as the cell's output
df.count()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|  Zoya|  Marketing| 48000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
|Fatima|  Marketing| 45000|
+------+-----------+------+

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)



7

In [0]:
# 4. Bonus column: a flat 15% of each employee's Salary.
BONUS_RATE = 0.15
bonus_amount = col("Salary") * BONUS_RATE
df_with_bonus = df.withColumn("Bonus", bonus_amount)
df_with_bonus.show()

# 5. NetPay column: Salary plus the Bonus added in step 4, so this builds on
#    df_with_bonus rather than on the original df.
net_pay = col("Salary") + col("Bonus")
df_with_netpay = df_with_bonus.withColumn("NetPay", net_pay)
df_with_netpay.show()


+------+-----------+------+-------+
|  Name| Department|Salary|  Bonus|
+------+-----------+------+-------+
|Ananya|         HR| 52000| 7800.0|
| Rahul|Engineering| 65000| 9750.0|
| Priya|Engineering| 60000| 9000.0|
|  Zoya|  Marketing| 48000| 7200.0|
| Karan|         HR| 53000| 7950.0|
|Naveen|Engineering| 70000|10500.0|
|Fatima|  Marketing| 45000| 6750.0|
+------+-----------+------+-------+

+------+-----------+------+-------+-------+
|  Name| Department|Salary|  Bonus| NetPay|
+------+-----------+------+-------+-------+
|Ananya|         HR| 52000| 7800.0|59800.0|
| Rahul|Engineering| 65000| 9750.0|74750.0|
| Priya|Engineering| 60000| 9000.0|69000.0|
|  Zoya|  Marketing| 48000| 7200.0|55200.0|
| Karan|         HR| 53000| 7950.0|60950.0|
|Naveen|Engineering| 70000|10500.0|80500.0|
|Fatima|  Marketing| 45000| 6750.0|51750.0|
+------+-----------+------+-------+-------+



In [0]:
# Column handles shared by the three filters below.
department = col("Department")
salary = col("Salary")

# 6. Only employees in the Engineering department.
df.where(department == "Engineering").show()

# 7. Salary strictly above 60000 (someone earning exactly 60000 is excluded).
df.where(salary > 60000).show()

# 8. Everyone whose department is anything other than Marketing.
df.where(department != "Marketing").show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
|Naveen|Engineering| 70000|
+------+-----------+------+

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
| Rahul|Engineering| 65000|
|Naveen|Engineering| 70000|
+------+-----------+------+

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Ananya|         HR| 52000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Naveen|Engineering| 70000|
+------+-----------+------+



In [0]:
# 9. Three highest-paid employees: sort by Salary descending, then print
#    only the first 3 rows of the sorted result.
df.orderBy("Salary", ascending=False).show(3)

# 10. Departments in ascending order; within each department, highest
#     Salary first.
df.orderBy(["Department", "Salary"], ascending=[True, False]).show()


+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
+------+-----------+------+
only showing top 3 rows

+------+-----------+------+
|  Name| Department|Salary|
+------+-----------+------+
|Naveen|Engineering| 70000|
| Rahul|Engineering| 65000|
| Priya|Engineering| 60000|
| Karan|         HR| 53000|
|Ananya|         HR| 52000|
|  Zoya|  Marketing| 48000|
|Fatima|  Marketing| 45000|
+------+-----------+------+



In [0]:
# 11. Level column, a salary band evaluated top to bottom:
#       Salary above 60000              -> "Senior"
#       Salary in 50000..60000 (incl.)  -> "Mid"
#       anything else                   -> "Junior"
salary = col("Salary")
level = (
    when(salary > 60000, "Senior")
    .when(salary.between(50000, 60000), "Mid")  # between() is inclusive on both ends
    .otherwise("Junior")
)
df_with_level = df.withColumn("Level", level)
df_with_level.show()

# 12. Upper-cased copy of Name in a new Name_Upper column; the original Name
#     column is kept alongside it.
name_upper = upper(col("Name"))
df_upper = df.withColumn("Name_Upper", name_upper)
df_upper.show()


+------+-----------+------+------+
|  Name| Department|Salary| Level|
+------+-----------+------+------+
|Ananya|         HR| 52000|   Mid|
| Rahul|Engineering| 65000|Senior|
| Priya|Engineering| 60000|   Mid|
|  Zoya|  Marketing| 48000|Junior|
| Karan|         HR| 53000|   Mid|
|Naveen|Engineering| 70000|Senior|
|Fatima|  Marketing| 45000|Junior|
+------+-----------+------+------+

+------+-----------+------+----------+
|  Name| Department|Salary|Name_Upper|
+------+-----------+------+----------+
|Ananya|         HR| 52000|    ANANYA|
| Rahul|Engineering| 65000|     RAHUL|
| Priya|Engineering| 60000|     PRIYA|
|  Zoya|  Marketing| 48000|      ZOYA|
| Karan|         HR| 53000|     KARAN|
|Naveen|Engineering| 70000|    NAVEEN|
|Fatima|  Marketing| 45000|    FATIMA|
+------+-----------+------+----------+

