In [1]:
# Upload the input files into the Colab runtime. The cells below read
# employees.csv and departments.csv by relative path, so both must be uploaded here.
from google.colab import files
uploaded = files.upload()


Saving departments.csv to departments.csv
Saving employees.csv to employees.csv


In [2]:
!pip install pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Assessment").getOrCreate()

# Load CSVs
emp_df = spark.read.csv("employees.csv", header=True, inferSchema=True)
dept_df = spark.read.csv("departments.csv", header=True, inferSchema=True)

emp_df.show()
dept_df.show()


+------+--------+----------+------+---+
|emp_id|emp_name|department|salary|age|
+------+--------+----------+------+---+
|     1|   Alice|        HR| 60000| 30|
|     2|     Bob|        IT| 75000| 28|
|     3| Charlie|        IT| 70000| 34|
|     4|   David|   Finance| 80000| 40|
|     5|     Eve|        HR| 65000| 25|
|     6|   Frank|        IT| 90000| 38|
|     7|   Grace|   Finance| 85000| 45|
|     8|    Hank|        HR| 50000| 50|
+------+--------+----------+------+---+

+----------+---------+
|department| location|
+----------+---------+
|        HR|  Chennai|
|        IT|Bangalore|
|   Finance|   Mumbai|
+----------+---------+



In [3]:
# TRANSFORMATIONS & ACTIONS
# Filter: keep only employees whose salary is strictly greater than 70000.

high_salary = emp_df.where(emp_df["salary"] > 70000)
high_salary.show()

+------+--------+----------+------+---+
|emp_id|emp_name|department|salary|age|
+------+--------+----------+------+---+
|     2|     Bob|        IT| 75000| 28|
|     4|   David|   Finance| 80000| 40|
|     6|   Frank|        IT| 90000| 38|
|     7|   Grace|   Finance| 85000| 45|
+------+--------+----------+------+---+



In [4]:
# Join: attach department info to each employee row.
# Inner join on the shared "department" column.

joined_df = emp_df.join(
    other=dept_df,
    on="department",
    how="inner",
)
joined_df.show()


+----------+------+--------+------+---+---------+
|department|emp_id|emp_name|salary|age| location|
+----------+------+--------+------+---+---------+
|        HR|     1|   Alice| 60000| 30|  Chennai|
|        IT|     2|     Bob| 75000| 28|Bangalore|
|        IT|     3| Charlie| 70000| 34|Bangalore|
|   Finance|     4|   David| 80000| 40|   Mumbai|
|        HR|     5|     Eve| 65000| 25|  Chennai|
|        IT|     6|   Frank| 90000| 38|Bangalore|
|   Finance|     7|   Grace| 85000| 45|   Mumbai|
|        HR|     8|    Hank| 50000| 50|  Chennai|
+----------+------+--------+------+---+---------+



In [5]:
# GroupBy + Aggregation: total salary by department.

# Import the functions module under an alias instead of
# `from pyspark.sql.functions import sum`, which would replace Python's
# built-in sum() in the notebook namespace for every later cell.
from pyspark.sql import functions as F

# One output row per department with the summed salary as "total_salary".
dept_salary = emp_df.groupBy("department").agg(F.sum("salary").alias("total_salary"))
dept_salary.show()


+----------+------------+
|department|total_salary|
+----------+------------+
|        HR|      175000|
|   Finance|      165000|
|        IT|      235000|
+----------+------------+



In [6]:
# Simple aggregation: average age over all employees.

from pyspark.sql.functions import avg

# Build the aggregate column first, then select and display it.
avg_age_col = avg("age").alias("avg_age")
emp_df.select(avg_age_col).show()


+-------+
|avg_age|
+-------+
|  36.25|
+-------+



In [7]:
# Window function: rank employees by salary within their department.

from pyspark.sql.functions import rank
from pyspark.sql.window import Window

# One partition per department, ordered by salary from highest to lowest.
windowSpec = (
    Window
    .partitionBy("department")
    .orderBy(emp_df["salary"].desc())
)

# Add the per-department rank as a new "rank" column.
ranked_df = emp_df.withColumn("rank", rank().over(windowSpec))
ranked_df.show()


+------+--------+----------+------+---+----+
|emp_id|emp_name|department|salary|age|rank|
+------+--------+----------+------+---+----+
|     7|   Grace|   Finance| 85000| 45|   1|
|     4|   David|   Finance| 80000| 40|   2|
|     5|     Eve|        HR| 65000| 25|   1|
|     1|   Alice|        HR| 60000| 30|   2|
|     8|    Hank|        HR| 50000| 50|   3|
|     6|   Frank|        IT| 90000| 38|   1|
|     2|     Bob|        IT| 75000| 28|   2|
|     3| Charlie|        IT| 70000| 34|   3|
+------+--------+----------+------+---+----+



In [9]:
# Sample actions: row count and first row of the employees table.

row_count = emp_df.count()
first_row = emp_df.first()

print("Count:", row_count)
print("First row:", first_row)


Count: 8
First row: Row(emp_id=1, emp_name='Alice', department='HR', salary=60000, age=30)
