In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('employees').getOrCreate()

In [3]:
from pyspark.sql import Row
data = [
Row(EmpID=101, Name="Ravi", Department="Engineering", Project="AI Engine",
Salary=95000, HoursPerWeek=42),
Row(EmpID=102, Name="Sneha", Department="Engineering", Project="Data Platform",
Salary=87000, HoursPerWeek=45),
Row(EmpID=103, Name="Kabir", Department="Marketing", Project="Product Launch",
Salary=65000, HoursPerWeek=40),
Row(EmpID=104, Name="Anita", Department="Sales", Project="Client Outreach",
Salary=70000, HoursPerWeek=38),
Row(EmpID=105, Name="Divya", Department="Engineering", Project="AI Engine",
Salary=99000, HoursPerWeek=48),
Row(EmpID=106, Name="Amit", Department="Marketing", Project="Social Media",
Salary=62000, HoursPerWeek=35),
Row(EmpID=107, Name="Priya", Department="HR", Project="Policy Revamp",
Salary=58000, HoursPerWeek=37),
Row(EmpID=108, Name="Manav", Department="Sales", Project="Lead Gen", Salary=73000,
HoursPerWeek=41),
Row(EmpID=109, Name="Neha", Department="Engineering", Project="Security Suite",
Salary=91000, HoursPerWeek=46),
Row(EmpID=110, Name="Farah", Department="HR", Project="Onboarding", Salary=60000,
HoursPerWeek=36)
]

df = spark.createDataFrame(data)
df.show(truncate=False)

+-----+-----+-----------+---------------+------+------------+
|EmpID|Name |Department |Project        |Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|101  |Ravi |Engineering|AI Engine      |95000 |42          |
|102  |Sneha|Engineering|Data Platform  |87000 |45          |
|103  |Kabir|Marketing  |Product Launch |65000 |40          |
|104  |Anita|Sales      |Client Outreach|70000 |38          |
|105  |Divya|Engineering|AI Engine      |99000 |48          |
|106  |Amit |Marketing  |Social Media   |62000 |35          |
|107  |Priya|HR         |Policy Revamp  |58000 |37          |
|108  |Manav|Sales      |Lead Gen       |73000 |41          |
|109  |Neha |Engineering|Security Suite |91000 |46          |
|110  |Farah|HR         |Onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



In [4]:
df.createOrReplaceTempView("employees_local")

In [5]:
df.createOrReplaceGlobalTempView("employees_global")

#PART A

List all employees working on the "AI Engine" project.

In [8]:
spark.sql("select * from employees_local where Project = 'AI Engine' ").show()

+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



Show all employees from the "Marketing" department with salaries greater than
60,000.

In [10]:
spark.sql("select * from employees_local where Department = 'Marketing' and salary >60000").show()

+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



Calculate the average salary for each department.

In [11]:
spark.sql('select department,avg(salary) as average from employees_local group by department').show()

+-----------+-------+
| department|average|
+-----------+-------+
|      Sales|71500.0|
|Engineering|93000.0|
|  Marketing|63500.0|
|         HR|59000.0|
+-----------+-------+



List the top 3 highest paid employees overall.

In [12]:
spark.sql('select * from employees_local order by salary desc limit 3').show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



Find employees who work more than 40 hours per week.

In [13]:
spark.sql('select * from employees_local where hoursperweek >40').show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



Group by project and display the number of employees per project.

In [17]:
spark.sql('select project, count(empid)as count from employees_local group by project').show()

+---------------+-----+
|        project|count|
+---------------+-----+
|  Data Platform|    1|
|      AI Engine|    2|
| Product Launch|    1|
|Client Outreach|    1|
| Security Suite|    1|
|  Policy Revamp|    1|
|       Lead Gen|    1|
|   Social Media|    1|
|     Onboarding|    1|
+---------------+-----+



Drop the local view. Try querying again — what happens?

In [18]:
spark.catalog.dropTempView('employees_local')

True

In [19]:
spark.sql('select * from employees_local').show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `employees_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [employees_local], [], false


# PART B

Retrieve all "HR" employees working fewer than 38 hours/week.

In [20]:
spark.sql("select * from global_temp.employees_global where department = 'HR' and hoursperweek<38").show()

+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



Calculate the total salary payout for each department.

In [21]:
spark.sql('select department,sum(salary)as totalpay from global_temp.employees_global group by department').show()

+-----------+--------+
| department|totalpay|
+-----------+--------+
|      Sales|  143000|
|Engineering|  372000|
|  Marketing|  127000|
|         HR|  118000|
+-----------+--------+



For each employee, add a derived column Status :
If HoursPerWeek > 45 → 'Overworked'
Otherwise → 'Normal'

In [24]:
spark.sql("""
select *,
CASE
  WHEN HoursPerWeek >45 then 'overworked'
  ELSE 'normal'
END AS status
from global_temp.employees_global""").show()

+-----+-----+-----------+---------------+------+------------+----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|    status|
+-----+-----+-----------+---------------+------+------------+----------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|    normal|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|    normal|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|    normal|
|  104|Anita|      Sales|Client Outreach| 70000|          38|    normal|
|  105|Divya|Engineering|      AI Engine| 99000|          48|overworked|
|  106| Amit|  Marketing|   Social Media| 62000|          35|    normal|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|    normal|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|    normal|
|  109| Neha|Engineering| Security Suite| 91000|          46|overworked|
|  110|Farah|         HR|     Onboarding| 60000|          36|    normal|
+-----+-----+-----------+---------------+------+---

Count the total number of employees working on each "Project" .

In [26]:
spark.sql('select project,count(empid) from global_temp.employees_global group by project').show()

+---------------+------------+
|        project|count(empid)|
+---------------+------------+
|  Data Platform|           1|
|      AI Engine|           2|
| Product Launch|           1|
|Client Outreach|           1|
| Security Suite|           1|
|  Policy Revamp|           1|
|       Lead Gen|           1|
|   Social Media|           1|
|     Onboarding|           1|
+---------------+------------+



List employees whose salary is above the average salary in their department.

In [29]:
spark.sql('select * from global_temp.employees_global where salary >(select avg(salary) from global_temp.employees_global)').show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



List employees whose salary is above the average salary in their department.

In [30]:
new_spark = SparkSession.builder.appName('new').getOrCreate()

In [32]:
new_spark.sql('select * from global_temp.employees_global').show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



# BONUS

Use a window function to assign rank to employees within each department based
on salary.

In [34]:
spark.sql('select *,rank() over(partition by department order by salary desc) as rank from global_temp.employees_global').show()

+-----+-----+-----------+---------------+------+------------+----+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|rank|
+-----+-----+-----------+---------------+------+------------+----+
|  105|Divya|Engineering|      AI Engine| 99000|          48|   1|
|  101| Ravi|Engineering|      AI Engine| 95000|          42|   2|
|  109| Neha|Engineering| Security Suite| 91000|          46|   3|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|   4|
|  110|Farah|         HR|     Onboarding| 60000|          36|   1|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|   2|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|   1|
|  106| Amit|  Marketing|   Social Media| 62000|          35|   2|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|   1|
|  104|Anita|      Sales|Client Outreach| 70000|          38|   2|
+-----+-----+-----------+---------------+------+------------+----+



Create another view (local or global) that only contains "Engineering"
employees.

In [35]:
spark.sql("select * from global_temp.employees_global where department='Engineering'").createOrReplaceTempView('Engineering')

In [36]:
spark.sql('select * from engineering').show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



Create a SQL view that filters out all employees working < 38 hours and saves
it as "active_employees"

In [40]:
spark.sql('select * from global_temp.employees_global where hoursperweek>38').createOrReplaceTempView('active_employees')

In [41]:
spark.sql('select * from active_employees').show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  103|Kabir|  Marketing|Product Launch| 65000|          40|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+

