**Scenario: Employee Work Data for a Tech Company**

Step 1: Prepare Data in PySpark

In [36]:
from pyspark.sql import Row
# Column names, in the order they appear in the resulting DataFrame.
employee_fields = ("EmpID", "Name", "Department", "Project", "Salary", "HoursPerWeek")

# Calling Row with field names only yields a reusable row factory.
Employee = Row(*employee_fields)

# One tuple per employee, positionally matching `employee_fields`.
employee_records = [
    (101, "Ravi", "Engineering", "AI Engine", 95000, 42),
    (102, "Sneha", "Engineering", "Data Platform", 87000, 45),
    (103, "Kabir", "Marketing", "Product Launch", 65000, 40),
    (104, "Anita", "Sales", "Client Outreach", 70000, 38),
    (105, "Divya", "Engineering", "AI Engine", 99000, 48),
    (106, "Amit", "Marketing", "Social Media", 62000, 35),
    (107, "Priya", "HR", "Policy Revamp", 58000, 37),
    (108, "Manav", "Sales", "Lead Gen", 73000, 41),
    (109, "Neha", "Engineering", "Security Suite", 91000, 46),
    (110, "Farah", "HR", "Onboarding", 60000, 36),
]

data = [Employee(*record) for record in employee_records]

In [37]:
from pyspark.sql import SparkSession
# Reuse the running Spark session if there is one, otherwise start a new one
# named "EmployeeData".
session_builder = SparkSession.builder.appName("EmployeeData")
spark = session_builder.getOrCreate()

# Column names and types are inferred from the Row objects in `data`.
df = spark.createDataFrame(data)
df.show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



**Step 2: Create Views**

Create a Local Temp View

In [38]:
# Register `df` under a session-scoped name so it can be queried with spark.sql.
local_view_name = "employees_local"
df.createOrReplaceTempView(local_view_name)

Create a Global Temp View

In [39]:
# Register `df` as a global temp view; it is queried below through the
# `global_temp` database (global_temp.employees_global).
global_view_name = "employees_global"
df.createOrReplaceGlobalTempView(global_view_name)

**Part A: Exercises on Local View ( employees_local )**
1. List all employees working on the "AI Engine" project.

In [40]:
# Exercise A1: everyone assigned to the "AI Engine" project.
ai_engine_sql = "select * from employees_local where project='AI Engine';"
ai_engine_df = spark.sql(ai_engine_sql)
ai_engine_df.show()

+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



2. Show all employees from the "Marketing" department with salaries greater than
60,000.

In [41]:
# Exercise A2: Marketing employees whose salary exceeds 60,000.
marketing_sql = "select * from employees_local where department='Marketing' and Salary>60000"
marketing_df = spark.sql(marketing_sql)
marketing_df.show()

+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



3. Calculate the average salary for each department.

In [42]:
# Exercise A3: mean salary per department, exposed as column AVG_Sal.
avg_salary_sql = "select department,avg(salary) as AVG_Sal from employees_local group by department"
avg_salary_df = spark.sql(avg_salary_sql)
avg_salary_df.show()

+-----------+-------+
| department|AVG_Sal|
+-----------+-------+
|      Sales|71500.0|
|Engineering|93000.0|
|  Marketing|63500.0|
|         HR|59000.0|
+-----------+-------+



4. List the top 3 highest paid employees overall.

In [43]:
# Exercise A4: the three highest-paid employees (salary descending, first 3 rows).
top_paid_sql = "select * from employees_local order by salary desc limit 3"
top_paid_df = spark.sql(top_paid_sql)
top_paid_df.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



5. Find employees who work more than 40 hours per week.

In [44]:
# Exercise A5: employees working strictly more than 40 hours per week.
long_hours_sql = "select * from employees_local where HoursPerWeek>40"
long_hours_df = spark.sql(long_hours_sql)
long_hours_df.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



6. Group by project and display the number of employees per project.

In [45]:
# Exercise A6: head-count per project, exposed as column Emp_count.
project_count_sql = "select Project,count(EmpID) as Emp_count from employees_local group by Project"
project_count_df = spark.sql(project_count_sql)
project_count_df.show()

+---------------+---------+
|        Project|Emp_count|
+---------------+---------+
|  Data Platform|        1|
|      AI Engine|        2|
| Product Launch|        1|
|Client Outreach|        1|
| Security Suite|        1|
|  Policy Revamp|        1|
|       Lead Gen|        1|
|   Social Media|        1|
|     Onboarding|        1|
+---------------+---------+



7. Drop the local view. Try querying again — what happens?

In [46]:
# Exercise A7: drop the session-scoped view.
# "if exists" keeps this cell re-runnable: a plain DROP VIEW raises an error
# when the view has already been removed.
spark.sql("drop view if exists employees_local")
# What happens next: once the view is dropped, querying it fails with an
# AnalysisException (table or view not found). Uncomment to see the error:
# spark.sql("select * from employees_local").show()

DataFrame[]

**Part B: Exercises on Global View ( employees_global )**
1. Retrieve all "HR" employees working fewer than 38 hours/week.

In [47]:
# Exercise B1: HR employees working fewer than 38 hours per week.
# Both conditions are required; filtering on department alone would return
# every HR employee regardless of hours.
spark.sql(
    "select * from global_temp.employees_global "
    "where department='HR' and HoursPerWeek<38"
).show()

+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



2. Calculate the total salary payout for each department.

In [48]:
# Exercise B2: total salary payout per department, read from the global view.
payout_sql = "select department,sum(Salary) from global_temp.employees_global group by department"
payout_df = spark.sql(payout_sql)
payout_df.show()

+-----------+-----------+
| department|sum(Salary)|
+-----------+-----------+
|      Sales|     143000|
|Engineering|     372000|
|  Marketing|     127000|
|         HR|     118000|
+-----------+-----------+



3. For each employee, add a derived column Status:

If HoursPerWeek > 45 → 'Overworked'

Otherwise → 'Normal'

In [49]:
# Exercise B3: add a Status column -- 'Overworked' when HoursPerWeek is above
# 45, 'Normal' for everyone else.
status_sql = """select * ,
 case
 when HoursPerWeek>45 then 'Overworked'
 else 'Normal'
 End as Status
 from global_temp.employees_global
  """
status_df = spark.sql(status_sql)
status_df.show()

+-----+-----+-----------+---------------+------+------------+----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|    Status|
+-----+-----+-----------+---------------+------+------------+----------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|    Normal|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|    Normal|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|    Normal|
|  104|Anita|      Sales|Client Outreach| 70000|          38|    Normal|
|  105|Divya|Engineering|      AI Engine| 99000|          48|Overworked|
|  106| Amit|  Marketing|   Social Media| 62000|          35|    Normal|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|    Normal|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|    Normal|
|  109| Neha|Engineering| Security Suite| 91000|          46|Overworked|
|  110|Farah|         HR|     Onboarding| 60000|          36|    Normal|
+-----+-----+-----------+---------------+------+------------+----------+

4. Count the total number of employees working on each "Project".

In [50]:
# Exercise B4: number of employees on each project, read from the global view.
per_project_sql = "select project,count(EmpID) from global_temp.employees_global group by project"
per_project_df = spark.sql(per_project_sql)
per_project_df.show()

+---------------+------------+
|        project|count(EmpID)|
+---------------+------------+
|  Data Platform|           1|
|      AI Engine|           2|
| Product Launch|           1|
|Client Outreach|           1|
| Security Suite|           1|
|  Policy Revamp|           1|
|       Lead Gen|           1|
|   Social Media|           1|
|     Onboarding|           1|
+---------------+------------+



5. List employees whose salary is above the average salary in their department.

In [51]:
spark.sql("select * from global_temp.employees_global where salary>(select avg(salary) from global_temp.employees_global)").show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



6. Open a new Spark session and query "global_temp.employees_global" from there.

In [52]:
# Exercise B6: query the global view from a genuinely separate session.
# SparkSession.builder...getOrCreate() would simply return the session that is
# already running, so it cannot demonstrate cross-session visibility.
# newSession() creates a new session on the same SparkContext; the global temp
# view is reached through the `global_temp` database from there.
new_spark = spark.newSession()
new_spark.sql("select * from global_temp.employees_global").show()

+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



**Bonus Challenges**
1. Use a window function to assign rank to employees within each department based
on salary.

In [53]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
windowSpec=Window.partitionBy("department").orderBy("salary")
df.withColumn("rank",row_number().over(windowSpec)).show()

+-----+-----+-----------+---------------+------+------------+----+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|rank|
+-----+-----+-----------+---------------+------+------------+----+
|  102|Sneha|Engineering|  Data Platform| 87000|          45|   1|
|  109| Neha|Engineering| Security Suite| 91000|          46|   2|
|  101| Ravi|Engineering|      AI Engine| 95000|          42|   3|
|  105|Divya|Engineering|      AI Engine| 99000|          48|   4|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|   1|
|  110|Farah|         HR|     Onboarding| 60000|          36|   2|
|  106| Amit|  Marketing|   Social Media| 62000|          35|   1|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|   2|
|  104|Anita|      Sales|Client Outreach| 70000|          38|   1|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|   2|
+-----+-----+-----------+---------------+------+------------+----+



2. Create another view (local or global) that only contains "Engineering"
employees.

In [54]:
# Bonus 2: a session-scoped view holding only the Engineering department.
is_engineering = df["Department"] == "Engineering"
eng_df = df.where(is_engineering)
eng_df.createOrReplaceTempView("eng_employees")
eng_df.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



3. Create a SQL view that filters out all employees working < 38 hours and saves
it as "active_employees"

In [55]:
filt_view=df.filter(df.HoursPerWeek>38)
filt_view.createOrReplaceTempView("active_employees")
filt_view.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  103|Kabir|  Marketing|Product Launch| 65000|          40|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+

