# Scenario: Employee Work Data for a Tech Company
We'll simulate data across departments, projects, salaries, and work hours.

Step 1: Prepare Data in PySpark

In [2]:
from pyspark.sql import Row, SparkSession

# Reuse the active Spark session if one exists; otherwise start one for this notebook.
spark = (
    SparkSession.builder
    .appName("SparkSQLExample")
    .getOrCreate()
)

# Row factory: the field names are fixed once here, so every record below is
# just its positional values in this column order.
Employee = Row("EmpID", "Name", "Department", "Project", "Salary", "HoursPerWeek")

data = [
    Employee(101, "Ravi", "Engineering", "AI Engine", 95000, 42),
    Employee(102, "Sneha", "Engineering", "Data Platform", 87000, 45),
    Employee(103, "Kabir", "Marketing", "Product Launch", 65000, 40),
    Employee(104, "Anita", "Sales", "Client Outreach", 70000, 38),
    Employee(105, "Divya", "Engineering", "AI Engine", 99000, 48),
    Employee(106, "Amit", "Marketing", "Social Media", 62000, 35),
    Employee(107, "Priya", "HR", "Policy Revamp", 58000, 37),
    Employee(108, "Manav", "Sales", "Lead Gen", 73000, 41),
    Employee(109, "Neha", "Engineering", "Security Suite", 91000, 46),
    Employee(110, "Farah", "HR", "Onboarding", 60000, 36),
]

# Column names and types are inferred from the Row objects.
df = spark.createDataFrame(data)
df.show(truncate=False)

+-----+-----+-----------+---------------+------+------------+
|EmpID|Name |Department |Project        |Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|101  |Ravi |Engineering|AI Engine      |95000 |42          |
|102  |Sneha|Engineering|Data Platform  |87000 |45          |
|103  |Kabir|Marketing  |Product Launch |65000 |40          |
|104  |Anita|Sales      |Client Outreach|70000 |38          |
|105  |Divya|Engineering|AI Engine      |99000 |48          |
|106  |Amit |Marketing  |Social Media   |62000 |35          |
|107  |Priya|HR         |Policy Revamp  |58000 |37          |
|108  |Manav|Sales      |Lead Gen       |73000 |41          |
|109  |Neha |Engineering|Security Suite |91000 |46          |
|110  |Farah|HR         |Onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



Step 2: Create Views   

Create a Local Temp View



In [3]:
# Expose df to spark.sql() under the name "employees_local" (replaces any existing view of that name).
df.createOrReplaceTempView(name="employees_local")

Create a Global Temp View


In [4]:
# Register df as a global temp view; the queries below reach it as global_temp.employees_global.
df.createOrReplaceGlobalTempView(name="employees_global")

# Part A: Exercises on Local View ( employees_local )
1. List all employees working on the "AI Engine" project.
2. Show all employees from the "Marketing" department with salaries greater than
60,000.
3. Calculate the average salary for each department.
4. List the top 3 highest paid employees overall.
5. Find employees who work more than 40 hours per week.
6. Group by project and display the number of employees per project.
7. Drop the local view. Try querying again — what happens?

In [5]:
# A.1 - List all employees working on the "AI Engine" project.
ai_engine_team = spark.sql('Select *from employees_local where Project ="AI Engine"')
ai_engine_team.show()

+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



In [6]:
# A.2 - Show all employees from the "Marketing" department with salaries greater than 60,000.
# The exercise asks for salaries strictly greater than 60,000, so the comparison is
# `>` rather than `>=`: an employee earning exactly 60000 must not be listed.
spark.sql('Select * from employees_local where Department="Marketing" and Salary >60000' ).show()

+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



In [7]:
# A.3 - Calculate the average salary for each department.
dept_avg_salary = spark.sql('Select Department,avg(Salary) from employees_local group by Department')
dept_avg_salary.show()

+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|      Sales|    71500.0|
|Engineering|    93000.0|
|  Marketing|    63500.0|
|         HR|    59000.0|
+-----------+-----------+



In [8]:
# A.4 - List the top 3 highest paid employees overall
# (sort by salary, highest first, and keep the first three rows).
top_three_paid = spark.sql('Select * from employees_local order by Salary desc limit 3')
top_three_paid.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [9]:
# A.5 - Find employees who work more than 40 hours per week (exactly 40 is excluded).
over_forty_hours = spark.sql('Select * from employees_local where HoursPerWeek>40')
over_forty_hours.show()

+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  108|Manav|      Sales|      Lead Gen| 73000|          41|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [10]:
# A.6 - Group by project and display the number of employees per project.
project_headcount = spark.sql('Select Project,count(*) from employees_local group by Project')
project_headcount.show()


+---------------+--------+
|        Project|count(1)|
+---------------+--------+
|  Data Platform|       1|
|      AI Engine|       2|
| Product Launch|       1|
|Client Outreach|       1|
| Security Suite|       1|
|  Policy Revamp|       1|
|       Lead Gen|       1|
|   Social Media|       1|
|     Onboarding|       1|
+---------------+--------+



# Part B: Exercises on Global View ( employees_global )
1. Retrieve all "HR" employees working fewer than 38 hours/week.
2. Calculate the total salary payout for each department.
3. For each employee, add a derived column `Status`:
   - If `HoursPerWeek > 45` → 'Overworked'
   - Otherwise → 'Normal'
4. Count the total number of employees working on each "Project" .
5. List employees whose salary is above the average salary in their department.
6. Open a new Spark session and query "global_temp.employees_global" from there.

In [11]:
# B.1 - Retrieve all "HR" employees working fewer than 38 hours/week.
hr_under_38_hours = spark.sql('Select * from global_temp.employees_global where Department="HR" and HoursPerWeek<38')
hr_under_38_hours.show()

+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



In [15]:
# B.2 - Calculate the total salary payout for each department.
dept_salary_total = spark.sql('Select Department,sum(Salary)  as Total_Salary from  global_temp.employees_global group by Department')
dept_salary_total.show()

+-----------+------------+
| Department|Total_Salary|
+-----------+------------+
|      Sales|      143000|
|Engineering|      372000|
|  Marketing|      127000|
|         HR|      118000|
+-----------+------------+



In [16]:
# B.3 - For each employee, add a derived column Status:
#   HoursPerWeek > 45 -> 'Overworked'; otherwise -> 'Normal'.
# The comparison is strict, so exactly 45 hours is still 'Normal'.
employee_status = spark.sql('''
    SELECT
        Name,
        Department,
        HoursPerWeek,
        CASE
            WHEN HoursPerWeek > 45 THEN 'Overworked'
            ELSE 'Normal'
        END AS Status
    FROM global_temp.employees_global
''')
employee_status.show()


+-----+-----------+------------+----------+
| Name| Department|HoursPerWeek|    Status|
+-----+-----------+------------+----------+
| Ravi|Engineering|          42|    Normal|
|Sneha|Engineering|          45|    Normal|
|Kabir|  Marketing|          40|    Normal|
|Anita|      Sales|          38|    Normal|
|Divya|Engineering|          48|Overworked|
| Amit|  Marketing|          35|    Normal|
|Priya|         HR|          37|    Normal|
|Manav|      Sales|          41|    Normal|
| Neha|Engineering|          46|Overworked|
|Farah|         HR|          36|    Normal|
+-----+-----------+------------+----------+



In [18]:
# B.4 - Count the total number of employees working on each "Project".
global_project_headcount = spark.sql('Select Project,count(EmpId)as Number_of_Employees from global_temp.employees_global group by Project')
global_project_headcount.show()

+---------------+-------------------+
|        Project|Number_of_Employees|
+---------------+-------------------+
|  Data Platform|                  1|
|      AI Engine|                  2|
| Product Launch|                  1|
|Client Outreach|                  1|
| Security Suite|                  1|
|  Policy Revamp|                  1|
|       Lead Gen|                  1|
|   Social Media|                  1|
|     Onboarding|                  1|
+---------------+-------------------+



In [21]:
# B.5 - List employees whose salary is above the average salary in their department.
# The subquery computes one average per department; joining it back on Department
# lets each employee row be compared against its own department's average.
above_dept_average = spark.sql('''
    SELECT
        e.Name,
        e.Department,
        e.Salary
    FROM global_temp.employees_global e
    JOIN (
        SELECT
            Department,
            AVG(Salary) AS Avg_Salary
        FROM global_temp.employees_global
        GROUP BY Department
    ) d_avg
    ON e.Department = d_avg.Department
    WHERE e.Salary > d_avg.Avg_Salary
''')
above_dept_average.show()


+-----+-----------+------+
| Name| Department|Salary|
+-----+-----------+------+
| Ravi|Engineering| 95000|
|Divya|Engineering| 99000|
|Kabir|  Marketing| 65000|
|Manav|      Sales| 73000|
|Farah|         HR| 60000|
+-----+-----------+------+



In [22]:
# B.6 - Open a new Spark session and query "global_temp.employees_global" from there.
# SparkSession.builder.getOrCreate() would return the session that is already running
# (and ignore a different appName), so it never opens a second session.
# spark.newSession() creates a separate session with its own temporary views and SQL
# configuration while sharing the same SparkContext, which is what this exercise needs
# to demonstrate that a global temp view is reachable from another session.
new_spark = spark.newSession()

In [23]:
# Read the global view through the second session handle, using the global_temp qualifier.
rows_from_new_session = new_spark.sql('SELECT * FROM global_temp.employees_global')
rows_from_new_session.show()


+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|
|  104|Anita|      Sales|Client Outreach| 70000|          38|
|  105|Divya|Engineering|      AI Engine| 99000|          48|
|  106| Amit|  Marketing|   Social Media| 62000|          35|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|
|  109| Neha|Engineering| Security Suite| 91000|          46|
|  110|Farah|         HR|     Onboarding| 60000|          36|
+-----+-----+-----------+---------------+------+------------+



# Bonus Challenges
1. Use a window function to assign rank to employees within each department based
on salary.
2. Create another view (local or global) that only contains "Engineering"
employees.
3. Create a SQL view that excludes all employees working fewer than 38 hours per
week, and save it as "active_employees".

In [26]:
# Bonus 1 - Use a window function to assign rank to employees within each department based on salary.
# NOTE(review): the DataFrame-API helpers imported here are not referenced in this cell;
# the ranking is done entirely by the SQL RANK() window function below.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, rank, desc

# Rank 1 is the highest salary in each department.
ranked_by_department = spark.sql('''
    SELECT
        *,
        RANK() OVER (PARTITION BY Department ORDER BY Salary DESC) AS salary_rank
    FROM global_temp.employees_global
''')
ranked_by_department.show()


+-----+-----+-----------+---------------+------+------------+-----------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|salary_rank|
+-----+-----+-----------+---------------+------+------------+-----------+
|  105|Divya|Engineering|      AI Engine| 99000|          48|          1|
|  101| Ravi|Engineering|      AI Engine| 95000|          42|          2|
|  109| Neha|Engineering| Security Suite| 91000|          46|          3|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|          4|
|  110|Farah|         HR|     Onboarding| 60000|          36|          1|
|  107|Priya|         HR|  Policy Revamp| 58000|          37|          2|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|          1|
|  106| Amit|  Marketing|   Social Media| 62000|          35|          2|
|  108|Manav|      Sales|       Lead Gen| 73000|          41|          1|
|  104|Anita|      Sales|Client Outreach| 70000|          38|          2|
+-----+-----+-----------+---------------+------+------------+-----------+

In [27]:
# Bonus 2 - Create another view (local or global) that only contains "Engineering" employees.
# The filtered result is registered as the global temp view "engineering_employees".
engineering_only = spark.sql('''
    SELECT *
    FROM global_temp.employees_global
    WHERE Department = 'Engineering'
''')
engineering_only.createOrReplaceGlobalTempView(name="engineering_employees")


In [28]:
# Bonus 3 - Create a view that filters out all employees working < 38 hours and save it as "active_employees".
# Keeping HoursPerWeek >= 38 drops exactly the rows with fewer than 38 hours.
at_least_38_hours = spark.sql('''
    SELECT *
    FROM global_temp.employees_global
    WHERE HoursPerWeek >= 38
''')
at_least_38_hours.createOrReplaceGlobalTempView(name="active_employees")
