In [1]:

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import when, avg, rank
from pyspark.sql.utils import AnalysisException
from pyspark.sql.window import Window

# Start (or reuse) the Spark session that the rest of the notebook queries through.
spark = (
    SparkSession.builder
    .appName("pyspark_sql_views")
    .getOrCreate()
)

# Row factory: the column names are fixed once, each record then lists only its values.
Employee = Row("empid", "name", "department", "project", "salary", "hoursperweek")

data = [
    Employee(101, "ravi", "engineering", "ai engine", 95000, 42),
    Employee(102, "sneha", "engineering", "data platform", 87000, 45),
    Employee(103, "kabir", "marketing", "product launch", 65000, 40),
    Employee(104, "anita", "sales", "client outreach", 70000, 38),
    Employee(105, "divya", "engineering", "ai engine", 99000, 48),
    Employee(106, "amit", "marketing", "social media", 62000, 35),
    Employee(107, "priya", "hr", "policy revamp", 58000, 37),
    Employee(108, "manav", "sales", "lead gen", 73000, 41),
    Employee(109, "neha", "engineering", "security suite", 91000, 46),
    Employee(110, "farah", "hr", "onboarding", 60000, 36),
]

# No explicit schema is passed; the DataFrame is built straight from the Row objects.
df = spark.createDataFrame(data)
df.show(truncate=False)


+-----+-----+-----------+---------------+------+------------+
|empid|name |department |project        |salary|hoursperweek|
+-----+-----+-----------+---------------+------+------------+
|101  |ravi |engineering|ai engine      |95000 |42          |
|102  |sneha|engineering|data platform  |87000 |45          |
|103  |kabir|marketing  |product launch |65000 |40          |
|104  |anita|sales      |client outreach|70000 |38          |
|105  |divya|engineering|ai engine      |99000 |48          |
|106  |amit |marketing  |social media   |62000 |35          |
|107  |priya|hr         |policy revamp  |58000 |37          |
|108  |manav|sales      |lead gen       |73000 |41          |
|109  |neha |engineering|security suite |91000 |46          |
|110  |farah|hr         |onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



In [5]:
# Register df under two names: a temp view (queried in later cells as
# `employees_local`) and a global temp view (queried in later cells as
# `global_temp.employees_global`).
df.createOrReplaceTempView("employees_local")
df.createOrReplaceGlobalTempView("employees_global")

In [7]:
# Everyone assigned to the 'ai engine' project, read through the local temp view.
ai_engine_sql = """select * from employees_local where project='ai engine'"""
spark.sql(ai_engine_sql).show()

+-----+-----+-----------+---------+------+------------+
|empid| name| department|  project|salary|hoursperweek|
+-----+-----+-----------+---------+------+------------+
|  101| ravi|engineering|ai engine| 95000|          42|
|  105|divya|engineering|ai engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+



In [9]:
# Marketing employees whose salary is strictly above 60000.
marketing_sql = """select * from employees_local where department='marketing' and salary>60000"""
spark.sql(marketing_sql).show()

+-----+-----+----------+--------------+------+------------+
|empid| name|department|       project|salary|hoursperweek|
+-----+-----+----------+--------------+------+------------+
|  103|kabir| marketing|product launch| 65000|          40|
|  106| amit| marketing|  social media| 62000|          35|
+-----+-----+----------+--------------+------+------------+



In [10]:
# Average salary per department.
avg_salary_sql = """select department, avg(salary) as avgsalary from employees_local group by department"""
spark.sql(avg_salary_sql).show()

+-----------+---------+
| department|avgsalary|
+-----------+---------+
|  marketing|  63500.0|
|      sales|  71500.0|
|engineering|  93000.0|
|         hr|  59000.0|
+-----------+---------+



In [12]:
# The three highest-paid employees, highest salary first.
top_salaries_sql = """select * from employees_local order by salary desc limit 3"""
spark.sql(top_salaries_sql).show()

+-----+-----+-----------+--------------+------+------------+
|empid| name| department|       project|salary|hoursperweek|
+-----+-----+-----------+--------------+------+------------+
|  105|divya|engineering|     ai engine| 99000|          48|
|  101| ravi|engineering|     ai engine| 95000|          42|
|  109| neha|engineering|security suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [16]:
# Employees working more than 40 hours per week.
long_hours_sql = """select * from employees_local where hoursperweek>40"""
spark.sql(long_hours_sql).show()

+-----+-----+-----------+--------------+------+------------+
|empid| name| department|       project|salary|hoursperweek|
+-----+-----+-----------+--------------+------+------------+
|  101| ravi|engineering|     ai engine| 95000|          42|
|  102|sneha|engineering| data platform| 87000|          45|
|  105|divya|engineering|     ai engine| 99000|          48|
|  108|manav|      sales|      lead gen| 73000|          41|
|  109| neha|engineering|security suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [20]:
# Head count per project, from the local temp view.
project_headcount_sql = """select project, count(*) as employee_count from employees_local group by project"""
spark.sql(project_headcount_sql).show()

+---------------+--------------+
|        project|employee_count|
+---------------+--------------+
| product launch|             1|
|      ai engine|             2|
|  data platform|             1|
|client outreach|             1|
|   social media|             1|
|     onboarding|             1|
|  policy revamp|             1|
| security suite|             1|
|       lead gen|             1|
+---------------+--------------+



In [21]:
# Drop the local temp view. The call is left as the cell's last expression so
# its return value is displayed as the cell output.
spark.catalog.dropTempView("employees_local")

True

In [None]:
# `employees_local` was dropped by the dropTempView call earlier in the notebook,
# so resolving it is expected to fail. Catch the analysis error and print it:
# the failure is the point of this cell, but an uncaught exception would stop
# a top-to-bottom run of the notebook.
try:
    spark.sql("""select * from employees_local""")
except AnalysisException as err:
    print(f"employees_local is no longer available: {err}")

In [26]:
# HR employees working fewer than 38 hours per week, read from the global temp view.
hr_short_hours_sql = """select * from global_temp.employees_global where department = 'hr' and hoursperweek < 38"""
spark.sql(hr_short_hours_sql).show()

+-----+-----+----------+-------------+------+------------+
|empid| name|department|      project|salary|hoursperweek|
+-----+-----+----------+-------------+------+------------+
|  107|priya|        hr|policy revamp| 58000|          37|
|  110|farah|        hr|   onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+



In [29]:
# Total salary paid out per department.
department_payout_sql = """select department, sum(salary) as total_payout from global_temp.employees_global group by department"""
spark.sql(department_payout_sql).show()

+-----------+------------+
| department|total_payout|
+-----------+------------+
|  marketing|      127000|
|      sales|      143000|
|engineering|      372000|
|         hr|      118000|
+-----------+------------+



In [34]:
# Label each employee 'overworked' when hoursperweek is above 45, otherwise 'normal'.
workload_status_sql = """
select empid, name, department, hoursperweek,
case when hoursperweek>45 then 'overworked' else 'normal' end as status
from global_temp.employees_global
"""
spark.sql(workload_status_sql).show()

+-----+-----+-----------+------------+----------+
|empid| name| department|hoursperweek|    status|
+-----+-----+-----------+------------+----------+
|  101| ravi|engineering|          42|    normal|
|  102|sneha|engineering|          45|    normal|
|  103|kabir|  marketing|          40|    normal|
|  104|anita|      sales|          38|    normal|
|  105|divya|engineering|          48|overworked|
|  106| amit|  marketing|          35|    normal|
|  107|priya|         hr|          37|    normal|
|  108|manav|      sales|          41|    normal|
|  109| neha|engineering|          46|overworked|
|  110|farah|         hr|          36|    normal|
+-----+-----+-----------+------------+----------+



In [36]:
# Head count per project, this time through the global temp view.
project_totals_sql = """select project, count(*) as total_employees from global_temp.employees_global group by project"""
spark.sql(project_totals_sql).show()

+---------------+---------------+
|        project|total_employees|
+---------------+---------------+
| product launch|              1|
|      ai engine|              2|
|  data platform|              1|
|client outreach|              1|
|   social media|              1|
|     onboarding|              1|
|  policy revamp|              1|
| security suite|              1|
|       lead gen|              1|
+---------------+---------------+



In [39]:
# Employees paid more than the average salary of their own department.
# The subquery is correlated on department through the outer alias `e`.
above_department_avg_sql = """
select * from global_temp.employees_global e
where salary > (select avg(salary) from global_temp.employees_global where department=e.department)
"""
spark.sql(above_department_avg_sql).show()

+-----+-----+-----------+--------------+------+------------+
|empid| name| department|       project|salary|hoursperweek|
+-----+-----+-----------+--------------+------+------------+
|  103|kabir|  marketing|product launch| 65000|          40|
|  101| ravi|engineering|     ai engine| 95000|          42|
|  105|divya|engineering|     ai engine| 99000|          48|
|  108|manav|      sales|      lead gen| 73000|          41|
|  110|farah|         hr|    onboarding| 60000|          36|
+-----+-----+-----------+--------------+------+------------+



In [43]:
# SparkSession.builder...getOrCreate() returns the session that is already
# running, so it cannot demonstrate cross-session visibility. newSession()
# creates a separate session on the same SparkContext; the global temp view is
# still reachable from it through the `global_temp` database.
new_spark = spark.newSession()
new_spark.sql("select name from global_temp.employees_global").show()

+-----+
| name|
+-----+
| ravi|
|sneha|
|kabir|
|anita|
|divya|
| amit|
|priya|
|manav|
| neha|
|farah|
+-----+



In [47]:
# Rank employees by salary within each department, highest salary first.
window_spec = (
    Window
    .partitionBy("department")
    .orderBy(df["salary"].desc())
)
salary_rank_col = rank().over(window_spec)
df.withColumn("salary_rank", salary_rank_col).show()

+-----+-----+-----------+---------------+------+------------+-----------+
|empid| name| department|        project|salary|hoursperweek|salary_rank|
+-----+-----+-----------+---------------+------+------------+-----------+
|  105|divya|engineering|      ai engine| 99000|          48|          1|
|  101| ravi|engineering|      ai engine| 95000|          42|          2|
|  109| neha|engineering| security suite| 91000|          46|          3|
|  102|sneha|engineering|  data platform| 87000|          45|          4|
|  110|farah|         hr|     onboarding| 60000|          36|          1|
|  107|priya|         hr|  policy revamp| 58000|          37|          2|
|  103|kabir|  marketing| product launch| 65000|          40|          1|
|  106| amit|  marketing|   social media| 62000|          35|          2|
|  108|manav|      sales|       lead gen| 73000|          41|          1|
|  104|anita|      sales|client outreach| 70000|          38|          2|
+-----+-----+-----------+---------------+------+------------+-----------+

In [60]:
# Expose only the engineering rows as their own temp view, then read it back.
engineering_only = df.where(df["department"] == "engineering")
engineering_only.createOrReplaceTempView("engineering_employees")
spark.sql("select * from engineering_employees").show()

+-----+-----+-----------+--------------+------+------------+
|empid| name| department|       project|salary|hoursperweek|
+-----+-----+-----------+--------------+------+------------+
|  101| ravi|engineering|     ai engine| 95000|          42|
|  102|sneha|engineering| data platform| 87000|          45|
|  105|divya|engineering|     ai engine| 99000|          48|
|  109| neha|engineering|security suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [65]:
# Define a temp view in SQL on top of the global temp view (hoursperweek >= 38),
# then display its contents.
active_employees_ddl = "create or replace temp view active_employees as select * from global_temp.employees_global where hoursperweek>=38"
spark.sql(active_employees_ddl)
spark.sql("select * from active_employees").show()

+-----+-----+-----------+---------------+------+------------+
|empid| name| department|        project|salary|hoursperweek|
+-----+-----+-----------+---------------+------+------------+
|  101| ravi|engineering|      ai engine| 95000|          42|
|  102|sneha|engineering|  data platform| 87000|          45|
|  103|kabir|  marketing| product launch| 65000|          40|
|  104|anita|      sales|client outreach| 70000|          38|
|  105|divya|engineering|      ai engine| 99000|          48|
|  108|manav|      sales|       lead gen| 73000|          41|
|  109| neha|engineering| security suite| 91000|          46|
+-----+-----+-----------+---------------+------+------------+

