In [0]:
%sql
-- Create the example_db database if it is missing, then make it the current database.
CREATE DATABASE IF NOT EXISTS example_db;
USE example_db;


In [0]:
%sql
-- Define the Parquet-backed employees table; a no-op when the table already exists.
CREATE TABLE IF NOT EXISTS employees (
  id         INT,
  name       STRING,
  age        INT,
  department STRING,
  salary     DOUBLE
) USING PARQUET;


In [0]:
%sql
-- Seed the employees table with the four demo rows.
-- INSERT OVERWRITE is used instead of INSERT INTO: the table is created with
-- CREATE TABLE IF NOT EXISTS and therefore survives between runs, so a plain
-- INSERT INTO would append four duplicate rows every time this cell is re-run.
-- Overwriting keeps the cell idempotent.
INSERT OVERWRITE TABLE employees VALUES
  (1, 'Arjun',   29, 'Engineering', 75000),
  (2, 'Meera',   32, 'HR',          54000),
  (3, 'Lakshmi', 28, 'Finance',     60000),
  (4, 'Ramesh',  35, 'Engineering', 85000);


In [0]:
# Build the employee rows as a DataFrame and expose them to SQL as employees_temp.

# Column names, in the same order as the fields of each row tuple below.
columns = ['id', 'name', 'age', 'department', 'salary']

# One tuple per employee.
data = [
    (1, 'Arjun', 29, 'Engineering', 75000),
    (2, 'Meera', 32, 'HR', 54000),
    (3, 'Lakshmi', 28, 'Finance', 60000),
    (4, 'Ramesh', 35, 'Engineering', 85000),
]

df = spark.createDataFrame(data, schema=columns)

# Register the DataFrame under a view name so SQL queries can reference it.
df.createOrReplaceTempView("employees_temp")


In [0]:
%sql
-- Employees older than 30, read from the employees_temp view.
SELECT *
FROM employees_temp
WHERE age > 30;


id,name,age,department,salary
2,Meera,32,HR,54000
4,Ramesh,35,Engineering,85000


In [0]:
# Lookup rows pairing each department name with its team label.
departments_data = [
    ('Engineering', 'Tech Team'),
    ('HR', 'Human Resources'),
    ('Finance', 'Accounting'),
]

# Build the lookup DataFrame and register it as departments_temp for SQL access.
departments_df = spark.createDataFrame(
    departments_data,
    schema=['department', 'team'],
)
departments_df.createOrReplaceTempView("departments_temp")

# Join employees to their department's team on the shared department column,
# then print the joined rows.
result = spark.sql("""
SELECT e.id, e.name, e.department, d.team
FROM employees_temp e
JOIN departments_temp d
ON e.department = d.department
""")
result.show()


+---+-------+-----------+---------------+
| id|   name| department|           team|
+---+-------+-----------+---------------+
|  1|  Arjun|Engineering|      Tech Team|
|  4| Ramesh|Engineering|      Tech Team|
|  3|Lakshmi|    Finance|     Accounting|
|  2|  Meera|         HR|Human Resources|
+---+-------+-----------+---------------+



In [0]:
%sql
-- Average salary and headcount for each department.
SELECT
    department,
    AVG(salary) AS avg_salary,
    COUNT(*)    AS total_employees
FROM employees_temp
GROUP BY department;


department,avg_salary,total_employees
Engineering,80000.0,2
HR,54000.0,1
Finance,60000.0,1


In [0]:
%sql
-- Rank employees by salary, highest first, separately within each department.
SELECT id, name, salary,
       RANK() OVER (
           PARTITION BY department
           ORDER BY salary DESC
       ) AS rank
FROM employees_temp;


id,name,salary,rank
4,Ramesh,85000,1
1,Arjun,75000,2
3,Lakshmi,60000,1
2,Meera,54000,1


In [0]:
# Register the DataFrame `df` as a temp view named temp_view_name
# (replacing any existing view of that name).
df.createOrReplaceTempView("temp_view_name")


In [0]:
# Register the DataFrame `df` as a global temp view named global_temp_view_name.
# NOTE(review): Spark keeps global temp views in the `global_temp` database, so SQL
# presumably has to reference this one as global_temp.global_temp_view_name — confirm.
df.createOrReplaceGlobalTempView("global_temp_view_name")


In [0]:
%sql
-- Employees older than 30, read through the temp_view_name view.
SELECT *
FROM temp_view_name
WHERE age > 30;


id,name,age,department,salary
2,Meera,32,HR,54000
4,Ramesh,35,Engineering,85000
