In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F

In [2]:
# Resolve the current OS user; it namespaces the app name and the warehouse path.
user = gp.getuser()

# Configure a Hive-enabled session on YARN, then create it (or reuse a live one).
session_builder = (
    SparkSession.builder
    .appName(f'{user}-Spark-SQL-Example')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .master('yarn')
)
spark = session_builder.getOrCreate()

In [3]:
# Display the SparkSession object as the cell output.
spark

In [4]:
# List the databases visible through the catalog; show() prints only the first 20 rows by default.
spark.sql("show databases").show()

+-------------------+
|          namespace|
+-------------------+
|  0001_av_ivy_tesco|
|       003402_hive1|
|   005198_ivy_tesco|
|   005212_ivy_tesco|
|005222_ivy_practice|
|005260_ivy_database|
|        00ivy_tesco|
|         00ivy_test|
|      07172021_nyse|
|    07172021_retail|
|       07172021_sms|
|        1230_trendy|
|    1230_trendytech|
|      1540retail_db|
|        1993_ankita|
|               1src|
|              26may|
|               2stg|
|               3etl|
|           44_tesco|
+-------------------+
only showing top 20 rows



In [5]:
# Display the username returned by getpass.getuser(); it is embedded in the app name and warehouse path.
user

'itv005077'

## Create Database `itv005077_retail`

In [6]:
# IF NOT EXISTS keeps this cell safe to re-run when the database is already present.
spark.sql("CREATE DATABASE IF NOT EXISTS itv005077_retail")

In [7]:
# Narrow the database listing to the one just created to confirm it exists.
matching_dbs = spark.sql("SHOW DATABASES").filter("namespace = 'itv005077_retail'")
matching_dbs.show()

+----------------+
|       namespace|
+----------------+
|itv005077_retail|
+----------------+



## Use Database `itv005077_retail`

In [8]:
# Make itv005077_retail the session's current database.
spark.sql("USE itv005077_retail")

In [9]:
# List tables in the current database; a freshly created database is expected to have none.
spark.sql("SHOW TABLES")

database,tableName,isTemporary


### Create `orders` Table

In [10]:
# Create the orders table once; the statement is split across two literals for readability only.
spark.sql(
    "CREATE TABLE IF NOT EXISTS itv005077_retail.orders "
    "(order_id integer, order_date string, order_status string)"
)

In [11]:
# Show the column names and data types of the orders table.
spark.sql("DESCRIBE TABLE itv005077_retail.orders")

col_name,data_type,comment
order_id,int,
order_date,string,
order_status,string,


In [12]:
# Print the columns plus the detailed table information without truncating long values.
table_details = spark.sql("DESCRIBE FORMATTED itv005077_retail.orders")
table_details.show(truncate=False)

+----------------------------+---------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                        |comment|
+----------------------------+---------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                              |null   |
|order_date                  |string                                                                           |null   |
|order_status                |string                                                                           |null   |
|                            |                                                                                 |       |
|# Detailed Table Information|                                                                                 |       |
|Database                    |it

In [13]:
# List the HDFS directory used when no path is given (presumably the user's home directory - confirm).
!hadoop fs -ls 

Found 3 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 .sparkStaging
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 warehouse


In [14]:
# List the warehouse directory; each database is expected to appear as a <name>.db directory.
!hadoop fs -ls warehouse

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 warehouse/itv005077_retail.db


In [15]:
# List the database directory; each table is expected to appear as a sub-directory.
!hadoop fs -ls warehouse/itv005077_retail.db

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 warehouse/itv005077_retail.db/orders


In [16]:
# List the table directory before any rows are inserted; no data files are expected yet.
!hadoop fs -ls warehouse/itv005077_retail.db/orders

### Insert Rows into `itv005077_retail.orders` table

In [17]:
# Insert the first row. INSERT INTO appends, so re-running this cell adds a duplicate row.
spark.sql("INSERT INTO itv005077_retail.orders VALUES(1, '2013-07-25', 'CLOSED')")

In [18]:
# Read the table back to verify the inserted row.
spark.sql('SELECT * FROM itv005077_retail.orders').show()

+--------+----------+------------+
|order_id|order_date|order_status|
+--------+----------+------------+
|       1|2013-07-25|      CLOSED|
+--------+----------+------------+



In [19]:
# Insert the second row. The result of an INSERT carries no rows, so it is not bound to a name;
# `df` is reserved for the query result created later. Re-running this cell appends a duplicate row.
spark.sql("INSERT INTO itv005077_retail.orders VALUES(2, '2013-07-26', 'PENDING PAYMENT')")

In [20]:
# Read the table back to verify that both rows are present.
spark.sql('SELECT * FROM itv005077_retail.orders').show()

+--------+----------+---------------+
|order_id|order_date|   order_status|
+--------+----------+---------------+
|       1|2013-07-25|         CLOSED|
|       2|2013-07-26|PENDING PAYMENT|
+--------+----------+---------------+



In [21]:
# List the table directory again to see the data files written by the INSERT statements.
!hadoop fs -ls warehouse/itv005077_retail.db/orders

Found 2 items
-rwxr-xr-x   3 itv005077 supergroup         20 2023-05-09 16:43 warehouse/itv005077_retail.db/orders/part-00000-4e1d6bca-f5de-42e1-a6d1-8d63c28e9b2f-c000
-rwxr-xr-x   3 itv005077 supergroup         29 2023-05-09 16:43 warehouse/itv005077_retail.db/orders/part-00000-cb2f6185-cd09-4d30-90d3-bb9c3974680c-c000


In [22]:
!hadoop fs -head warehouse/itv005077_retail.db/orders/part-00000-56532ee4-c78a-431a-a953-702e8684052c-c000

head: `warehouse/itv005077_retail.db/orders/part-00000-56532ee4-c78a-431a-a953-702e8684052c-c000': No such file or directory


In [23]:
# Bind the filtered SELECT to a DataFrame so the next cell can display it.
df = spark.sql('SELECT * FROM itv005077_retail.orders WHERE order_id = 2')

In [24]:
# Print the rows of the filtered DataFrame.
df.show()

+--------+----------+---------------+
|order_id|order_date|   order_status|
+--------+----------+---------------+
|       2|2013-07-26|PENDING PAYMENT|
+--------+----------+---------------+



### Drop table `itv005077_retail.orders`

In [25]:
# IF EXISTS makes the cleanup safe to re-run: without it, a second execution fails
# because the table has already been dropped.
spark.sql("DROP TABLE IF EXISTS itv005077_retail.orders")

In [26]:
# Check the database directory after the drop; an empty listing means the table's directory is gone.
!hadoop fs -ls warehouse/itv005077_retail.db

### Drop Database `itv005077_retail`

In [27]:
# IF EXISTS makes the cleanup safe to re-run: without it, a second execution fails
# because the database has already been dropped.
spark.sql("DROP DATABASE IF EXISTS itv005077_retail")

In [28]:
# Look for the database again; no rows are expected once it has been dropped.
remaining_dbs = spark.sql("SHOW DATABASES").filter('namespace == "itv005077_retail"')
remaining_dbs.show()

+---------+
|namespace|
+---------+
+---------+



In [29]:
# Check the warehouse directory after dropping the database; an empty listing means its .db directory is gone.
!hadoop fs -ls warehouse

In [30]:
# List the default HDFS directory again to see what remains after the cleanup.
!hadoop fs -ls

Found 3 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 .sparkStaging
drwxr-xr-x   - itv005077 supergroup          0 2023-05-09 16:43 warehouse


In [31]:
# Stop the SparkSession now that the walkthrough is complete.
spark.stop()