In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
user = gp.getuser()
spark = SparkSession \
    .builder \
    .appName(f'{user}-Week-5-Assignment-1') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.catalogImplementation', 'hive') \
    .enableHiveSupport() \
    .master('yarn') \
    .getOrCreate()

In [3]:
spark

In [4]:
!hadoop fs -head /public/trendytech/groceries.csv

order_id,location,item,order_date,quantity
o1,Seattle,Bananas,01/01/2017,7
o2,Kent,Apples,02/01/2017,20
o3,Bellevue,Flowers,02/01/2017,10
o4,Redmond,Meat,03/01/2017,40
o5,Seattle,Potatoes,04/01/2017,9
o6,Bellevue,Bread,04/01/2017,5
o7,Redmond,Bread,05/01/2017,5
o8,Issaquah,Onion,05/01/2017,4
o9,Redmond,Cheese,05/01/2017,15
o10,Issaquah,Onion,06/01/2017,4
o11,Renton,Bread,05/01/2017,5
o12,Issaquah,Onion,07/01/2017,4
o13,Sammamish,Bread,07/01/2017,5
o14,Issaquah,Tomato,07/01/2017,6
o15,Issaquah,Meat,08/01/2017,3
o16,Issaquah,Meat,09/01/2017,5
o17,Issaquah,Meat,10/01/2017,6
o18,Bellevue,Bread,11/01/2017,7
o19,Bellevue,Bread,12/01/2017,54
o20,Bellevue,Bread,13/01/2017,34
o21,Bellevue,Bread,14/01/2017,25


In [5]:
from pyspark.sql import types as T

schema = T.StructType([
    T.StructField('order_id', T.StringType()),
    T.StructField('locaton', T.StringType()),
    T.StructField('item', T.StringType()),
    T.StructField('order_date', T.StringType()),
    T.StructField('quantity', T.IntegerType()),
])


df_groceries = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .schema(schema) \
    .option('path', '/public/trendytech/groceries.csv') \
    .load()

In [6]:
df_groceries.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- locaton: string (nullable = true)
 |-- item: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- quantity: integer (nullable = true)



In [7]:
df_groceries.show(5, truncate=False)

+--------+--------+--------+----------+--------+
|order_id|locaton |item    |order_date|quantity|
+--------+--------+--------+----------+--------+
|o1      |Seattle |Bananas |01/01/2017|7       |
|o2      |Kent    |Apples  |02/01/2017|20      |
|o3      |Bellevue|Flowers |02/01/2017|10      |
|o4      |Redmond |Meat    |03/01/2017|40      |
|o5      |Seattle |Potatoes|04/01/2017|9       |
+--------+--------+--------+----------+--------+
only showing top 5 rows



In [8]:
from pyspark.sql import functions as F
df_groceries  = df_groceries.withColumn('order_date', F.to_date('order_date', 'mm/dd/yyyy'))

In [9]:
df_groceries.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- locaton: string (nullable = true)
 |-- item: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- quantity: integer (nullable = true)



In [10]:
df_groceries.show(5, truncate=False)

+--------+--------+--------+----------+--------+
|order_id|locaton |item    |order_date|quantity|
+--------+--------+--------+----------+--------+
|o1      |Seattle |Bananas |2017-01-01|7       |
|o2      |Kent    |Apples  |2017-01-01|20      |
|o3      |Bellevue|Flowers |2017-01-01|10      |
|o4      |Redmond |Meat    |2017-01-01|40      |
|o5      |Seattle |Potatoes|2017-01-01|9       |
+--------+--------+--------+----------+--------+
only showing top 5 rows



In [11]:
df_groceries.collect()

[Row(order_id='o1', locaton='Seattle', item='Bananas', order_date=datetime.date(2017, 1, 1), quantity=7),
 Row(order_id='o2', locaton='Kent', item='Apples', order_date=datetime.date(2017, 1, 1), quantity=20),
 Row(order_id='o3', locaton='Bellevue', item='Flowers', order_date=datetime.date(2017, 1, 1), quantity=10),
 Row(order_id='o4', locaton='Redmond', item='Meat', order_date=datetime.date(2017, 1, 1), quantity=40),
 Row(order_id='o5', locaton='Seattle', item='Potatoes', order_date=datetime.date(2017, 1, 1), quantity=9),
 Row(order_id='o6', locaton='Bellevue', item='Bread', order_date=datetime.date(2017, 1, 1), quantity=5),
 Row(order_id='o7', locaton='Redmond', item='Bread', order_date=datetime.date(2017, 1, 1), quantity=5),
 Row(order_id='o8', locaton='Issaquah', item='Onion', order_date=datetime.date(2017, 1, 1), quantity=4),
 Row(order_id='o9', locaton='Redmond', item='Cheese', order_date=datetime.date(2017, 1, 1), quantity=15),
 Row(order_id='o10', locaton='Issaquah', item='Onion

In [12]:
spark.sql('show databases').filter("namespace like '%itv005077%'").show()

+---------+
|namespace|
+---------+
+---------+



In [13]:
spark.sql('create database if not exists itv005077_database')

In [14]:
spark.sql('show databases').filter("namespace like '%itv005077%'").show()

+------------------+
|         namespace|
+------------------+
|itv005077_database|
+------------------+



In [15]:
spark.sql('use itv005077_database')

In [16]:
spark.sql('show tables')

database,tableName,isTemporary


In [17]:
df_groceries.createOrReplaceTempView('groceries')

In [18]:
spark.sql('describe extended groceries').show()

+----------+---------+-------+
|  col_name|data_type|comment|
+----------+---------+-------+
|  order_id|   string|   null|
|   locaton|   string|   null|
|      item|   string|   null|
|order_date|     date|   null|
|  quantity|      int|   null|
+----------+---------+-------+



In [19]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |groceries|       true|
+--------+---------+-----------+



In [20]:
spark.sql('select * from groceries').show(5, truncate=False)

+--------+--------+--------+----------+--------+
|order_id|locaton |item    |order_date|quantity|
+--------+--------+--------+----------+--------+
|o1      |Seattle |Bananas |2017-01-01|7       |
|o2      |Kent    |Apples  |2017-01-01|20      |
|o3      |Bellevue|Flowers |2017-01-01|10      |
|o4      |Redmond |Meat    |2017-01-01|40      |
|o5      |Seattle |Potatoes|2017-01-01|9       |
+--------+--------+--------+----------+--------+
only showing top 5 rows



In [21]:
spark.sql('create table if not exists itv005077_database.groceries_managed as select * from groceries')

In [22]:
spark.sql('show tables').show()

+------------------+-----------------+-----------+
|          database|        tableName|isTemporary|
+------------------+-----------------+-----------+
|itv005077_database|groceries_managed|      false|
|                  |        groceries|       true|
+------------------+-----------------+-----------+



In [23]:
spark.sql('describe extended itv005077_database.groceries_managed').show(truncate=False)

+----------------------------+----------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                     |comment|
+----------------------------+----------------------------------------------------------------------------------------------+-------+
|order_id                    |string                                                                                        |null   |
|locaton                     |string                                                                                        |null   |
|item                        |string                                                                                        |null   |
|order_date                  |date                                                                                          |null   |
|quantity                    |int                             

In [24]:
!hadoop fs -ls warehouse

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-30 14:06 warehouse/itv005077_database.db


In [25]:
!hadoop fs -ls warehouse/itv005077_database.db

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-30 14:06 warehouse/itv005077_database.db/groceries_managed


In [26]:
!hadoop fs -ls warehouse/itv005077_database.db/groceries_managed

Found 1 items
-rwxr-xr-x   3 itv005077 supergroup        666 2023-05-30 14:06 warehouse/itv005077_database.db/groceries_managed/part-00000-9476cd06-e687-4066-ab08-a718fa3a27a1-c000


In [27]:
!hadoop fs -head warehouse/itv005077_database.db/groceries_managed/part-00000-ba781f1c-a55b-4dcf-a272-abc77d939bcf-c000

head: `warehouse/itv005077_database.db/groceries_managed/part-00000-ba781f1c-a55b-4dcf-a272-abc77d939bcf-c000': No such file or directory


In [28]:
spark.sql('select * from itv005077_database.groceries_managed').show(5, truncate=False)

+--------+--------+--------+----------+--------+
|order_id|locaton |item    |order_date|quantity|
+--------+--------+--------+----------+--------+
|o1      |Seattle |Bananas |2017-01-01|7       |
|o2      |Kent    |Apples  |2017-01-01|20      |
|o3      |Bellevue|Flowers |2017-01-01|10      |
|o4      |Redmond |Meat    |2017-01-01|40      |
|o5      |Seattle |Potatoes|2017-01-01|9       |
+--------+--------+--------+----------+--------+
only showing top 5 rows



In [29]:
spark.sql('drop table itv005077_database.groceries_managed')

In [30]:
!hadoop fs -ls warehouse/itv005077_database.db

In [31]:
spark.sql('drop database itv005077_database')

In [32]:
!hadoop fs -ls warehouse

In [33]:
spark.stop()