In [1]:
import getpass
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
# Build (or reuse) a Hive-enabled Spark session on YARN. The OS user name is
# used both in the application name and in the warehouse directory path.
user = getpass.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user}-Week-5-Assignment-1-json')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .config('spark.sql.catalogImplementation', 'hive')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Display the session object as the cell output to confirm it was created.
spark

In [4]:
# List the source dataset directory on HDFS.
!hadoop fs -ls /public/trendytech/orders_wh.json/

Found 2 items
-rw-r--r--   3 itv005857 supergroup          0 2023-05-04 07:42 /public/trendytech/orders_wh.json/_SUCCESS
-rw-r--r--   3 itv005857 supergroup    7064041 2023-05-04 07:42 /public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json


In [5]:
# Peek at the beginning of the source part file to see the JSON record layout
# (one JSON object per line, per the output below).
!hadoop fs -head /public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json

{"order_id":1,"order_date":"2013-07-25 00:00:00.0","customer_id":11599,"order_status":"CLOSED"}
{"order_id":2,"order_date":"2013-07-25 00:00:00.0","customer_id":256,"order_status":"PENDING_PAYMENT"}
{"order_id":3,"order_date":"2013-07-25 00:00:00.0","customer_id":12111,"order_status":"COMPLETE"}
{"order_id":4,"order_date":"2013-07-25 00:00:00.0","customer_id":8827,"order_status":"CLOSED"}
{"order_id":5,"order_date":"2013-07-25 00:00:00.0","customer_id":11318,"order_status":"COMPLETE"}
{"order_id":6,"order_date":"2013-07-25 00:00:00.0","customer_id":7130,"order_status":"COMPLETE"}
{"order_id":7,"order_date":"2013-07-25 00:00:00.0","customer_id":4530,"order_status":"COMPLETE"}
{"order_id":8,"order_date":"2013-07-25 00:00:00.0","customer_id":2911,"order_status":"PROCESSING"}
{"order_id":9,"order_date":"2013-07-25 00:00:00.0","customer_id":5657,"order_status":"PENDING_PAYMENT"}
{"order_id":10,"order_date":"2013-07-25 00:00:00.0","customer_id":5648,"order_status":"PENDING_PAYMENT"}
{"order_

In [6]:
# Declare the column types up front rather than relying on the JSON reader to
# infer them; every field is left nullable (the default).
schema = (
    T.StructType()
    .add('order_id', T.IntegerType())
    .add('order_date', T.TimestampType())
    .add('customer_id', T.IntegerType())
    .add('order_status', T.StringType())
)

# Load the orders part file as a DataFrame using the explicit schema.
df = (
    spark.read
    .format('json')
    .schema(schema)
    .option('path', '/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json')
    .load()
)

In [7]:
# Confirm the DataFrame carries the declared column types.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [8]:
# Preview the first 5 rows without truncating long column values.
df.show(5, truncate=False)

+--------+-------------------+-----------+---------------+
|order_id|order_date         |customer_id|order_status   |
+--------+-------------------+-----------+---------------+
|1       |2013-07-25 00:00:00|11599      |CLOSED         |
|2       |2013-07-25 00:00:00|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00|8827       |CLOSED         |
|5       |2013-07-25 00:00:00|11318      |COMPLETE       |
+--------+-------------------+-----------+---------------+
only showing top 5 rows



In [9]:
# List databases whose name contains this user id (run before the database is created).
spark.sql('show databases').filter("namespace like '%itv005077%'").show()

+---------+
|namespace|
+---------+
+---------+



In [10]:
# List the warehouse directory (relative HDFS path) before any database is created.
!hadoop fs -ls warehouse

In [11]:
# Create the database; 'if not exists' keeps the cell safe to re-run.
spark.sql('create database if not exists itv005077_database')

In [12]:
# Re-run the database listing to check that the new database now appears.
spark.sql('show databases').filter("namespace like '%itv005077%'").show()

+------------------+
|         namespace|
+------------------+
|itv005077_database|
+------------------+



In [13]:
# Switch the session's current database to the one just created.
spark.sql('use itv005077_database')

In [14]:
# List tables in the current database (expected to be empty at this point).
# Call .show() explicitly so the rows are printed regardless of whether the
# environment has eager DataFrame evaluation enabled, matching the later
# 'show tables' cell that already uses .show().
spark.sql('show tables').show()

database,tableName,isTemporary


In [15]:
# Register a JSON-backed table over the existing source file. No column list is
# given, so the column definitions come from the data source.
spark.sql(
    'create table if not exists itv005077_database.orders_external '
    'using json '
    'location "/public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json"'
)

In [16]:
# List tables in the current database; the external table should now appear.
# Call .show() explicitly so the rows are printed regardless of whether the
# environment has eager DataFrame evaluation enabled, matching the later
# 'show tables' cell that already uses .show().
spark.sql('show tables').show()

database,tableName,isTemporary
itv005077_database,orders_external,False


In [17]:
# Inspect the external table's columns and extended metadata. The output shows
# the column types picked up from the JSON data (bigint / string).
spark.sql('describe extended itv005077_database.orders_external').show(truncate=False)

+----------------------------+------------------------------------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                                               |comment|
+----------------------------+------------------------------------------------------------------------------------------------------------------------+-------+
|customer_id                 |bigint                                                                                                                  |null   |
|order_date                  |string                                                                                                                  |null   |
|order_id                    |bigint                                                                                                                  |null   |
|order_status                |string    

In [18]:
# Preview five rows of the external table without truncating values.
spark.sql('select * from itv005077_database.orders_external').show(5, truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
+-----------+---------------------+--------+---------------+
only showing top 5 rows



In [19]:
# Create the managed table by copying every row of the external table (CTAS).
# No location is given, so the data lands under the warehouse directory.
spark.sql(
    'create table if not exists itv005077_database.orders_managed '
    'as select * from itv005077_database.orders_external'
)

In [20]:
# List tables in the current database; both tables should now be present.
spark.sql('show tables').show()

+------------------+---------------+-----------+
|          database|      tableName|isTemporary|
+------------------+---------------+-----------+
|itv005077_database|orders_external|      false|
|itv005077_database| orders_managed|      false|
+------------------+---------------+-----------+



In [21]:
# Inspect the managed table's columns and detailed table metadata.
spark.sql('describe formatted itv005077_database.orders_managed').show(truncate=False)

+----------------------------+-------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                  |comment|
+----------------------------+-------------------------------------------------------------------------------------------+-------+
|customer_id                 |bigint                                                                                     |null   |
|order_date                  |string                                                                                     |null   |
|order_id                    |bigint                                                                                     |null   |
|order_status                |string                                                                                     |null   |
|                            |                                                     

In [22]:
# Preview five rows of the managed table without truncating values.
spark.sql('select * from itv005077_database.orders_managed').show(5, truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
+-----------+---------------------+--------+---------------+
only showing top 5 rows



In [23]:
# List the warehouse directory again; the database directory should now exist.
!hadoop fs -ls warehouse

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-30 15:49 warehouse/itv005077_database.db


In [24]:
# List the database directory to see which tables have data stored in the warehouse.
!hadoop fs -ls warehouse/itv005077_database.db

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-30 15:49 warehouse/itv005077_database.db/orders_managed


In [25]:
# List the data files written for the managed table.
!hadoop fs -ls warehouse/itv005077_database.db/orders_managed

Found 2 items
-rwxr-xr-x   3 itv005077 supergroup    2389173 2023-05-30 15:49 warehouse/itv005077_database.db/orders_managed/part-00000-fbc6c8f2-5c8f-4871-90cf-e9b66f07f0a2-c000
-rwxr-xr-x   3 itv005077 supergroup     610771 2023-05-30 15:49 warehouse/itv005077_database.db/orders_managed/part-00001-fbc6c8f2-5c8f-4871-90cf-e9b66f07f0a2-c000


In [26]:
# Number of partitions of the DataFrame loaded from the source JSON file
# (presumably checked for comparison with the number of part files listed above).
df.rdd.getNumPartitions()

2

In [27]:
!hadoop fs -head warehouse/itv005077_database.db/orders_managed/part-00000-d5c4f3a4-c866-401d-8a81-eb1918f6133f-c000

head: `warehouse/itv005077_database.db/orders_managed/part-00000-d5c4f3a4-c866-401d-8a81-eb1918f6133f-c000': No such file or directory


In [28]:
# Print the usage text for `hdfs fsck` to see the available reporting flags.
!hdfs fsck -help

Usage: hdfs fsck <path> [-list-corruptfileblocks | [-move | -delete | -openforwrite] [-files [-blocks [-locations | -racks | -replicaDetails | -upgradedomains]]]] [-includeSnapshots] [-showprogress] [-storagepolicies] [-maintenance] [-blockId <blk_Id>] [-replicate]
	<path>	start checking from this path
	-move	move corrupted files to /lost+found
	-delete	delete corrupted files
	-files	print out files being checked
	-openforwrite	print out files opened for write
	-includeSnapshots	include snapshot data if the given path indicates a snapshottable directory or there are snapshottable directories under it
	-list-corruptfileblocks	print out list of missing blocks and files they belong to
	-files -blocks	print out block report
	-files -blocks -locations	print out locations for every block
	-files -blocks -racks	print out network topology for data-node locations
	-files -blocks -replicaDetails	print out each replica details 
	-files -blocks -upgradedomains	print out upgrade domains for every b

In [29]:
!hdfs fsck warehouse/itv005077_database.db/orders_managed/part-00001-d5c4f3a4-c866-401d-8a81-eb1918f6133f-c000 -files -blocks -locations

FileSystem is inaccessible due to:
java.io.FileNotFoundException: File does not exist: hdfs://m01.itversity.com:9000/user/itv005077/warehouse/itv005077_database.db/orders_managed/part-00001-d5c4f3a4-c866-401d-8a81-eb1918f6133f-c000
DFSck exiting.


In [30]:
# Drop the external table. 'if exists' mirrors the 'if not exists' used when the
# table was created, so re-running this cell does not raise once the table is gone.
spark.sql('drop table if exists itv005077_database.orders_external')

In [31]:
# Check that the source files are still present after dropping the external table.
!hadoop fs -ls /public/trendytech/orders_wh.json/

Found 2 items
-rw-r--r--   3 itv005857 supergroup          0 2023-05-04 07:42 /public/trendytech/orders_wh.json/_SUCCESS
-rw-r--r--   3 itv005857 supergroup    7064041 2023-05-04 07:42 /public/trendytech/orders_wh.json/part-00000-68544d18-9a34-443f-bf0e-1dd8103ff94e-c000.json


In [32]:
# Drop the managed table. 'if exists' mirrors the 'if not exists' used when the
# table was created, so re-running this cell does not raise once the table is gone.
spark.sql('drop table if exists itv005077_database.orders_managed')

In [33]:
# List the warehouse directory after dropping the managed table.
!hadoop fs -ls warehouse

Found 1 items
drwxr-xr-x   - itv005077 supergroup          0 2023-05-30 15:49 warehouse/itv005077_database.db


In [34]:
# List the database directory to check whether the managed table's data was removed.
!hadoop fs -ls warehouse/itv005077_database.db

In [35]:
# List tables in the current database after both drops (expected to be empty).
# Call .show() explicitly so the rows are printed regardless of whether the
# environment has eager DataFrame evaluation enabled, matching the earlier
# 'show tables' cell that already uses .show().
spark.sql('show tables').show()

database,tableName,isTemporary


In [36]:
# Drop the now-empty database. 'if exists' mirrors the 'if not exists' used when
# it was created, so re-running this cell does not raise once the database is gone.
spark.sql('drop database if exists itv005077_database')

In [37]:
# List the warehouse directory after dropping the database.
!hadoop fs -ls warehouse

In [38]:
# Stop the Spark session now that the walkthrough is finished.
spark.stop()