In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F

In [2]:
# Name the application after the current OS user (via getpass) and obtain a
# SparkSession, reusing an existing one if present, with 'yarn' as the master.
user = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user}-dataframe-spark-sql-example')
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Leave the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
# Shell escape: list the orders Parquet dataset directory with human-readable sizes (-h).
!hadoop fs -ls -h /public/trendytech/datasets/ordersparquet

Found 3 items
-rw-r--r--   3 itv005857 supergroup          0 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/_SUCCESS
-rw-r--r--   3 itv005857 supergroup    389.0 K 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/part-00000-95703d6e-20f5-43be-94eb-1fade70892f8-c000.snappy.parquet
-rw-r--r--   3 itv005857 supergroup    124.4 K 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/part-00001-95703d6e-20f5-43be-94eb-1fade70892f8-c000.snappy.parquet


In [5]:
# Load the orders dataset (a directory of Parquet part files) into a DataFrame.
df_orders = (
    spark.read
    .format('parquet')
    .load('/public/trendytech/datasets/ordersparquet')
)

In [6]:
# Print the column names, types and nullability of the freshly loaded DataFrame.
df_orders.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [7]:
# Preview the first five rows without truncating long cell values.
df_orders.show(n=5, truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
+-----------+---------------------+--------+---------------+
only showing top 5 rows



In [8]:
# List the databases visible to this Spark session. The query result is left
# as the cell's last expression rather than being printed with .show().
# NOTE(review): the tabular rendering presumably depends on
# spark.sql.repl.eagerEval.enabled being set — confirm for this environment.
spark.sql("show databases")

namespace
default


In [9]:
# Select every column via '*' and preview five rows; long values are
# truncated because show() is left at its default truncation setting.
(
    df_orders
    .select('*')
    .show(n=5)
)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



In [10]:
# Replace the string-typed order_date column with its timestamp-parsed value
# (no explicit format is passed, so to_timestamp uses its default parsing).
df_orders = df_orders.withColumn(
    'order_date',
    F.to_timestamp(F.col('order_date')),
)

In [11]:
# Print the schema again to check the type of order_date after the to_timestamp conversion.
df_orders.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [12]:
# Preview five rows to see how order_date renders after the conversion.
df_orders.show(n=5)

+-----------+-------------------+--------+---------------+
|customer_id|         order_date|order_id|   order_status|
+-----------+-------------------+--------+---------------+
|      11599|2013-07-25 00:00:00|       1|         CLOSED|
|        256|2013-07-25 00:00:00|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:00|       3|       COMPLETE|
|       8827|2013-07-25 00:00:00|       4|         CLOSED|
|      11318|2013-07-25 00:00:00|       5|       COMPLETE|
+-----------+-------------------+--------+---------------+
only showing top 5 rows



In [13]:
# Rename the order_status column to the shorter name 'status'.
df_orders = df_orders.withColumnRenamed(
    existing='order_status',
    new='status',
)

In [14]:
# Print the schema again to check the column names after the rename.
df_orders.printSchema()

root
 |-- customer_id: long (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_id: long (nullable = true)
 |-- status: string (nullable = true)



In [15]:
# Preview five rows of the DataFrame with the renamed column.
df_orders.show(n=5)

+-----------+-------------------+--------+---------------+
|customer_id|         order_date|order_id|         status|
+-----------+-------------------+--------+---------------+
|      11599|2013-07-25 00:00:00|       1|         CLOSED|
|        256|2013-07-25 00:00:00|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:00|       3|       COMPLETE|
|       8827|2013-07-25 00:00:00|       4|         CLOSED|
|      11318|2013-07-25 00:00:00|       5|       COMPLETE|
+-----------+-------------------+--------+---------------+
only showing top 5 rows



In [16]:
# Project only the customer_id and status columns and preview five rows.
(
    df_orders
    .select("customer_id", "status")
    .show(n=5)
)

+-----------+---------------+
|customer_id|         status|
+-----------+---------------+
|      11599|         CLOSED|
|        256|PENDING_PAYMENT|
|      12111|       COMPLETE|
|       8827|         CLOSED|
|      11318|       COMPLETE|
+-----------+---------------+
only showing top 5 rows



In [17]:
# Final cell of the notebook: stop the SparkSession created at the top.
spark.stop()