In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F

In [2]:
# Name the Spark application after the current OS user and attach to the
# YARN master, reusing an already-running session if one exists.
user = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user}-first-dataframe-program')
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Bare expression as the last line of the cell: the notebook displays the
# session object so the reader can confirm it was created.
spark

In [4]:
# List the crime CSV directory on HDFS; -h prints sizes in human-readable units.
!hadoop fs -ls -h /public/crime/csv

Found 1 items
-rw-r--r--   2 hdfs supergroup      1.4 G 2021-01-28 09:08 /public/crime/csv/crime_data.csv


In [5]:
# Print the beginning of the CSV to check the header row and the field layout
# before loading it.
!hadoop fs -head /public/crime/csv/crime_data.csv

ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
5679862,HN487108,07/24/2007 10:11:00 PM,054XX S ABERDEEN ST,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,false,false,0934,009,16,61,14,1169912,1868555,2007,04/15/2016 08:55:02 AM,41.794811309,-87.652466989,"(41.794811309, -87.652466989)"
5679863,HN488302,07/24/2007 01:00:00 PM,082XX S TALMAN AVE,0460,BATTERY,SIMPLE,STREET,false,false,0835,008,18,70,08B,1160134,1850078,2007,04/15/2016 08:55:02 AM,41.744314668,-87.688830696,"(41.744314668, -87.688830696)"
5679864,HN487195,07/24/2007 02:20:00 PM,034XX N MILWAUKEE AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,false,true,1731,017,30,16,08B,1148204,1922490,2007,04/15/2016 08:55:02 AM,41.943259605,-87.730682304,"(41.943259605, -87.730682304)"
5679865,HN484199,07/21/2007 12:30:00 PM,035XX W BELMONT AVE,0820,THEFT,$500 AND UNDER,STREET,false,f

In [6]:
# Load every file under the crime CSV directory. The first line of each file
# is treated as the header, and schema inference is switched off, so every
# column comes back as a string.
df = spark.read.csv(
    '/public/crime/csv/*',
    header=True,
    inferSchema=False,
)

In [7]:
# Print the column names and types; because inferSchema was disabled on the
# read, every column is reported as a string.
df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [8]:
# Preview the first 5 rows without cutting off long cell values.
df.show(n=5, truncate=False)

+-------+-----------+----------------------+---------------------+----+---------------+-----------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------------------+------------+-------------+-----------------------------+
|ID     |Case Number|Date                  |Block                |IUCR|Primary Type   |Description            |Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|Updated On            |Latitude    |Longitude    |Location                     |
+-------+-----------+----------------------+---------------------+----+---------------+-----------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+----------------------+------------+-------------+-----------------------------+
|5679862|HN487108   |07/24/2007 10:11:00 PM|054XX S ABERDEEN ST  |1320|CRIMINAL DAMAGE|TO V

### Read JSON File

In [9]:
# Confirm the orders JSON file exists on HDFS and check its size
# (-h: human-readable units).
!hadoop fs -ls -h /public/trendytech/datasets/orders.json

-rw-r--r--   3 itv005857 supergroup      6.7 M 2023-05-04 07:46 /public/trendytech/datasets/orders.json


In [10]:
# Peek at the start of the file: it holds one JSON object per line.
!hadoop fs -head /public/trendytech/datasets/orders.json

{"order_id":1,"order_date":"2013-07-25 00:00:00.0","customer_id":11599,"order_status":"CLOSED"}
{"order_id":2,"order_date":"2013-07-25 00:00:00.0","customer_id":256,"order_status":"PENDING_PAYMENT"}
{"order_id":3,"order_date":"2013-07-25 00:00:00.0","customer_id":12111,"order_status":"COMPLETE"}
{"order_id":4,"order_date":"2013-07-25 00:00:00.0","customer_id":8827,"order_status":"CLOSED"}
{"order_id":5,"order_date":"2013-07-25 00:00:00.0","customer_id":11318,"order_status":"COMPLETE"}
{"order_id":6,"order_date":"2013-07-25 00:00:00.0","customer_id":7130,"order_status":"COMPLETE"}
{"order_id":7,"order_date":"2013-07-25 00:00:00.0","customer_id":4530,"order_status":"COMPLETE"}
{"order_id":8,"order_date":"2013-07-25 00:00:00.0","customer_id":2911,"order_status":"PROCESSING"}
{"order_id":9,"order_date":"2013-07-25 00:00:00.0","customer_id":5657,"order_status":"PENDING_PAYMENT"}
{"order_id":10,"order_date":"2013-07-25 00:00:00.0","customer_id":5648,"order_status":"PENDING_PAYMENT"}
{"order_

In [11]:
# Read the line-delimited orders JSON; the resulting columns are named after
# the JSON keys. Note: this rebinds `df`, replacing the crime DataFrame.
df = spark.read.json("/public/trendytech/datasets/orders.json")

In [12]:
# Preview the first 5 orders with full, untruncated column values.
df.show(n=5, truncate=False)

+-----------+---------------------+--------+---------------+
|customer_id|order_date           |order_id|order_status   |
+-----------+---------------------+--------+---------------+
|11599      |2013-07-25 00:00:00.0|1       |CLOSED         |
|256        |2013-07-25 00:00:00.0|2       |PENDING_PAYMENT|
|12111      |2013-07-25 00:00:00.0|3       |COMPLETE       |
|8827       |2013-07-25 00:00:00.0|4       |CLOSED         |
|11318      |2013-07-25 00:00:00.0|5       |COMPLETE       |
+-----------+---------------------+--------+---------------+
only showing top 5 rows



### Read PARQUET File

In [13]:
# List the Parquet dataset directory (-h: human-readable sizes); the data is
# stored as several part files rather than a single file.
!hadoop fs -ls -h /public/trendytech/datasets/ordersparquet

Found 3 items
-rw-r--r--   3 itv005857 supergroup          0 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/_SUCCESS
-rw-r--r--   3 itv005857 supergroup    389.0 K 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/part-00000-95703d6e-20f5-43be-94eb-1fade70892f8-c000.snappy.parquet
-rw-r--r--   3 itv005857 supergroup    124.4 K 2023-05-04 07:58 /public/trendytech/datasets/ordersparquet/part-00001-95703d6e-20f5-43be-94eb-1fade70892f8-c000.snappy.parquet


In [14]:
# Read the Parquet dataset by pointing at its directory rather than at the
# individual part files. Rebinds `df` to the Parquet-backed orders.
df = spark.read.parquet('/public/trendytech/datasets/ordersparquet')

In [15]:
# Preview the first 5 rows; long values such as order_date are cut off in
# the output because truncation is left at its default.
df.show(n=5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



### Read ORC File

In [16]:
# Read the ORC copy of the orders dataset. Rebinds `df`; the later filter and
# temp-view cells operate on this DataFrame.
df = spark.read.orc('/public/trendytech/datasets/ordersorc')

In [17]:
# Preview the first 5 rows of the ORC-backed orders.
df.show(n=5)

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
+-----------+--------------------+--------+---------------+
only showing top 5 rows



### Filter Rows
1. Using where `df.where('col == val')`
2. Using filter `df.filter('col == val')`

In [18]:
# Keep only the orders of customer 11599 (SQL-expression string predicate)
# and print at most 5 of them.
(
    df
    .where('customer_id == 11599')
    .show(n=5)
)

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|      11599|2013-10-03 00:00:...|   11397|    COMPLETE|
|      11599|2013-12-20 00:00:...|   23908|    COMPLETE|
|      11599|2014-06-27 00:00:...|   53545|     PENDING|
|      11599|2013-10-17 00:00:...|   59911|  PROCESSING|
+-----------+--------------------+--------+------------+



In [19]:
# Keep only CLOSED orders and print at most 5 of them.
# The string literal inside the SQL expression is single-quoted: that is the
# standard SQL form, whereas a double-quoted token is parsed as an identifier
# when ANSI double-quoted identifiers are enabled.
df.filter("order_status == 'CLOSED'").show(5)

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      11599|2013-07-25 00:00:...|       1|      CLOSED|
|       8827|2013-07-25 00:00:...|       4|      CLOSED|
|       1837|2013-07-25 00:00:...|      12|      CLOSED|
|       1205|2013-07-25 00:00:...|      18|      CLOSED|
|      11441|2013-07-25 00:00:...|      24|      CLOSED|
+-----------+--------------------+--------+------------+
only showing top 5 rows



### Create Spark Table from Dataframe (df -> table)

In [20]:
# Register the current DataFrame as a temporary view named `orders` so that
# it can be queried through spark.sql() in the cells that follow.
df.createOrReplaceTempView('orders')

In [21]:
# Query the `orders` view with SQL, keeping only COMPLETE orders.
# The string literal is single-quoted (standard SQL); a double-quoted token
# is parsed as an identifier when ANSI double-quoted identifiers are enabled.
# Note: this rebinds `df` to the filtered result; the `orders` view itself
# is unchanged.
df = spark.sql("select * from orders where order_status = 'COMPLETE'")

In [22]:
# Display the leading rows of the SQL result to confirm that only COMPLETE
# orders remain.
df.show()

+-----------+--------------------+--------+------------+
|customer_id|          order_date|order_id|order_status|
+-----------+--------------------+--------+------------+
|      12111|2013-07-25 00:00:...|       3|    COMPLETE|
|      11318|2013-07-25 00:00:...|       5|    COMPLETE|
|       7130|2013-07-25 00:00:...|       6|    COMPLETE|
|       4530|2013-07-25 00:00:...|       7|    COMPLETE|
|       2568|2013-07-25 00:00:...|      15|    COMPLETE|
|       2667|2013-07-25 00:00:...|      17|    COMPLETE|
|        333|2013-07-25 00:00:...|      22|    COMPLETE|
|       7562|2013-07-25 00:00:...|      26|    COMPLETE|
|        656|2013-07-25 00:00:...|      28|    COMPLETE|
|       3960|2013-07-25 00:00:...|      32|    COMPLETE|
|       4840|2013-07-25 00:00:...|      35|    COMPLETE|
|       2636|2013-07-25 00:00:...|      45|    COMPLETE|
|      10519|2013-07-25 00:00:...|      56|    COMPLETE|
|       1148|2013-07-25 00:00:...|      63|    COMPLETE|
|       5903|2013-07-25 00:00:.

### Create Dataframe from Spark Table (table -> df)

In [23]:
# Turn the `orders` view back into a DataFrame. This picks up the full view
# (every order status), not the COMPLETE-only result `df` held before.
df = spark.table('orders')

In [24]:
# Display the leading rows read back from the `orders` view; all order
# statuses are present again.
df.show()

+-----------+--------------------+--------+---------------+
|customer_id|          order_date|order_id|   order_status|
+-----------+--------------------+--------+---------------+
|      11599|2013-07-25 00:00:...|       1|         CLOSED|
|        256|2013-07-25 00:00:...|       2|PENDING_PAYMENT|
|      12111|2013-07-25 00:00:...|       3|       COMPLETE|
|       8827|2013-07-25 00:00:...|       4|         CLOSED|
|      11318|2013-07-25 00:00:...|       5|       COMPLETE|
|       7130|2013-07-25 00:00:...|       6|       COMPLETE|
|       4530|2013-07-25 00:00:...|       7|       COMPLETE|
|       2911|2013-07-25 00:00:...|       8|     PROCESSING|
|       5657|2013-07-25 00:00:...|       9|PENDING_PAYMENT|
|       5648|2013-07-25 00:00:...|      10|PENDING_PAYMENT|
|        918|2013-07-25 00:00:...|      11| PAYMENT_REVIEW|
|       1837|2013-07-25 00:00:...|      12|         CLOSED|
|       9149|2013-07-25 00:00:...|      13|PENDING_PAYMENT|
|       9842|2013-07-25 00:00:...|      

In [25]:
# Last cell of the notebook: stop the SparkSession created at the top.
# Any cell re-run after this point needs the session to be created again.
spark.stop()