In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:0.7.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config('spark.ui.port', '4050') \
    .getOrCreate()

In [4]:
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet

--2022-06-12 18:18:52--  https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.200.24
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.200.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21686067 (21M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2021-01.parquet’


2022-06-12 18:18:57 (4.36 MB/s) - ‘yellow_tripdata_2021-01.parquet’ saved [21686067/21686067]



In [5]:
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-02.parquet

--2022-06-12 18:19:05--  https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-02.parquet
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.200.24
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.200.24|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21777258 (21M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2021-02.parquet’


2022-06-12 18:19:08 (8.19 MB/s) - ‘yellow_tripdata_2021-02.parquet’ saved [21777258/21777258]



In [43]:
df = spark.read.parquet('yellow_tripdata_2021-01.parquet')

In [44]:
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [46]:
delta_dir = ('./output/delta')
spark.sql('CREATE DATABASE IF NOT EXISTS RIDES')

spark.sql('''
    CREATE TABLE IF NOT EXISTS RIDES.YELLOW_TAXI(
         VendorID long,
         tpep_pickup_datetime timestamp,
         tpep_dropoff_datetime timestamp,
         passenger_count double,
         trip_distance double,
         RatecodeID double,
         store_and_fwd_flag string,
         PULocationID long,
         DOLocationID long,
         payment_type long,
         fare_amount double,
         extra double,
         mta_tax double,
         tip_amount double,
         tolls_amount double,
         improvement_surcharge double,
         total_amount double,
         congestion_surcharge double,
         airport_fee double
    ) USING DELTA
    LOCATION "{0}"
    '''.format(delta_dir)
)

DataFrame[]

In [47]:
df.createOrReplaceTempView('load_table')

In [48]:
def update():
    spark.sql("""MERGE INTO RIDES.YELLOW_TAXI
    USING load_table
       ON  RIDES.YELLOW_TAXI.VendorID = load_table.VendorID and
           RIDES.YELLOW_TAXI.tpep_pickup_datetime = load_table.tpep_pickup_datetime and
           RIDES.YELLOW_TAXI.tpep_dropoff_datetime = load_table.tpep_dropoff_datetime and
           RIDES.YELLOW_TAXI.PULocationID = load_table.PULocationID and
           RIDES.YELLOW_TAXI.DOLocationID = load_table.DOLocationID 
     WHEN NOT MATCHED THEN
          INSERT (VendorID,
            tpep_pickup_datetime,
            tpep_dropoff_datetime,
            passenger_count,
            trip_distance,
            RatecodeID,
            store_and_fwd_flag,
            PULocationID,
            DOLocationID,
            payment_type,
            fare_amount,
            extra,
            mta_tax,
            tip_amount,
            tolls_amount,
            improvement_surcharge,
            total_amount,
            congestion_surcharge,
            airport_fee) VALUES (VendorID,
            tpep_pickup_datetime,
            tpep_dropoff_datetime,
            passenger_count,
            trip_distance,
            RatecodeID,
            store_and_fwd_flag,
            PULocationID,
            DOLocationID,
            payment_type,
            fare_amount,
            extra,
            mta_tax,
            tip_amount,
            tolls_amount,
            improvement_surcharge,
            total_amount,
            congestion_surcharge,
            airport_fee)
    """)

In [49]:
spark.sql("SELECT count(1) FROM RIDES.YELLOW_TAXI").show()

+--------+
|count(1)|
+--------+
| 2741478|
+--------+



In [40]:
spark.read.parquet('yellow_tripdata_2021-01.parquet').createOrReplaceTempView('load_table')

In [41]:
update()

In [42]:
spark.sql("SELECT count(1) FROM RIDES.YELLOW_TAXI").show()

+--------+
|count(1)|
+--------+
| 2741478|
+--------+



In [39]:
spark.sql("SELECT COUNT(1) FROM load_table").show()

+--------+
|count(1)|
+--------+
| 1369769|
+--------+



In [50]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, './output/delta')

fullHistoryDF = deltaTable.history()    # get the full history of the table

In [53]:
fullHistoryDF.show()

+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|version|           timestamp|userId|userName|   operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|
+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|      3|2022-06-12 18:58:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     null|          2|          null|        false|[numTargetRowsCop...|        null|
|      2|2022-06-12 18:57:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     null|          1|          null|        false|[numTargetRowsCop...|        null|
|      1|2022-06-12 18:53:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     n

In [55]:
deltaTable.history(0).count()

0

In [56]:
deltaTable.history(1).count()

1

In [57]:
deltaTable.history(2).count()

2

In [63]:
spark.sql("DESCRIBE HISTORY './output/delta/'").show()

+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|version|           timestamp|userId|userName|   operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|
+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+
|      3|2022-06-12 18:58:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     null|          2|          null|        false|[numTargetRowsCop...|        null|
|      2|2022-06-12 18:57:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     null|          1|          null|        false|[numTargetRowsCop...|        null|
|      1|2022-06-12 18:53:...|  null|    null|       MERGE|[predicate -> (((...|null|    null|     n

In [68]:
spark.sql('SELECT * FROM RIDES.YELLOW_TAXI VERSION AS OF 0')

ParseException: 
extraneous input 'AS' expecting {<EOF>, ';'}(line 1, pos 41)

== SQL ==
SELECT * FROM RIDES.YELLOW_TAXI VERSION; AS OF 0
-----------------------------------------^^^
