In [1]:
import findspark
findspark.init()
findspark.find()

from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
            SparkSession
                .builder
                .appName("DeltaLakeApp")
    
                .master("local[4]")    
                .config("spark.dynamicAllocation.enabled", "false")     
    
    
                # Add package for Delta Lake
                .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    
    
                # Add settings to use Delta Lake with Spark session
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    
                .config("spark.sql.catalog.spark_catalog", 
                        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    
                .getOrCreate()
        )

sc = spark.sparkContext

spark

In [5]:
spark.sql("""

CREATE DATABASE IF NOT EXISTS TaxisDB

""")

DataFrame[]

In [6]:




spark.sql("""

CREATE TABLE TaxisDB.YellowTaxisdelta
(
    VendorId                int ,
lpep_pickup_datetime    timestamp,
lpep_dropoff_datetime   timestamp,
passenger_count         Double,
trip_distance          Double,
RatecodeID            Double,  
store_and_fwd_flag     String, 
PULocationID           int , 
DOLocationID           int ,
payment_type           int ,
fare_amount             Double, 
extra                  Double,  
mta_tax                 Double,  
tip_amount              Double,  
tolls_amount            Double,  
improvement_surcharge   Double, 
total_amount           Double, 
congestion_surcharge   Double, 
airport_fee            Double  
)

USING DELTA                  -- default is Parquet

LOCATION "C:/DataFiles/deltaThings/deltaNotebook/YellowTaxis.delta"

PARTITIONED BY (VendorId)    -- optional

COMMENT 'This table stores ride information for Yellow Taxis'

""")

DataFrame[]

In [8]:
spark.sql("""

DESCRIBE TABLE EXTENDED TaxisDB.YellowTaxisdelta

""").show(50, truncate=False)

+----------------------------+--------------------------------------------------------------+-------+
|col_name                    |data_type                                                     |comment|
+----------------------------+--------------------------------------------------------------+-------+
|VendorId                    |int                                                           |NULL   |
|lpep_pickup_datetime        |timestamp                                                     |NULL   |
|lpep_dropoff_datetime       |timestamp                                                     |NULL   |
|passenger_count             |double                                                        |NULL   |
|trip_distance               |double                                                        |NULL   |
|RatecodeID                  |double                                                        |NULL   |
|store_and_fwd_flag          |string                                              

In [9]:
spark.sql("""

INSERT INTO TaxisDB.YellowTaxisdelta

-- (VendorId, PickupTime, DropTime, PickupLocationId, DropLocationId, PassengerCount, TripDistance, RateCodeId, StoreAndFwdFlag, PaymentType, FareAmount, Extra, MtaTax, TipAmount, TollsAmount, ImprovementSurcharge, TotalAmount, CongestionSurcharge, AirportFee)

VALUES (3, '2022-12-01T00:00:00.000Z', '2022-12-01T00:15:34.000Z', 170, 140, 1.0, 2.9, 1.0, '1', 1, 13.0, 0.5, 0.5, 1.0, 0.0, 0.3, 15.3, 0.0, 0.0)

""")

DataFrame[]

In [10]:
spark.sql("""

SELECT * FROM TaxisDB.YellowTaxisdelta

""").show(truncate=False)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|3       |2022-12-01 05:30:00 |2022-12-01 05:45:34  |170.0          |140.0        |1.0       |2.9               |1           |1           |1           |13.0       |0.5  |0.5    |1.0      

In [12]:
spark.sql("""

DESCRIBE HISTORY TaxisDB.YellowTaxisdelta

""").show(truncate=False)

+-------+-----------------------+------+--------+------------+---------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation   |operationParameters                                                                                                                    |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                           |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+------------+---------------------------------------------------------------------------------------------------------------------------------------+----+--------+---------+-----------+--------------+-----