In [3]:
import findspark
findspark.init()
findspark.find()

from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
            SparkSession
                .builder
                .appName("DeltaLakeApp")
    
                .master("local[4]")    
                .config("spark.dynamicAllocation.enabled", "false")     
    
    
                # Add package for Delta Lake
                .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    
    
                # Add settings to use Delta Lake with Spark session
                .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    
                .config("spark.sql.catalog.spark_catalog", 
                        "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    
                .getOrCreate()
        )

sc = spark.sparkContext

spark

In [5]:
# Create schema for Yellow Taxi Data
 
taxiSchema = (
                    StructType
                    ([ 
                        StructField("VendorId"               , IntegerType()   , True),
                        StructField("lpep_pickup_datetime"   , TimestampType() , True),
                        StructField("lpep_dropoff_datetime"  , TimestampType() , True),                            
                        StructField("passenger_count"        , DoubleType()    , True),
                        StructField("trip_distance"          , DoubleType()    , True),
                        StructField("RatecodeID"             , DoubleType()    , True),                            
                        StructField("store_and_fwd_flag"     , StringType()    , True),
                        StructField("PULocationID"           , IntegerType()   , True),
                        StructField("DOLocationID"           , IntegerType()   , True),                            
                        StructField("payment_type"           , IntegerType()   , True),                            
                        StructField("fare_amount"            , DoubleType()    , True),
                        StructField("extra"                  , DoubleType()    , True),
                        StructField("mta_tax"                , DoubleType()    , True),
                        StructField("tip_amount"             , DoubleType()    , True),
                        StructField("tolls_amount"           , DoubleType()    , True),
                        StructField("improvement_surcharge"  , DoubleType()    , True),
                        StructField("total_amount"           , DoubleType()    , True),
                        StructField("congestion_surcharge"   , DoubleType()    , True),
                        StructField("airport_fee"            , DoubleType()    , True)
                    ])
               )

In [6]:
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)

yellowTaxiDf.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [7]:
spark.sql("""
create database if not exists TaxisDB
""")

DataFrame[]

In [9]:
(
    yellowTaxiDf
    .write
    .mode("overwrite")
    .partitionBy("VendorId")
    .format("parquet")
    .option("path","C:\DataFiles\DeltaLakeOutput\yellow.Taxis.parquet")
    .saveAsTable("TaxisDB.YellowTaxisParquet")
)

In [10]:
from delta import *

(
    yellowTaxiDf
    .write
    .mode("overwrite")
    .partitionBy("VendorId")
    .format("delta")
    .option("path","C:\DataFiles\DeltaLakeOutput\yellow.Taxis.delta")
    .saveAsTable("TaxisDB.YellowTaxisDelta")
)

In [15]:
spark.sql("""

DROP TABLE TaxisDB.YellowTaxis

""")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `spark_catalog`.`TaxisDB`.`YellowTaxis` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.

In [14]:
spark.sql("""

CREATE TABLE TaxisDB.YellowTaxisdelta
(
    VendorId                int ,
lpep_pickup_datetime    timestamp,
lpep_dropoff_datetime   timestamp,
passenger_count         Double,
trip_distance          Double,
RatecodeID            Double,  
store_and_fwd_flag     String, 
PULocationID           int , 
DOLocationID           int ,
payment_type           int ,
fare_amount             Double, 
extra                  Double,  
mta_tax                 Double,  
tip_amount              Double,  
tolls_amount            Double,  
improvement_surcharge   Double, 
total_amount           Double, 
congestion_surcharge   Double, 
airport_fee            Double  
)

USING DELTA                  -- default is Parquet

LOCATION "C:/DataFiles/deltaThings/deltaNotebook/YellowTaxis.delta"

PARTITIONED BY (VendorId)    -- optional

COMMENT 'This table stores ride information for Yellow Taxis'

""")

AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `TaxisDB`.`YellowTaxisdelta` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.