In [0]:
-- Create the `main` catalog if it is not already present; a no-op otherwise.
CREATE CATALOG IF NOT EXISTS main;

-- Create the `bronze` schema inside `main` if it is not already present.
-- The ingestion cells below write their tables as main.bronze.<dataset>.
CREATE SCHEMA IF NOT EXISTS main.bronze;

In [0]:
%python

# Deployment coordinates that make up the bucket name (fixed to dev / us-east-2 here).
env = 'dev'
aws_region = 'us-east-2'

# The AWS account id is read from the 'db-scope' secret scope rather than hard-coded.
aws_account_id = dbutils.secrets.get(scope='db-scope', key='aws_account_id')

# Bucket root shared by every ingestion cell below.
s3_path = "s3://ifood-architect-taxi-case-{}-{}-{}".format(env, aws_region, aws_account_id)

In [0]:
%python


def autoloader_config(s3_path: str, key: str, schema_hint: str, file_format: str = "parquet"):
  """Start an Auto Loader stream that appends one dataset's raw files to a bronze Delta table.

  Layout derived from the arguments:
    * source files:        ``<s3_path>/<key>/raw``
    * checkpoint + schema: ``<s3_path>/<key>/_checkpoint``
    * Delta table data:    ``<s3_path>/<key>/delta_table_metadata``
    * target table:        ``main.bronze.<key>``

  Relies on the ``spark`` session that the notebook runtime provides as a global.

  Args:
    s3_path: Base location (e.g. ``s3://bucket``). A trailing ``/`` is ignored so
      the derived paths never contain an empty ``//`` segment.
    key: Dataset name; used both as the sub-prefix under ``s3_path`` and as the
      table name inside ``main.bronze``.
    schema_hint: DDL-style column list passed to ``cloudFiles.schemaHints``.
    file_format: Value for ``cloudFiles.format``. Defaults to ``"parquet"``, which
      is what this function always used before the parameter existed.

  Returns:
    The ``StreamingQuery`` handle returned by ``toTable``. The query is triggered
    with ``availableNow=True``; callers that need to block until it finishes, or
    to surface its failure, can call ``awaitTermination()`` on the handle.

  Raises:
    ValueError: If ``s3_path`` or ``key`` is empty.
  """
  if not s3_path or not key:
    # Fail before any stream is started: an empty piece would yield a malformed
    # path and a table name like "main.bronze.".
    raise ValueError("s3_path and key must be non-empty strings")

  # Strip a trailing slash so "s3://bucket/" and "s3://bucket" resolve to the
  # same prefix instead of "s3://bucket//<key>".
  source_path = f"{s3_path.rstrip('/')}/{key}"
  raw_path = f"{source_path}/raw"
  table_name = f"main.bronze.{key}"

  # The same directory holds the stream checkpoint and the inferred schema.
  checkpoint_path = f"{source_path}/_checkpoint"
  table_path = f"{source_path}/delta_table_metadata"

  df = (spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", file_format)
        .option("cloudFiles.schemaHints", schema_hint)
        .option("cloudFiles.schemaLocation", checkpoint_path)
        .load(raw_path)
  )

  # Hand the query back to the caller instead of discarding it.
  return (df.writeStream
          .format("delta")
          .outputMode("append")
          .option("checkpointLocation", checkpoint_path)
          .option("path", table_path)
          .trigger(availableNow=True)
          .toTable(table_name)
  )

In [0]:
%python

# Yellow taxi trips: column type hints handed to Auto Loader, then the ingestion
# run that targets main.bronze.yellow_tripdata.

yellow_schema_hint = """
  VendorID INT,
  tpep_pickup_datetime TIMESTAMP,
  tpep_dropoff_datetime TIMESTAMP,
  passenger_count BIGINT,
  trip_distance DOUBLE,
  RatecodeID BIGINT,
  store_and_fwd_flag STRING,
  PULocationID INT,
  DOLocationID INT,
  payment_type BIGINT,
  fare_amount DOUBLE,
  extra DOUBLE,
  mta_tax DOUBLE,
  tip_amount DOUBLE,
  tolls_amount DOUBLE,
  improvement_surcharge DOUBLE,
  total_amount DOUBLE,
  congestion_surcharge DOUBLE,
  Airport_fee DOUBLE
"""

autoloader_config(
    s3_path=s3_path,
    key="yellow_tripdata",
    schema_hint=yellow_schema_hint,
)

In [0]:
%python

# Green taxi trips: column type hints handed to Auto Loader, then the ingestion
# run that targets main.bronze.green_tripdata.

green_schema_hint = """
    VendorID INT,
    lpep_pickup_datetime TIMESTAMP,
    lpep_dropoff_datetime TIMESTAMP,
    store_and_fwd_flag STRING,
    RatecodeID BIGINT,
    PULocationID INT,
    DOLocationID INT,
    passenger_count BIGINT,
    trip_distance DOUBLE,
    fare_amount DOUBLE,
    extra DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE,
    ehail_fee DOUBLE,
    improvement_surcharge DOUBLE,
    total_amount DOUBLE,
    payment_type BIGINT,
    trip_type BIGINT,
    congestion_surcharge DOUBLE
"""

autoloader_config(
    s3_path=s3_path,
    key="green_tripdata",
    schema_hint=green_schema_hint,
)

In [0]:
%python

# FHV trips: column type hints handed to Auto Loader, then the ingestion run
# that targets main.bronze.fhv_tripdata.

fhv_schema_hint = """
    dispatching_base_num STRING,
    pickup_datetime TIMESTAMP,
    dropOff_datetime TIMESTAMP,
    PUlocationID DOUBLE,
    DOlocationID DOUBLE,
    SR_Flag INT,
    Affiliated_base_number STRING
"""

autoloader_config(
    s3_path=s3_path,
    key="fhv_tripdata",
    schema_hint=fhv_schema_hint,
)

In [0]:
%python

# FHVHV trips: column type hints handed to Auto Loader, then the ingestion run
# that targets main.bronze.fhvhv_tripdata.

fhvhv_schema_hint = """
    hvfhs_license_num STRING,
    dispatching_base_num STRING,
    originating_base_num STRING,
    request_datetime TIMESTAMP,
    on_scene_datetime TIMESTAMP,
    pickup_datetime TIMESTAMP,
    dropoff_datetime TIMESTAMP,
    PULocationID BIGINT,
    DOLocationID BIGINT,
    trip_miles DOUBLE,
    trip_time BIGINT,
    base_passenger_fare DOUBLE,
    tolls DOUBLE,
    bcf DOUBLE,
    sales_tax DOUBLE,
    congestion_surcharge DOUBLE,
    airport_fee DOUBLE,
    tips DOUBLE,
    driver_pay DOUBLE,
    shared_request_flag STRING,
    shared_match_flag STRING,
    access_a_ride_flag STRING,
    wav_request_flag STRING,
    wav_match_flag STRING
"""

autoloader_config(
    s3_path=s3_path,
    key="fhvhv_tripdata",
    schema_hint=fhvhv_schema_hint,
)