In [0]:
-- Create the silver schema in the `main` catalog explicitly: the rest of this
-- notebook reads from main.bronze and writes to main.silver, so the schema must
-- not depend on whatever catalog the session currently has selected.
CREATE SCHEMA IF NOT EXISTS main.silver;

In [0]:
-- List the tables available in the bronze schema of the `main` catalog.
SHOW TABLES IN main.bronze;

In [0]:
%python
from pyspark.sql.functions import to_date, col, concat, date_format, lit, now, max
from pyspark.sql.dataframe import DataFrame


def load_into_silver(table_name: str, pickup_datetime: str, dropoff_datetime: str) -> None:
    """Copy ``main.bronze.<table_name>`` into ``main.silver.<table_name>``.

    The silver table keeps ``VendorID``, ``passenger_count``, ``total_amount``
    and the two datetime columns, and adds:

    * ``date_pickup``     - the pickup datetime converted with ``to_date``;
    * ``month_partition`` - the pickup month as a ``'yyyy-MM-01'`` string, used
      as the partition column;
    * ``updated_at``      - the value of ``now()`` for this load.

    Load strategy:

    * Full load when the silver table does not exist, or exists but holds no
      non-null ``month_partition`` (e.g. it was created from an empty bronze
      table). Without this fallback the incremental branch would compare
      against ``None``, select no rows, and never populate the table.
    * Otherwise only months ``>=`` the newest month already in silver are
      rewritten via ``replaceWhere`` (assumes the silver table is a Delta
      table, since ``replaceWhere`` is a Delta Lake writer option).

    Args:
        table_name: Table name shared by the bronze source and silver target.
        pickup_datetime: Name of the pickup timestamp column in the source.
        dropoff_datetime: Name of the dropoff timestamp column in the source.

    Returns:
        None. The function only writes to ``main.silver.<table_name>``.
    """
    source_table = f"main.bronze.{table_name}"
    target_table = f"main.silver.{table_name}"

    df = spark.table(source_table) \
        .select(
            "VendorID"
            , "passenger_count"
            , "total_amount"
            , col(pickup_datetime)
            , col(dropoff_datetime)
            , to_date(col(pickup_datetime)).alias("date_pickup")
            , concat(date_format(col(pickup_datetime), 'yyyy-MM'), lit('-01')).alias("month_partition")
            , now().alias("updated_at")
        )

    # Newest month already present in silver. Stays None when the table is
    # missing, or when the aggregate returns NULL (no rows / only NULL months).
    # NOTE: `max` is the Spark aggregate imported at the top of the cell, not
    # the Python builtin.
    last_month = None
    if spark.catalog.tableExists(target_table):
        last_month = spark.table(target_table).agg(max(col("month_partition"))).collect()[0][0]

    if last_month is None:
        # Nothing to resume from: (re)build the whole table.
        df.write.mode("overwrite").partitionBy("month_partition").saveAsTable(target_table)
        return

    # Incremental refresh: rewrite only the latest known month and anything
    # newer. 'yyyy-MM-01' strings sort chronologically, so string comparison
    # is sufficient here.
    (
        df.filter(col("month_partition") >= last_month)
        .write.mode("overwrite")
        .partitionBy("month_partition")
        .option("replaceWhere", f"month_partition >= '{last_month}'")
        .saveAsTable(target_table)
    )


In [0]:
%python

# Yellow taxi trips: the datetime columns carry the `tpep_` prefix.
load_into_silver(
    table_name="yellow_tripdata",
    pickup_datetime="tpep_pickup_datetime",
    dropoff_datetime="tpep_dropoff_datetime",
)

In [0]:
%python

# Green taxi trips: the datetime columns carry the `lpep_` prefix.
load_into_silver(
    table_name="green_tripdata",
    pickup_datetime="lpep_pickup_datetime",
    dropoff_datetime="lpep_dropoff_datetime",
)