In [None]:
import datetime as dt
import functools
import os
from pathlib import Path
import time

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import DateType, DoubleType, StructField, StructType, StringType, TimestampType

# --- Configuration -----------------------------------------------------------
# Directory holding the staged facility CSV files.
STAGING_DIR = Path('/staging/facility_info')
# Deliberately ends in a single slash: the schema LOCATION is built as
# '{WAREHOUSE_ROOT}/{CATALOG}/{NAMESPACE}', which supplies the second slash
# and renders as 's3://warehouse/bronze'.
WAREHOUSE_ROOT = 's3:/'
CATALOG = 'warehouse'
NAMESPACE = 'bronze'
CYCLE_TABLE = 'cycle'
# Fully qualified table name: catalog.namespace.table.
CYCLE_TABLE_FQN = f"{CATALOG}.{NAMESPACE}.{CYCLE_TABLE}"
# Time of day (HH:MM:SS) attached to the date-only values in the cycle CSV.
CYCLE_START_TIME_DEFAULT = "08:30:00"

# The Spark Connect endpoint comes from the SPARK_REMOTE environment variable;
# a missing variable raises KeyError here rather than failing later.
spark = SparkSession.builder.remote(os.environ["SPARK_REMOTE"]).getOrCreate()

In [None]:
def read_cycle(filepath: Path) -> DataFrame:
    """Read the cycle CSV at ``filepath`` into a DataFrame shaped like the cycle table.

    The file is read with a header row and without schema inference, so the
    columns arrive as strings. The ``cycle``, ``start_date`` and ``end_date``
    columns are renamed to ``name``, ``started`` and ``ended``, and the two
    date columns (``dd/mm/YYYY`` text) are converted to timestamps whose time
    of day is ``CYCLE_START_TIME_DEFAULT``.

    Args:
        filepath: Location of the CSV file to read.

    Returns:
        A DataFrame with ``name``, ``started`` and ``ended`` columns; any
        other columns present in the file are passed through unchanged.

    Note:
        A null date cell yields a null timestamp. A non-null value that does
        not match ``dd/mm/YYYY`` still raises ``ValueError`` inside the UDF,
        which surfaces when the DataFrame is evaluated.
    """
    def _to_cycle_timestamp(date_text):
        # Null cells must stay null: formatting None into the string would
        # produce "None 08:30:00" and make strptime raise for the whole job.
        if date_text is None:
            return None
        return dt.datetime.strptime(
            f"{date_text} {CYCLE_START_TIME_DEFAULT}", "%d/%m/%Y %H:%M:%S"
        )

    to_ts = udf(_to_cycle_timestamp, TimestampType())

    df = spark.read.csv(
        path=str(filepath),
        header=True,
        inferSchema=False,
        enforceSchema=False,
    )
    df = (
        df.withColumnsRenamed({"cycle": "name", "start_date": "started", "end_date": "ended"})
        .withColumn("started", to_ts(col("started")))
        .withColumn("ended", to_ts(col("ended")))
    )
    return df


In [None]:
%%time

# Drop the cycle table if it exists (IF EXISTS makes this a no-op otherwise).
# NOTE(review): PURGE is expected to delete the data immediately instead of
# moving it to a trash location — confirm for the catalog in use.
# NOTE(review): relies on the %sql magic expanding {CYCLE_TABLE_FQN} from the
# notebook namespace — confirm for the magic implementation in use.
%sql DROP TABLE IF EXISTS {CYCLE_TABLE_FQN} PURGE

In [None]:
# Create the target namespace if it is missing. With the constants defined in
# the configuration cell the LOCATION renders as 's3://warehouse/bronze'
# (WAREHOUSE_ROOT is 's3:/' and this template adds the second slash).
%sql CREATE SCHEMA IF NOT EXISTS {CATALOG}.{NAMESPACE} LOCATION '{WAREHOUSE_ROOT}/{CATALOG}/{NAMESPACE}'

In [None]:
# Create the cycle table if it is missing: a string `name` plus `started` and
# `ended` timestamps, partitioned by `name`. The trailing backslashes join the
# statement into a single %sql line, so no comments can go between those lines.
%sql CREATE TABLE IF NOT EXISTS {CYCLE_TABLE_FQN} (\
    name STRING,\
    started TIMESTAMP,\
    ended TIMESTAMP\
)\
PARTITIONED BY (name)
# Show the resulting table definition for a visual check.
%sql DESCRIBE EXTENDED {CYCLE_TABLE_FQN}

In [None]:
%%time

cycles = Path(f"{STAGING_DIR}/Cycles.csv")
df = read_cycle(cycles)
df.printSchema()
df.show()
df.write.saveAsTable(f"{CYCLE_TABLE_FQN}", mode="append")

print()
print(f"Read cycle metadata into '{CYCLE_TABLE_FQN}' table.")
print()
