In [0]:
%run ./01-config

In [0]:
# Set up the "Environment" widget and read the selected environment from it.
# Widget creation stays best-effort (the original ignored any failure here), but the
# failure is now reported instead of being silently discarded.
# NOTE(review): presumably creation can fail when the notebook runs without an
# interactive widget context — confirm which cases this is meant to cover.
try:
    dbutils.widgets.text("Environment", "dev", "Set the current environment")
except Exception as widget_error:
    print(f"⚠️ could not create 'Environment' widget: {widget_error}")

# Normalise the value so " Dev " and "DEV" are accepted as "dev".
ENV = dbutils.widgets.get("Environment").strip().lower()

# Only these two environments have a storage account mapping further down.
if ENV not in ("dev", "prod"):
    raise ValueError(
        f"env must be 'dev' or 'prod' (got {ENV!r} from the 'Environment' widget)"
    )

# Pick the storage account that belongs to the selected environment
# (an ENV outside dev/prod raises KeyError here).
STORAGE_ACCOUNT = dict(dev="sttfldevuks", prod="sttflproduks")[ENV]

# Base zone URLs: both zones sit in the tfl-<env> container of that account
data_zone_url = "abfss://tfl-{0}@{1}.dfs.core.windows.net/data-zone".format(ENV, STORAGE_ACCOUNT)
checkpoint_zone_url = "abfss://tfl-{0}@{1}.dfs.core.windows.net/checkpoint-zone".format(ENV, STORAGE_ACCOUNT)

# Paths derived from the base zones
raw_url = data_zone_url + "/raw"
test_data_url = data_zone_url + "/test_data"
checkpoint_url = checkpoint_zone_url + "/checkpoints"

# --- Bootstrap infrastructure ---
# Every statement uses IF NOT EXISTS, so re-running this section is safe.

# Catalog
spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")

# Schemas: landing, bronze, silver, gold (same order as before)
for layer_schema in (schema_landing, schema_bronze, schema_silver, schema_gold):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {layer_schema}")

# External volumes pointing at the raw and checkpoint locations
spark.sql(
    f"CREATE EXTERNAL VOLUME IF NOT EXISTS {raw_volume}  LOCATION '{raw_url}'"
)
spark.sql(
    f"CREATE EXTERNAL VOLUME IF NOT EXISTS {checkpoint_volume} LOCATION '{checkpoint_url}'"
)

# The test-data volume is created everywhere except prod
if ENV != "prod":
    spark.sql(
        f"CREATE EXTERNAL VOLUME IF NOT EXISTS {test_volume} LOCATION '{test_data_url}'"
    )

# Probe every expected raw folder by listing it. A folder that cannot be listed is
# reported with the underlying error, but does not stop the bootstrap.
for name, path in source_subdir.items():
    try:
        dbutils.fs.ls(path + "/")
    except Exception as err:
        print(f"⚠️ not accessible yet: {name} -> {path} | {err}")
    else:
        print(f"✅ ok: {name} -> {path}")

print("Bootstrap complete.")

In [0]:
# Teardown (destructive): DROP CATALOG ... CASCADE removes the catalog together with
# everything inside it. This cell follows the bootstrap cell, so running it
# unconditionally would undo the bootstrap on every "Run All". It is therefore opt-in.
# NOTE(review): the catalog name is hard-coded here; confirm it matches `catalog`
# from 01-config.
CONFIRM_DROP_CATALOG = False  # set to True by hand, run this cell, then set it back

if not CONFIRM_DROP_CATALOG:
    print("Skipping DROP CATALOG tfl_pipeline (CONFIRM_DROP_CATALOG is False).")
elif ENV == "prod":
    # Never tear down the catalog from this notebook while pointed at prod.
    raise RuntimeError("Refusing to drop catalog tfl_pipeline while Environment is 'prod'.")
else:
    spark.sql("DROP CATALOG IF EXISTS tfl_pipeline CASCADE")

In [0]:
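# NOTE: everything in this cell is commented-out legacy setup code and does not run.
# It refers to names such as silver_schema, gold_schema, schemas, landing_schema and
# landing_volumes, whereas the live bootstrap cell uses schema_silver, schema_gold, etc.
# Confirm those names against 01-config before re-enabling any of it.
#
# # Setup flag to track initialization state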
# initialized = False

# def create_catalog(catalog):
#     # Create catalog if it does not exist
#     print(f"Creating catalog {catalog}...", end="")
#     spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")
#     print("Done")


# def create_schemas(catalog, schemas):
#     # Create each schema in the provided list if it does not exist
#     for schema in schemas:
#         print(f"Creating schema {catalog}.{schema}...", end="")
#         spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
#         print("Done")


# def create_landing_volumes(catalog, schema, volumes):
#     # Create external volumes for landing data
#     for volume_name, volume_path in volumes.items():
#         print(f"Creating volume {catalog}.{schema}.{volume_name}...", end="")
#         spark.sql(f"CREATE EXTERNAL VOLUME IF NOT EXISTS {catalog}.{schema}.{volume_name} LOCATION '{volume_path}'")
#         print("Done")



# def create_line_status(catalog, schema=silver_schema):
#     # Create the line_status table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.line_status...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.line_status(
#                 line_id STRING,
#                 service_type STRING,
#                 severity_code BIGINT,
#                 severity_description STRING,
#                 disruption_category STRING,
#                 disruption_description STRING,
#                 disruption_from_date TIMESTAMP,
#                 disruption_to_date TIMESTAMP,
#                 is_service_disrupted BOOLEAN,
#                 event_timestamp TIMESTAMP
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")


# def create_bus_arrivals(catalog, schema=silver_schema):
#     # Create the bus_arrivals table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.bus_arrivals...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.bus_arrivals(
#                 arrival_id STRING,
#                 operation_type BIGINT,
#                 vehicle_id STRING,
#                 naptan_id STRING,
#                 station_name STRING,
#                 line_id STRING,
#                 platform_name STRING,
#                 direction STRING,
#                 bearing BIGINT,
#                 trip_id BIGINT,
#                 base_version BIGINT,
#                 destination_naptan_id STRING,
#                 destination_name STRING,
#                 event_timestamp TIMESTAMP,
#                 time_to_station BIGINT,
#                 current_location STRING,
#                 towards STRING,
#                 expected_arrival TIMESTAMP,
#                 time_to_live TIMESTAMP
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")



# def create_london_boroughs(catalog, schema=silver_schema):
#     # Create the london_boroughs table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.london_boroughs...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.london_boroughs(
#                 borough_code STRING,
#                 borough_name STRING,
#                 hectares DOUBLE,
#                 shape_area DOUBLE,
#                 shape_length DOUBLE,
#                 geometry_geojson STRING
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")



# def create_stop_points(catalog, schema=silver_schema):
#     # Create the stop_points table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.stop_points...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.stop_points(
#                 naptan_id STRING,
#                 indicator STRING,
#                 ics_code BIGINT,
#                 stop_type STRING,
#                 common_name STRING,
#                 longitude DOUBLE,
#                 latitude DOUBLE
#             )    
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")



# def create_bus_arrival_events(catalog, schema=silver_schema):
#     # Create the bus_arrival_events table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.bus_arrival_events...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.bus_arrival_events(
#                 arrival_event_id BIGINT, 
#                 line_id STRING,
#                 vehicle_id STRING,
#                 naptan_id STRING,
#                 station_name STRING,
#                 platform_name STRING,
#                 direction STRING,
#                 destination_name STRING,
#                 time_to_station BIGINT,
#                 expected_arrival TIMESTAMP,
#                 time_to_live TIMESTAMP,
#                 is_service_disrupted BOOLEAN,
#                 severity_code BIGINT, 
#                 severity_description STRING, 
#                 event_timestamp TIMESTAMP
#             )     
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")


# def create_bus_stops_geo(catalog, schema=silver_schema):
#     # Create the bus_stops_geo table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.bus_stops_geo...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.bus_stops_geo(
#                 naptan_id STRING,
#                 stop_name STRING,
#                 stop_type STRING,
#                 borough_code STRING,
#                 borough_name STRING,
#                 longitude DOUBLE,
#                 latitude DOUBLE
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")


# def create_line_disruption_geo(catalog, schema=silver_schema):
#     # Create the line_disruption_geo table if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.line_disruption_geo...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.line_disruption_geo(
#                 line_id STRING,
#                 service_type STRING,
#                 severity_code BIGINT,
#                 severity_description STRING,
#                 disruption_category STRING,
#                 disruption_description STRING,
#                 disruption_from_date TIMESTAMP,
#                 disruption_to_date TIMESTAMP,
#                 is_service_disrupted BOOLEAN,
#                 borough_code STRING,
#                 borough_name STRING,
#                 longitude DOUBLE,
#                 latitude DOUBLE,
#                 event_timestamp TIMESTAMP
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError("❌ Application database is not defined. Cannot create table in undefined database.")


# def create_bus_disruption_impact(catalog, schema=gold_schema):
#     # Create the disruption_impact table in the gold schema if initialized
#     if initialized:
#         print(f"Creating table {catalog}.{schema}.disruption_impact...", end="")
#         spark.sql(f"""
#             CREATE OR REPLACE TABLE {catalog}.{schema}.disruption_impact (
#                 line_id STRING,
#                 borough_name STRING,
#                 disruption_count BIGINT,
#                 max_severity_code BIGINT,
#                 max_severity_description STRING,
#                 disruption_start TIMESTAMP,
#                 disruption_end TIMESTAMP,
#                 service_disruption_events BIGINT
#             )
#         """)
#         print("Done")
#     else:
#         raise ReferenceError(
#             "❌ Application database is not defined. Cannot create table in undefined database."
#         )

# def setup_infrastructure(catalog):
#     # Set up the platform infrastructure: catalog, schemas, volumes, and tables
#     import time
#     global initialized

#     start = int(time.time())
#     print("Starting platform setup...")

#     create_catalog(catalog)
#     create_schemas(catalog, schemas)
#     create_landing_volumes(catalog=catalog,schema=landing_schema,volumes=landing_volumes)

#     initialized = True  # Set flag to allow table creation

#     # ---- Silver layer ----
#     create_line_status(catalog)
#     create_bus_arrivals(catalog)
#     create_london_boroughs(catalog)
#     create_stop_points(catalog)
#     create_bus_arrival_events(catalog)
#     create_bus_stops_geo(catalog)
#     create_line_disruption_geo(catalog)

#     print(f"Platform setup completed in {int(time.time()) - start} seconds")



# def assert_table(catalog, schema, table_name):
#     # Assert that a table exists in the given catalog and schema
#     exists = spark.sql(f"SHOW TABLES IN {catalog}.{schema}") \
#                 .filter(f"isTemporary = false AND tableName = '{table_name}'") \
#                 .count() == 1
#     assert exists, f"Table {catalog}.{schema}.{table_name} does not exist"
#     print(f"Found table {catalog}.{schema}.{table_name}: Success")



# def validate_setup(catalog):
#     # Validate that all required schemas and tables exist
#     import time
#     start = int(time.time())
#     print("\nStarting setup validation ...")

#     # ---- Silver ----
#     schema = silver_schema
#     schema_exists = spark.sql(f"SHOW SCHEMAS IN {catalog}") \
#         .filter(f"databaseName = '{schema}'") \
#         .count() == 1

#     assert schema_exists, f"The schema '{catalog}.{schema}' is missing"
#     print(f"Found schema {catalog}.{schema}: Success")

#     assert_table(catalog, schema, "line_status")
#     assert_table(catalog, schema, "bus_arrivals")
#     assert_table(catalog, schema, "london_boroughs")
#     assert_table(catalog, schema, "stop_points")
#     assert_table(catalog, schema, "bus_arrival_events")
#     assert_table(catalog, schema, "bus_stops_geo")
#     assert_table(catalog, schema, "line_disruption_geo")

#     print(f"Setup validation completed in {int(time.time()) - start} seconds")


# def cleanup(catalog, schema):
#     # Drop the specified schema and all its contents if it exists
#     try:
#         schema_exists = spark.sql(f"SHOW SCHEMAS IN {catalog}") \
#                     .filter(f"schemaName = '{schema}'") \
#                     .count()==1

#         if schema_exists:
#             print(f"Dropping schema {catalog}.{schema}...", end="")
#             spark.catalog.clearCache()
#             spark.sql(f"DROP SCHEMA {catalog}.{schema} CASCADE")
#             print("Done")
#         else:
#             print(f"Schema {catalog}.{schema} does not exist. Skipping drop.")

#     except Exception as e:
#         print(f"Error while dropping schema {catalog}.{schema}: {str(e)}")