In [28]:
import os
from typing import Mapping, Optional, Sequence

from dotenv import load_dotenv
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, col, explode, split, to_timestamp

# Source: S3 (s3a) prefix under which the incoming status-display files are read.
INCOMING_ROOT = "s3a://landing-isis/statusdisplay/incoming"

# Destination: catalog and schema the tables are created in, followed by the
# names of the two tables this notebook writes.
CATALOG = "isis"
TARGET_DB = "cleaned"
SCHEDULE_TABLE = "running_schedule"
MAINTENANCE_TABLE = "maintenance_schedule"

# Load variables from a .env file into the process environment; the Spark
# session setup reads the S3 credentials from os.environ.
load_dotenv()

True

In [3]:
# Connect to the standalone Spark master and hand the S3A connector its
# credentials. The values are looked up in the environment at this point, so a
# missing variable raises KeyError before any session is created.
S3A_CREDENTIAL_ENV = {
    "spark.hadoop.fs.s3a.access.key": "S3_ACCESS_KEY",
    "spark.hadoop.fs.s3a.secret.key": "S3_ACCESS_SECRET",
}

session_builder = SparkSession.builder.master("spark://data-accelerator.isis.cclrc.ac.uk:7077")
for conf_key, env_name in S3A_CREDENTIAL_ENV.items():
    session_builder = session_builder.config(conf_key, os.environ[env_name])

# Reuses an already running session if there is one.
spark = session_builder.getOrCreate()
spark.active()

In [4]:
# Ensure the destination schema exists, then make it the current one so the
# table names used below can stay unqualified.
target_schema = f"{CATALOG}.{TARGET_DB}"
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {target_schema}")
spark.sql(f"USE {target_schema}")

In [34]:
# Create the running-schedule Iceberg table if it does not exist yet.
# The table name is unqualified, so it resolves against the catalog/schema
# selected by the preceding USE statement. Data is partitioned by the year of
# `started`. Column order matters: the later insertInto writes by position.
table_ddl = f"""
CREATE TABLE IF NOT EXISTS {SCHEDULE_TABLE} (
  cycle_name STRING,
  interval_label STRING,
  interval_type STRING,
  started TIMESTAMP, 
  ended TIMESTAMP
)
USING iceberg
PARTITIONED BY (year(started))
"""
spark.sql(table_ddl)

In [35]:
%%time

# Load in temporary table
loadtype = "full"
ingest_date = "2024/11/24"
df = spark.read.json(f"{INCOMING_ROOT}/running_schedule/{loadtype}/{ingest_date}/*.json") \
  .withColumn("start", to_timestamp("start")) \
  .withColumn("end", to_timestamp("end"))
df.createOrReplaceTempView("schedule")

CPU times: user 2.11 ms, sys: 2.15 ms, total: 4.26 ms
Wall time: 522 ms


In [36]:
# User cycles: rows whose type is "cycle". A cycle is its own interval, so its
# label is used both as the cycle name and as the interval label.
user_cycle_intervals = (
    df.where(col("type") == "cycle")
    .select(
        col("label").alias("cycle_name"),
        col("label").alias("interval_label"),
        col("type").alias("interval_type"),
        to_timestamp(col("start")).alias("started"),
        to_timestamp(col("end")).alias("ended"),
    )
)
user_cycle_intervals.show(truncate=False)

+----------+--------------+-------------+-------------------+-------------------+
|cycle_name|interval_label|interval_type|started            |ended              |
+----------+--------------+-------------+-------------------+-------------------+
|2019/1    |2019/1        |cycle        |2019-06-04 07:30:00|2019-07-19 07:30:00|
|2019/2    |2019/2        |cycle        |2019-09-10 07:30:00|2019-10-25 07:30:00|
|2019/3    |2019/3        |cycle        |2019-11-12 08:30:00|2019-12-20 08:30:00|
|2019/4    |2019/4        |cycle        |2020-02-11 08:30:00|2020-03-18 12:00:00|
|2020/2    |2020/2        |cycle        |2020-09-08 07:30:00|2020-10-23 07:30:00|
|2020/3    |2020/3        |cycle        |2020-11-10 08:30:00|2020-12-17 08:30:00|
|2021/1    |2021/1        |cycle        |2021-04-27 07:30:00|2021-06-17 07:30:00|
|2021/2    |2021/2        |cycle        |2022-03-01 08:30:00|2022-04-08 07:30:00|
|2022/1    |2022/1        |cycle        |2022-05-03 07:30:00|2022-06-02 07:30:00|
|2022/2    |2022

In [39]:
# Pre-cycle rows carry no cycle name of their own. Each one is attached to the
# user cycle that starts at the exact instant the pre-cycle interval ends.
# The join is an inner join, so a pre-cycle row without such a cycle is dropped.
precycle_rows = (
    df.where(col("type") == "pre-cycle")
    .select(
        col("label").alias("interval_label"),
        col("type").alias("interval_type"),
        to_timestamp(col("start")).alias("started"),
        to_timestamp(col("end")).alias("ended"),
    )
)
ends_where_cycle_starts = precycle_rows.ended == user_cycle_intervals.started
preuser_cycle_intervals = (
    precycle_rows.join(user_cycle_intervals, ends_where_cycle_starts)
    # Project back to the running-schedule column order.
    .select(
        user_cycle_intervals.cycle_name,
        precycle_rows.interval_label,
        precycle_rows.interval_type,
        precycle_rows.started,
        precycle_rows.ended,
    )
)
preuser_cycle_intervals.show(truncate=False)

+----------+--------------+-------------+-------------------+-------------------+
|cycle_name|interval_label|interval_type|started            |ended              |
+----------+--------------+-------------+-------------------+-------------------+
|2021/2    |Run-up        |pre-cycle    |2022-02-21 08:30:00|2022-03-01 08:30:00|
|2022/1    |Run-up        |pre-cycle    |2022-04-25 07:30:00|2022-05-03 07:30:00|
|2022/2    |Run-up        |pre-cycle    |2022-06-20 07:30:00|2022-06-28 07:30:00|
|2022/3    |Run-up        |pre-cycle    |2022-09-03 07:30:00|2022-09-20 07:30:00|
|2022/4    |Run-up        |pre-cycle    |2022-10-31 08:30:00|2022-11-08 08:30:00|
|2022/5    |Run-up        |pre-cycle    |2023-01-27 08:30:00|2023-02-07 08:30:00|
|2023/1    |Run-up        |pre-cycle    |2023-04-17 07:30:00|2023-04-25 07:30:00|
|2023/2    |Run-up        |pre-cycle    |2023-06-19 07:30:00|2023-06-27 07:30:00|
|2023/3    |Run-up        |pre-cycle    |2023-09-04 07:30:00|2023-09-19 07:30:00|
|2023/4    |Run-

In [40]:
# Post-cycle rows carry no cycle name of their own. Each one is attached to the
# user cycle that ends at the exact instant the post-cycle interval starts.
# The join is an inner join, so a post-cycle row without such a cycle is dropped.
postcycle_rows = (
    df.where(col("type") == "post-cycle")
    .select(
        col("label").alias("interval_label"),
        col("type").alias("interval_type"),
        to_timestamp(col("start")).alias("started"),
        to_timestamp(col("end")).alias("ended"),
    )
)
starts_where_cycle_ends = user_cycle_intervals.ended == postcycle_rows.started
postuser_cycle_intervals = (
    postcycle_rows.join(user_cycle_intervals, starts_where_cycle_ends)
    # Project back to the running-schedule column order.
    .select(
        user_cycle_intervals.cycle_name,
        postcycle_rows.interval_label,
        postcycle_rows.interval_type,
        postcycle_rows.started,
        postcycle_rows.ended,
    )
)
postuser_cycle_intervals.show(truncate=False)

+----------+---------------+-------------+-------------------+-------------------+
|cycle_name|interval_label |interval_type|started            |ended              |
+----------+---------------+-------------+-------------------+-------------------+
|2021/2    |Machine physics|post-cycle   |2022-04-08 07:30:00|2022-04-11 07:30:00|
|2022/1    |Machine physics|post-cycle   |2022-06-02 07:30:00|2022-06-05 07:30:00|
|2022/2    |Machine physics|post-cycle   |2022-07-29 07:30:00|2022-08-01 07:30:00|
|2022/3    |Machine physics|post-cycle   |2022-10-14 07:30:00|2022-10-23 07:30:00|
|2022/4    |Machine physics|post-cycle   |2022-12-16 08:30:00|2022-12-21 08:30:00|
|2022/5    |Machine physics|post-cycle   |2023-03-31 07:30:00|2023-04-03 07:30:00|
|2023/1    |Machine physics|post-cycle   |2023-05-26 07:30:00|2023-05-29 07:30:00|
|2023/2    |Machine physics|post-cycle   |2023-08-04 07:30:00|2023-08-07 07:30:00|
|2023/3    |Machine physics|post-cycle   |2023-10-20 07:30:00|2023-10-23 07:30:00|
|202

In [42]:
# Stitch the three interval kinds into one schedule, oldest interval first.
# unionByName matches columns by name, so a reordered select in one of the
# input frames cannot silently put values into the wrong column.
running_schedule_df = (
    user_cycle_intervals
    .unionByName(preuser_cycle_intervals)
    .unionByName(postuser_cycle_intervals)
    .orderBy(asc("started"))
)
running_schedule_df.show(n=100, truncate=False)

# insertInto writes by column position, so the frame keeps the table's column
# order (cycle_name, interval_label, interval_type, started, ended).
# A plain insertInto appends, which duplicated every row each time this cell
# was re-run. A "full" load is a complete snapshot, so it replaces the existing
# rows; any other load type keeps the original append behaviour.
running_schedule_df.write.insertInto(SCHEDULE_TABLE, overwrite=(loadtype == "full"))

+----------+---------------+-------------+-------------------+-------------------+
|cycle_name|interval_label |interval_type|started            |ended              |
+----------+---------------+-------------+-------------------+-------------------+
|2019/1    |2019/1         |cycle        |2019-06-04 07:30:00|2019-07-19 07:30:00|
|2019/2    |2019/2         |cycle        |2019-09-10 07:30:00|2019-10-25 07:30:00|
|2019/3    |2019/3         |cycle        |2019-11-12 08:30:00|2019-12-20 08:30:00|
|2019/4    |2019/4         |cycle        |2020-02-11 08:30:00|2020-03-18 12:00:00|
|2020/2    |2020/2         |cycle        |2020-09-08 07:30:00|2020-10-23 07:30:00|
|2020/3    |2020/3         |cycle        |2020-11-10 08:30:00|2020-12-17 08:30:00|
|2021/1    |2021/1         |cycle        |2021-04-27 07:30:00|2021-06-17 07:30:00|
|2021/2    |Run-up         |pre-cycle    |2022-02-21 08:30:00|2022-03-01 08:30:00|
|2021/2    |2021/2         |cycle        |2022-03-01 08:30:00|2022-04-08 07:30:00|
|202

In [None]:
# Create the maintenance-schedule Iceberg table if it does not exist yet.
# The table name is unqualified, so it resolves against the catalog/schema
# selected by the earlier USE statement. Data is partitioned by the year of
# `started`. Column order matters: the later insertInto writes by position.
table_ddl = f"""
CREATE TABLE IF NOT EXISTS {MAINTENANCE_TABLE} (
  label STRING,
  started TIMESTAMP, 
  ended TIMESTAMP
)
USING iceberg
PARTITIONED BY (year(started))
"""
spark.sql(table_ddl)

In [None]:
# One output row per entry of `maintenanceDays`: the array is exploded next to
# the row's label, then each entry is split on "/" into its two halves, which
# become the `started` and `ended` timestamps.
# NOTE(review): assumes every entry is a "<start>/<end>" string - confirm against the feed.
maintenance_windows = df.select("label", explode("maintenanceDays").alias("window"))
window_bounds = split(col("window"), '/')
maintenance_df = maintenance_windows.select(
    "label",
    to_timestamp(window_bounds[0]).alias("started"),
    to_timestamp(window_bounds[1]).alias("ended"),
)
maintenance_df.show(truncate=False)

In [None]:
# insertInto writes by column position; maintenance_df is already in the
# table's order (label, started, ended).
# A plain insertInto appends, so re-running this cell would duplicate every
# maintenance window. A "full" load is a complete snapshot, so it replaces the
# existing rows; any other load type keeps the original append behaviour.
maintenance_df.write.insertInto(MAINTENANCE_TABLE, overwrite=(loadtype == "full"))