In [None]:
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.Duration
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{functions => F}
import org.apache.spark.sql.{types => T}
import org.apache.spark.sql.expressions.Window
import java.sql.Timestamp
import spark.implicits._

In [None]:
// Show the tables currently registered in the catalog.
// Explicit method calls are used instead of the postfix form (`listTables show`), which needs
// `scala.language.postfixOps` and is rejected by newer Scala versions without it.
spark.catalog.listTables().show()

In [None]:
// Read the catalog table `bronze_houses`.
// `table(...)` resolves the name through the catalog, so the previous `format("jdbc")` call had
// no effect and only suggested, misleadingly, that this was a JDBC read.
val df3 = spark.read.table("bronze_houses")

In [None]:
// Action: triggers a Spark job and returns the number of rows in df3.
df3.count()

In [None]:
// Load the bronze layer from the catalog table `bronze_houses`.
// `table(...)` resolves the name through the catalog, so the previous `format("jdbc")` call had
// no effect and has been dropped.
val df_bronze = spark
    .read
    .table("bronze_houses")

In [None]:
// Flatten the file metadata into plain columns, add the validity-interval columns
// (`_start`, `_end`) as empty placeholders, and normalise the hyphenated key column names.
val df_bronze_1 = df_bronze
    .withColumn("_filename", $"_metadata.file_name")
    .withColumn("_modification_time", $"_metadata.file_modification_time")
    // Typed NULL placeholders. A NULL literal is used instead of casting "" because an empty
    // string is not a valid timestamp: the cast only yields NULL in non-ANSI mode and raises
    // an error when ANSI mode is enabled.
    .withColumn("_start", F.lit(null).cast(T.TimestampType))
    .withColumn("_end", F.lit(null).cast(T.TimestampType))
    .withColumnRenamed("data-pk", "data_pk")
    .withColumnRenamed("data-lk", "data_lk")
    // The struct is no longer needed once its two fields have been copied out.
    .drop("_metadata")

In [None]:
// rearrange the columns into a more intuitive order

In [None]:
// Column order for the select: the two key columns first, then every other column of
// df_bronze_1 in its existing order.
val column_order = {
    val keyColumns = Seq("data_pk", "data_lk")
    keyColumns ++ df_bronze_1.columns.filterNot(name => keyColumns.contains(name))
}

In [None]:
// Re-project df_bronze_1 into the column order computed in `column_order`.
// `F.col` is used because the notebook only imports `functions` under the alias `F`;
// the bare `col` relied on a wildcard import this notebook does not declare.
val df_bronze = df_bronze_1.select(column_order.map(F.col): _*)

In [None]:
// The silver table uses the schema of the prepared bronze frame.
val df_silver_schema: T.StructType = df_bronze.schema

In [None]:
// get silver table

In [None]:
// Load the current contents of the silver table, or start from an empty frame with the
// bronze schema when the table does not exist yet (first run).
val df_silver_before = {
    // Must be the same name the notebook later saves to with saveAsTable("silver_houses").
    // The previous code checked for "silver_house" and read "silver.houses", so the existing
    // table was never found and the empty branch was always taken.
    val silverTable = "silver_houses"
    if (spark.catalog.tableExists(silverTable)) {
        // No `.schema(...)` here: `table` does not accept a user-specified schema.
        // localCheckpoint() materialises the rows and cuts the lineage back to the table.
        // Without it, the later overwrite of this same table is rejected, because Spark
        // refuses to overwrite a table that the query being written still reads from.
        spark.table(silverTable).localCheckpoint()
    } else {
        spark.createDataFrame(spark.sparkContext.emptyRDD[Row], df_silver_schema)
    }
}

In [None]:
// identify new entries

In [None]:
// Keys present in bronze but not yet in silver.
// `except` has EXCEPT DISTINCT semantics, so the result is already de-duplicated and the
// former `.distinct()` call was redundant.
val new_data_1 = df_bronze
    .select("data_pk")
    .except(df_silver_before.select("data_pk"))
    .orderBy($"data_pk")

In [None]:
// Keep the bronze rows whose key appears in new_data_1 (semi join: only columns of the left
// side are returned), and initialise `_start` from the file modification time.
val new_data_2 = {
    val sameKey = F.col("a.data_pk") === F.col("b.data_pk")
    df_bronze.alias("a")
        .join(new_data_1.alias("b"), sameKey, "semi")
        .withColumn("_start", F.col("_modification_time"))
}

In [None]:
// Partition on every column except the bookkeeping columns and "data-pos".
// `F.col` is used because the notebook only imports `functions` under the alias `F`.
val new_data_3_cols = column_order
    .filterNot(name => Seq("_start", "_end", "_modification_time", "_filename", "data-pos").contains(name))
    .map(F.col)
val windowSpec_3 = Window.partitionBy(new_data_3_cols: _*).orderBy("_start")
// Within each group of rows that agree on all partition columns, keep only the row with the
// earliest `_start`. `_start` is not reassigned here: new_data_2 already sets it to
// `_modification_time`, so the former trailing withColumn was a no-op.
val new_data_3 = new_data_2
    .withColumn("_rn", F.row_number().over(windowSpec_3))
    .where($"_rn" === 1)
    .drop("_rn")

In [None]:
// Close each version's validity interval: per `data_pk`, ordered by `_start`, `_end` becomes
// the `_start` of the next row. The last row of each key has no successor and gets the
// open-ended sentinel 9999-12-31 instead.
// `F.col` / `F.lead` are qualified because the notebook only imports `functions` as `F`.
val new_data_4_cols = Seq("data_pk").map(F.col)
val windowSpec_4 = Window.partitionBy(new_data_4_cols: _*).orderBy("_start")
val new_data_4 = new_data_3
    .withColumn("_end", F.lead($"_start", 1).over(windowSpec_4))
    .withColumn("_end", F.coalesce($"_end", F.lit(Timestamp.valueOf("9999-12-31 00:00:00.000"))))
// Commented-out expression kept from the original notebook (not applied):
// expr("_start - INTERVAL '1 microsecond'"))

In [None]:
// Action: triggers a Spark job and returns the number of rows in new_data_4.
new_data_4.count()

In [None]:
// timestamp_micros(unix_micros(Timestamp.valueOf("1968-09-05 07:30:15")))

In [None]:
// Combine the existing silver rows with the newly built rows.
// Columns are matched by name rather than by position, so a stored table whose column order
// differs from new_data_4 cannot be silently misaligned.
val df_silver_before_1 = df_silver_before.unionByName(new_data_4)

In [None]:
// Persist the combined frame as the Parquet table `silver_houses`, replacing any previous
// contents of that table.
df_silver_before_1
    .write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("silver_houses")

In [None]:
// Show the catalog tables again after the write.
// Explicit method calls are used instead of the postfix form (`listTables show`), which needs
// `scala.language.postfixOps` and is rejected by newer Scala versions without it.
spark.catalog.listTables().show()