In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfigKaloyan as cc
# Run the project-local environment setup before any Spark work is done.
# NOTE(review): what setupEnvironment() actually configures is not visible in
# this notebook — confirm in the ConnectionConfigKaloyan module.
cc.setupEnvironment()


In [2]:
# Start a Spark session through the project helper and bind it to `spark`,
# which every later cell relies on.
# NOTE(review): the arguments ("DIM_LOCK", 1) are presumably the application
# name and a partition/core count — confirm against startLocalCluster.
spark = cc.startLocalCluster("DIM_LOCK",1)
# Last expression of the cell, so its value is rendered as the cell output.
spark.getActiveSession()

In [3]:
# Load the source database tables into Spark DataFrames.
cc.set_connectionProfile("default")


def read_jdbc_table(table, partition_column, lower_bound=0, upper_bound=20, num_partitions=1):
    """Read one database table over JDBC into a Spark DataFrame.

    Connection details (driver, URL, username, password) are taken from the
    connection profile currently selected on ``cc``.

    Args:
        table: Name of the table to read; passed as the ``dbtable`` option.
        partition_column: Column Spark uses to split the read into partitions.
        lower_bound: Value for the ``lowerBound`` option.
        upper_bound: Value for the ``upperBound`` option.
        num_partitions: Value for the ``numPartitions`` option.

    Returns:
        The DataFrame produced by ``spark.read...load()`` for ``table``.
    """
    return (
        spark.read.format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        # Credentials must be passed under the option keys "user" and
        # "password"; those are the names Spark's JDBC source hands on to the
        # driver. Only the profile lookups appear here, never literal values.
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        # Per the Spark JDBC documentation, the bounds only determine the
        # partition stride; they do not filter rows out of the table.
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


stations_df = read_jdbc_table("stations", partition_column="stationid")
locks_df = read_jdbc_table("locks", partition_column="lockid")


In [4]:
# Expose both DataFrames to Spark SQL under the view names used by the join query.
for view_name, source_df in (("dimStations", stations_df), ("dimLocks", locks_df)):
    source_df.createOrReplaceTempView(view_name)


In [5]:
# Build the lock dimension: one row per lock, carrying the attributes of the
# station the lock belongs to (matched on stationid).
lock_dimension_query = """
    SELECT 
        l.lockid,                     
        s.stationid,
        s.stationnr,
        s.street,
        s.number,
        s.zipcode,
        s.district,
        s.gpscoord
    FROM 
        dimLocks l
    JOIN 
        dimStations s
    ON 
        l.stationid = s.stationid
"""
lock_dimension_df = spark.sql(lock_dimension_query)

In [6]:
#displaying the dimension
# Inspect the result: column names/types first, then a five-row preview.
lock_dimension_df.printSchema()
lock_dimension_df.show(n=5)

root
 |-- lockid: integer (nullable = true)
 |-- stationid: integer (nullable = true)
 |-- stationnr: string (nullable = true)
 |-- street: string (nullable = true)
 |-- number: string (nullable = true)
 |-- zipcode: string (nullable = true)
 |-- district: string (nullable = true)
 |-- gpscoord: string (nullable = true)

+------+---------+---------+-----------+------+-------+---------+-----------------+
|lockid|stationid|stationnr|     street|number|zipcode| district|         gpscoord|
+------+---------+---------+-----------+------+-------+---------+-----------------+
|     1|        1|      026|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     2|        1|      026|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     3|        1|      026|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     4|        1|      026|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
|     5|        1|      026|Meir (2000)|    84|   2000|ANTWERPEN|(51.2182,4.41241)|
+----

In [6]:
# Persist the lock dimension as a table, replacing any previous version.
#
# Delta variant (currently disabled):
# lock_dimension_df.write.format("delta").mode("overwrite").saveAsTable("dimLock")

# Parquet variant: repartition(1) brings the data into a single partition
# before it is written.
(
    lock_dimension_df
    .repartition(1)
    .write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("dimLock_pq")
)

In [7]:
# Stop the Spark session now that the dimension has been written. Any cell
# that uses `spark` needs the session-creating cell re-run after this.
spark.stop()