In [1]:

from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip
import ConnectionConfigKaloyan as cc
# Project-local helper from ConnectionConfigKaloyan; called once, before any Spark session is created.
# NOTE(review): presumably prepares the environment Spark needs (paths / env vars) - confirm in the module.
cc.setupEnvironment()


In [2]:
# Start the Spark session used by every later cell, via the project-local helper.
# NOTE(review): "DIM_VEHICLE" is presumably the application name and 4 the parallelism
# (cores / partitions) - confirm against ConnectionConfigKaloyan.startLocalCluster.
spark = cc.startLocalCluster("DIM_VEHICLE",4)
# Last expression of the cell: its value is displayed as the cell output.
spark.getActiveSession()

In [3]:
# Load the source database tables into Spark data frames over JDBC.
cc.set_connectionProfile("default")


def read_jdbc_table(table, partition_column, num_partitions=4, lower_bound=0, upper_bound=20):
    """Read one database table over JDBC as a partitioned Spark data frame.

    Connection settings (driver, URL, user, password) are taken from the
    connection profile currently selected on ``cc``.

    Args:
        table: Name of the database table to read.
        partition_column: Numeric column Spark uses to split the read into
            parallel range queries.
        num_partitions: Maximum number of partitions / parallel JDBC connections.
        lower_bound: Lower end of the range used to compute the partition stride.
        upper_bound: Upper end of the range used to compute the partition stride.

    Returns:
        The data frame produced by ``spark.read...load()`` for ``table``.
    """
    # lowerBound / upperBound only decide how the partition_column range is cut
    # into strides; they do not filter rows. Rows outside [lower_bound, upper_bound]
    # are still read and land in the first / last partition, so widen the bounds
    # if the ids grow well beyond the defaults.
    return (
        spark.read
        .format("jdbc")
        .option("driver", cc.get_Property("driver"))
        .option("url", cc.create_jdbc())
        .option("dbtable", table)
        .option("user", cc.get_Property("username"))
        .option("password", cc.get_Property("password"))
        .option("partitionColumn", partition_column)
        .option("numPartitions", num_partitions)
        .option("lowerBound", lower_bound)
        .option("upperBound", upper_bound)
        .load()
    )


vehicles_df = read_jdbc_table("vehicles", "vehicleid")
bikelot_df = read_jdbc_table("bikelots", "bikelotid")
biketype_df = read_jdbc_table("bike_types", "biketypeid")


In [4]:
# Register each source data frame as a temporary view so it can be queried with Spark SQL.
for view_name, source_df in (
    ("dimVehicle", vehicles_df),
    ("dimBikelot", bikelot_df),
    ("dimBiketype", biketype_df),
):
    source_df.createOrReplaceTempView(view_name)


In [5]:
# Build the vehicle dimension: map every vehicle to a bike type id through its bike lot.
# NOTE(review): dimBiketype (bt) contributes no selected columns; as an inner join it only
# drops rows whose biketypeid has no match in bike_types. Confirm whether descriptive
# bike-type columns were meant to be part of this dimension.
vehicle_dimension_query = """
    SELECT 
        v.vehicleid,
        bl.biketypeid
    FROM 
        dimVehicle v
    JOIN 
        dimBikelot bl
    ON 
        v.bikelotid = bl.bikelotid
    JOIN 
        dimBiketype bt 
    ON
        bl.biketypeid = bt.biketypeid 
"""
vehicle_dimension_df = spark.sql(vehicle_dimension_query)

In [6]:
# Inspect the result: print the column schema, then preview the first five rows.
vehicle_dimension_df.printSchema()
vehicle_dimension_df.show(n=5)

root
 |-- vehicleid: integer (nullable = true)
 |-- biketypeid: integer (nullable = true)

+---------+----------+
|vehicleid|biketypeid|
+---------+----------+
|        1|         1|
|        2|         1|
|        3|         1|
|        4|         1|
|        5|         1|
+---------+----------+
only showing top 5 rows



In [7]:
# Persist the vehicle dimension as the Delta table "dimVehicle", replacing any existing contents.
# NOTE(review): a temporary view with the same name (dimVehicle) is registered earlier in this
# notebook; within this session an unqualified dimVehicle reference is expected to resolve to
# that temp view rather than to this table - confirm, and consider distinct names.
(
    vehicle_dimension_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("dimVehicle")
)

In [9]:
# Shut down the Spark session now that the dimension has been written.
spark.stop()