In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Project-local helper module, aliased as `cc`. Later cells use it to select a
# connection profile, build the JDBC URL and read connection properties.
import ConnectionConfigKaloyan as cc
# NOTE(review): presumably prepares the local environment that Spark needs
# before a session is created — confirm in ConnectionConfigKaloyan.py.
cc.setupEnvironment()

### Config stuff

# Creating the operational database
In order to run this demo you have to create a tutorial_op database and run the PostgreSQL_SalesOperational.sql script.

### Connection properties
The connection config module (`ConnectionConfigKaloyan.py` in this notebook, imported as `cc`) is created and imported to simplify the database connection process.
Consult the file to get more insights.

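### Session setup
The necessary jars (here the PostgreSQL JDBC driver) must be available when running the Spark jobs. In this notebook they are pulled in through the `extra_packages` argument of `configure_spark_with_delta_pip`; setting "spark.driver.extraClassPath" to locally downloaded jars is the alternative.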

In [3]:
from delta import configure_spark_with_delta_pip

# Build a local Spark session (4 worker threads) with the Delta Lake SQL
# extension and catalog enabled.
builder = (
    SparkSession.builder
    .appName("DBConnectionTest")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .master("local[4]")
)

# Extra Maven packages that must be on the classpath. The PostgreSQL JDBC
# driver is required to query the JDBC source used in this notebook.
# (For a SQL Server source, use "com.microsoft.sqlserver:mssql-jdbc:12.2.0.jre8"
# instead of the PostgreSQL driver.)
extra_packages = [
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2",
    "org.postgresql:postgresql:42.7.4",
]

# Adds the Delta Lake package plus the extra packages to the session builder.
builder = configure_spark_with_delta_pip(builder, extra_packages=extra_packages)

# Create the session once; a second getOrCreate() would only return the same
# session again.
spark = builder.getOrCreate()

# NOTE: DEBUG is very verbose and floods the notebook output; lower it to
# "WARN" once the connection is known to work.
spark.sparkContext.setLogLevel("DEBUG")

### Reading a JDBC table
Read a table from a PostgreSQL database over a JDBC connection.

#### Using the ConnectionConfig (cc) to make things easy
cc can make a connection url based on a connection profile in config.ini. To do this, first set the name of the connection.

#### Partitioning
As Spark is built to work in parallel, reading from the database can also be done in parallel. In this case we define 4 partitions. Spark has to know how to split the data over the partitions, so you have to provide a partition column together with a lower and an upper bound. The bounds are only used to compute the partition stride; they do not filter rows, so values outside the bounds end up in the first or last partition. In this case one of the 4 queries that Spark fires looks roughly like "select * from locks where lockid >= 250 and lockid < 500" (the exact boundaries are computed by Spark).

In [4]:
# Select the connection profile and build the JDBC URL once, so the same URL
# is used for the log line and for the read.
cc.set_connectionProfile("veloDB")
jdbc_url = cc.create_jdbc()

# Print only the part before the query string: the query parameters of the
# URL carry the database user and password and must not end up in the
# notebook output.
print(jdbc_url.split("?")[0])

# Read the "locks" table in parallel over 4 partitions split on "lockid".
# lowerBound/upperBound only determine the partition stride; they do not
# filter rows.
velo_df = (
    spark.read
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", jdbc_url)
    .option("dbtable", "locks")
    # The JDBC credential option keys are "user" and "password"; the values
    # come from the connection profile and are never hardcoded here.
    .option("user", cc.get_Property("username"))
    .option("password", cc.get_Property("password"))
    .option("partitionColumn", "lockid")
    .option("numPartitions", 4)
    .option("lowerBound", 0)
    .option("upperBound", 1001)
    .load()
)
velo_df.show(1000)

jdbc:postgresql://localhost:5432/veloDB?user=postgres&password=********&ssl=false
+------+-------------+---------+---------+
|lockid|stationlocknr|stationid|vehicleid|
+------+-------------+---------+---------+
|     1|            1|        1|     NULL|
|     2|            2|        1|     NULL|
|     3|            3|        1|     NULL|
|     4|            4|        1|     NULL|
|     5|            5|        1|     NULL|
|     6|            6|        1|     NULL|
|     7|            7|        1|     NULL|
|     8|            8|        1|     NULL|
|     9|            9|        1|     NULL|
|    10|           10|        1|     NULL|
|    11|           11|        1|     NULL|
|    12|           12|        1|     NULL|
|    13|           13|        1|     NULL|
|    14|           14|        1|     NULL|
|    15|           15|        1|     NULL|
|    16|           16|        1|     NULL|
|    17|           17|        1|     NULL|
|    18|           18|        1|     1664|
|    19|    

In [None]:
# Stop the Spark session created earlier in this notebook once the demo is done.
spark.stop()