In [None]:
"""
This notebook connects to MS SQL Server over JDBC, so the mssql-jdbc JAR file is required.
The JAR is registered when the Spark session is created, so it must be present in this folder before the notebook is run.
JAR file can be downloaded from:
https://learn.microsoft.com/en-us/sql/connect/jdbc/download-microsoft-jdbc-driver-for-sql-server?view=sql-server-ver17

Because Spark connects to an external resource, it relies on Hadoop at a low level. Therefore the HADOOP_HOME environment variable and the Hadoop path
must be defined on the system. If Hadoop raises an error, the Hadoop .exe and .dll files can be downloaded from:
https://github.com/steveloughran/winutils/tree/master/hadoop-3.0.0/bin
"""

In [1]:
import os
from pyspark.sql import SparkSession

In [2]:
# Resolve the SQL Server JDBC driver JAR, given relative to the current
# working directory, to an absolute path.
jdbc_jar_name = "./mssql-jdbc-12.10.1.jre11.jar"
jar_path = os.path.abspath(jdbc_jar_name)

In [3]:
# Create (or reuse) the Spark session used by the rest of the notebook.
# - spark.jars registers the local SQL Server JDBC driver JAR.
# - spark.jars.packages requests the spark-avro package by its Maven coordinate.
# NOTE(review): the coordinate pins Scala 2.12 / Spark 3.2.3 — confirm that it
# matches the installed PySpark version.
spark = (
    SparkSession.builder
    .appName("ExportToParquet")
    .config("spark.jars", jar_path)
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.2.3")
    .getOrCreate()
)

In [None]:
# Load the sales.orders table from the local SQL Server instance over JDBC.
#
# Credentials are read from the environment instead of being hard-coded in the
# notebook, so they do not end up in version control or in shared copies:
#   MSSQL_USER      - login name (defaults to "sa")
#   MSSQL_PASSWORD  - login password (required)
jdbc_user = os.environ.get("MSSQL_USER", "sa")
jdbc_password = os.environ.get("MSSQL_PASSWORD")
if not jdbc_password:
    raise RuntimeError(
        "MSSQL_PASSWORD is not set; export it before running this cell."
    )

# NOTE(review): trustServerCertificate=true makes the driver skip validation of
# the server's TLS certificate — acceptable for a local instance, but confirm
# before pointing this at a remote server.
jdbc_url = (
    "jdbc:sqlserver://localhost:1433;"
    "databaseName=bikeStores;encrypt=true;trustServerCertificate=true"
)

df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .option("dbtable", "sales.orders")
    .option("user", jdbc_user)
    .option("password", jdbc_password)
    .load()
)

In [5]:
# Preview the loaded orders: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df.show(n=20, truncate=True)

+--------+-----------+------------+----------+-------------+------------+--------+--------+
|order_id|customer_id|order_status|order_date|required_date|shipped_date|store_id|staff_id|
+--------+-----------+------------+----------+-------------+------------+--------+--------+
|       1|        259|           4|2016-01-01|   2016-01-03|  2016-01-03|       1|       2|
|       2|       1212|           4|2016-01-01|   2016-01-04|  2016-01-03|       2|       6|
|       3|        523|           4|2016-01-02|   2016-01-05|  2016-01-03|       2|       7|
|       4|        175|           4|2016-01-03|   2016-01-04|  2016-01-05|       1|       3|
|       5|       1324|           4|2016-01-03|   2016-01-06|  2016-01-06|       2|       6|
|       6|         94|           4|2016-01-04|   2016-01-07|  2016-01-05|       2|       6|
|       7|        324|           4|2016-01-04|   2016-01-07|  2016-01-05|       2|       6|
|       8|       1204|           4|2016-01-04|   2016-01-05|  2016-01-05|       

In [6]:
# Export the orders DataFrame in Avro format to ./orders.avro, resolved
# against the current working directory. Mode "overwrite" replaces any output
# that already exists at that location.
output_file = os.path.abspath("./orders.avro")
(
    df.write
    .format("avro")
    .mode("overwrite")
    .save(output_file)
)