# Unit Test SQL

In [1]:
import datetime
import pathlib

import delta
import pyspark
import pyspark.sql.functions as F
from delta import configure_spark_with_delta_pip
from pyspark.testing import assertDataFrameEqual

In [2]:
# Start from a named session builder and layer the settings on one at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")

# Enable Delta Lake's SQL extension for this session.
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)

# Memory given to the executor and driver JVMs.
builder = builder.config("spark.executor.memory", "10G")
builder = builder.config("spark.driver.memory", "25G")

# Use the Delta catalog implementation as the session catalog.
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Only two shuffle partitions: the example data in this notebook is a handful of rows.
builder = builder.config("spark.sql.shuffle.partitions", "2")

# configure_spark_with_delta_pip wraps the builder so the Delta packages are
# pulled in as dependencies before the session is created (or reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-350-delta-310/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6683af6d-8889-4bd1-9903-6e7af5490286;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 101ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default 

In [3]:
# Sample purchases used as the input table for the SQL examples below:
# one (item, amount, purchase_date) tuple per row.
df = spark.createDataFrame(
    data=[
        ("socks", 7.55, datetime.date.fromisoformat("2022-05-15")),
        ("handbag", 49.99, datetime.date.fromisoformat("2022-05-16")),
        ("shorts", 35.0, datetime.date.fromisoformat("2023-01-05")),
        ("socks", 25.0, datetime.date.fromisoformat("2023-12-23")),
    ],
    schema=["item", "amount", "purchase_date"],
)

In [4]:
# Print the four input rows as a table to confirm the data loaded as intended.
df.show()

                                                                                

+-------+------+-------------+
|   item|amount|purchase_date|
+-------+------+-------------+
|  socks|  7.55|   2022-05-15|
|handbag| 49.99|   2022-05-16|
| shorts|  35.0|   2023-01-05|
|  socks|  25.0|   2023-12-23|
+-------+------+-------------+



In [5]:
# Parameterized SQL template: {df} and {amount} are placeholders that are
# supplied as keyword arguments to spark.sql() instead of being hard-coded.
query = (
    "SELECT * from {df} "
    "where amount > {amount}"
)

In [6]:
# Bind the {df} and {amount} placeholders through keyword arguments and
# display the rows whose amount exceeds 30.0.
spark.sql(
    query,
    df=df,
    amount=30.0,
).show()

+-------+------+-------------+
|   item|amount|purchase_date|
+-------+------+-------------+
|handbag| 49.99|   2022-05-16|
| shorts|  35.0|   2023-01-05|
+-------+------+-------------+



In [7]:
# Register df as the temp view "my_table" so plain SQL strings can refer to it by name.
df.createOrReplaceTempView("my_table")

In [8]:
# Same filter as the parameterized query, but written against the "my_table"
# temp view with the threshold hard-coded into the SQL text. (A stray bare
# string literal duplicating this SQL used to sit here; it was a no-op
# expression statement and has been removed.)
spark.sql("SELECT * from my_table where amount > 30.0").show()

+-------+------+-------------+
|   item|amount|purchase_date|
+-------+------+-------------+
|handbag| 49.99|   2022-05-16|
| shorts|  35.0|   2023-01-05|
+-------+------+-------------+



In [15]:
# Hand-built expectation for the "amount > 30.0" filter: only the handbag and
# shorts purchases should survive, with the same column names as the input.
expected_df = spark.createDataFrame(
    data=[
        ("handbag", 49.99, datetime.date.fromisoformat("2022-05-16")),
        ("shorts", 35.0, datetime.date.fromisoformat("2023-01-05")),
    ],
    schema=["item", "amount", "purchase_date"],
)

In [16]:
# The unit test itself: run the parameterized query with a 30.0 threshold and
# compare the result to the hand-built expected rows. assertDataFrameEqual
# (imported in the notebook's import cell) raises if the two DataFrames differ;
# the cell producing no output means the test passed.
actual_df = spark.sql(query, df=df, amount=30.0)
assertDataFrameEqual(actual_df, expected_df)