# Quinn Demo

In [2]:
import delta
import pyspark
import pyspark.sql.functions as F
import quinn
from pyspark.sql.types import *

In [3]:
builder = (
    pyspark.sql.SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

In [4]:
spark = delta.configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-everything/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1a4a58c6-982d-4c9b-910b-7d25c11e5565;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 276ms :: artifacts dl 10ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     | 

23/02/11 13:50:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## validate_presence_of_columns

In [4]:
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")

In [5]:
df.show()

                                                                                

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



In [6]:
quinn.validate_presence_of_columns(df, ["first_name", "age"])

In [7]:
quinn.validate_presence_of_columns(df, ["first_name", "age", "country"])

DataFrameMissingColumnError: The ['country'] columns are not included in the DataFrame with the following columns ['first_name', 'age']

## validate_schema

In [4]:
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")

In [5]:
df.show()

                                                                                

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



In [6]:
df.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- age: long (nullable = true)



In [7]:
matching_schema = StructType(
    [
        StructField("first_name", StringType(), True),
        StructField("age", LongType(), True),
    ]
)

In [8]:
quinn.validate_schema(df, matching_schema)

In [12]:
mismatched_schema = StructType(
    [
        StructField("some_whatever", IntegerType(), True),
        StructField("age", LongType(), True),
    ]
)

In [13]:
quinn.validate_schema(df, mismatched_schema)

DataFrameMissingStructFieldError: The [StructField('some_whatever', IntegerType(), True)] StructFields are not included in the DataFrame with the following StructFields StructType([StructField('first_name', StringType(), True), StructField('age', LongType(), True)])

## validate_absence_of_columns

In [4]:
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")

In [5]:
df.show()

                                                                                

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



In [6]:
quinn.validate_absence_of_columns(df, ["favorite_color"])

In [7]:
quinn.validate_absence_of_columns(df, ["age"])

DataFrameProhibitedColumnError: The ['age'] columns are not allowed to be included in the DataFrame with the following columns ['first_name', 'age']

## Schema safe appends - need to make quinn better to make this flow!

In [24]:
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")

In [25]:
df.show()

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



In [26]:
df.write.format("parquet").save("tmp/parquet1")

In [27]:
spark.read.format("parquet").load("tmp/parquet1").show()

+----------+---+
|first_name|age|
+----------+---+
|       sue|  5|
|       bob|  3|
+----------+---+



In [28]:
bad_append_df = spark.createDataFrame([("usa", 99), ("china", 66)]).toDF(
    "country", "age"
)

In [29]:
bad_append_df.schema

StructType([StructField('country', StringType(), True), StructField('age', LongType(), True)])

In [30]:
if quinn.validate_schema(df, bad_append_df.schema):
    bad_append_df.write.format("parquet").mode("append").save("tmp/parquet1")

SyntaxError: unexpected EOF while parsing (664740600.py, line 3)

In [31]:
good_append_df = spark.createDataFrame([("donald", 77), ("sergio", 44)]).toDF(
    "first_name", "age"
)

In [32]:
good_append_df.show()

+----------+---+
|first_name|age|
+----------+---+
|    donald| 77|
|    sergio| 44|
+----------+---+



In [22]:
if quinn.validate_schema(df, good_append_df.schema):
    good_append_df.write.format("parquet").mode("append").save("tmp/parquet1")

In [23]:
spark.read.format("parquet").load("tmp/parquet1").show()

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



## create_df

In [13]:
from quinn.extensions import create_df

In [14]:
df = spark.create_df(
    [("jose", "a"), ("li", "b"), ("sam", "c")],
    [("name", StringType(), False), ("blah", StringType(), True)],
)

In [15]:
df.show()

+----+----+
|name|blah|
+----+----+
|jose|   a|
|  li|   b|
| sam|   c|
+----+----+



In [16]:
df.printSchema()

root
 |-- name: string (nullable = false)
 |-- blah: string (nullable = true)



In [19]:
df = spark.createDataFrame([("jose", "a"), ("li", "b"), ("sam", "c")]).toDF(
    "name", "blah"
)

In [20]:
df.show()

+----+----+
|name|blah|
+----+----+
|jose|   a|
|  li|   b|
| sam|   c|
+----+----+



In [21]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- blah: string (nullable = true)



## create-df2: Manually specifying schemas can be tedious

In [29]:
from pyspark.sql.types import StringType

df = spark.createDataFrame(
    data=[("jose", "a"), ("li", "b"), ("sam", "c")],
    schema=StructType(
        [
            StructField("name", StringType(), False),
            StructField("blah", StringType(), True),
        ]
    ),
)

In [30]:
df.show()

+----+----+
|name|blah|
+----+----+
|jose|   a|
|  li|   b|
| sam|   c|
+----+----+



In [31]:
df.printSchema()

root
 |-- name: string (nullable = false)
 |-- blah: string (nullable = true)



In [32]:
from quinn.extensions import create_df

df = spark.create_df(
    [("jose", "a"), ("li", "b"), ("sam", "c")],
    [("name", StringType(), False), ("blah", StringType(), True)],
)

In [33]:
df.show()

+----+----+
|name|blah|
+----+----+
|jose|   a|
|  li|   b|
| sam|   c|
+----+----+



In [34]:
df.printSchema()

root
 |-- name: string (nullable = false)
 |-- blah: string (nullable = true)



## single_space

In [50]:
df = spark.createDataFrame(
    [("This  is a   thing.",), ("More weird    spacing.",)]
).toDF("words")

In [51]:
df.show(truncate=False)

+----------------------+
|words                 |
+----------------------+
|This  is a   thing.   |
|More weird    spacing.|
+----------------------+



In [52]:
df.withColumn("words_clean", quinn.single_space(F.col("words"))).show(truncate=False)

+----------------------+-------------------+
|words                 |words_clean        |
+----------------------+-------------------+
|This  is a   thing.   |This is a thing.   |
|More weird    spacing.|More weird spacing.|
+----------------------+-------------------+



## remove_all_whitespace

## anti_trim

## remove_non_word_characters

In [55]:
df = spark.createDataFrame([("si%$#@!#$!@#mpsons",), ("I|lIke|CAts",)]).toDF("words")

In [57]:
df.show(truncate=False)

+------------------+
|words             |
+------------------+
|si%$#@!#$!@#mpsons|
|I|lIke|CAts       |
+------------------+



In [56]:
df.withColumn("words_clean", quinn.remove_non_word_characters(F.col("words"))).show(
    truncate=False
)

+------------------+-----------+
|words             |words_clean|
+------------------+-----------+
|si%$#@!#$!@#mpsons|simpsons   |
|I|lIke|CAts       |IlIkeCAts  |
+------------------+-----------+



## column_to_list

In [58]:
df = spark.createDataFrame([("bob", 3), ("sue", 5)]).toDF("first_name", "age")

In [60]:
df.show()

+----------+---+
|first_name|age|
+----------+---+
|       bob|  3|
|       sue|  5|
+----------+---+



In [61]:
quinn.column_to_list(df, "age")

[3, 5]

## two_columns_to_dictionary

In [65]:
df = spark.createDataFrame([(34, "crazydude"), (99, "firelover")]).toDF(
    "id", "username"
)

In [66]:
df.show()

+---+---------+
| id| username|
+---+---------+
| 34|crazydude|
| 99|firelover|
+---+---------+



In [68]:
quinn.two_columns_to_dictionary(df, "id", "username")

{34: 'crazydude', 99: 'firelover'}

## to_list_of_dictionaries

## show_output_to_df

In [80]:
df_str = """+---+---------+
| id| username|
+---+---------+
| 34|crazydude|
| 99|firelover|
+---+---------+"""

In [81]:
df = quinn.show_output_to_df(df_str, spark)

In [82]:
df.show()

+---+---------+
| id| username|
+---+---------+
| 34|crazydude|
| 99|firelover|
+---+---------+

