## Great Expectations

### Install necessary packages

In [67]:
!pip install delta-spark==3.2.0 great_expectations==1.3.3



### Import necessary packages

In [76]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import great_expectations as gx

### Initialize Spark session with Delta Lake support

In [78]:
# Build (or reuse) the Spark session for this notebook. The three config
# entries request the Delta package and register the Delta SQL extension
# and catalog implementation on the session.
spark = (
    SparkSession.builder
    .appName("SupplierDataQuality")
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

### Specify Schema and load data

In [79]:
# Explicit schema for the supplier CSV: an integer id followed by four
# nullable string columns.
schema = StructType(
    [StructField("supplier_id", IntegerType(), True)]
    + [
        StructField(field_name, StringType(), True)
        for field_name in ("supplier_name", "contact_email", "contact_phone", "address")
    ]
)

# Read the CSV with its header row, applying the schema above instead of
# letting Spark infer column types.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv("/home/jovyan/work/lab15/data/data.csv")
)
df.show(truncate=False)

+-----------+-------------+---------------------+-------------+------------+
|supplier_id|supplier_name|contact_email        |contact_phone|address     |
+-----------+-------------+---------------------+-------------+------------+
|1          |Supplier A   |supplierA@example.com|1234567890   |123 Main St |
|2          |NULL         |supplierB@example.com|NULL         |456 Oak St  |
|3          |Supplier C   |supplierCexample.com |9876543210   |789 Pine St |
|4          |Supplier D   |NULL                 |7896541230   |NULL        |
|5          |Supplier E   |supplierE@example.com|NULL         |101 Birch St|
+-----------+-------------+---------------------+-------------+------------+



### Work with Great Expectations

Оскільки приклад у лабораторній роботі був зроблений для старої версії пакета great_expectations та старої версії Python, було вирішено
виконати ЛР на найновішій версії пакета great_expectations, щоб не заморочуватися з міграцією на іншу версію Python та пакета. Заодно навчимося працювати з цим пакетом у сучасній манері.


In [80]:
# Get the Great Expectations data context
context = gx.get_context()

# Name of the data source
data_source_name = "suppliers_data"

# Register a Spark data source in the context
data_source = context.data_sources.add_spark(name=data_source_name)

# Name of the data asset
data_asset_name = "supplier_data_asset"

# Add a dataframe asset to the data source
data_asset = data_source.add_dataframe_asset(name=data_asset_name)

# Name of the batch definition
batch_definition_name = "supplier_batch_definition"

# Add a whole-dataframe batch definition to the data asset
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)

# Supply the loaded Spark dataframe as the batch parameter
batch_parameters = {"dataframe": df}

# Validation rules: supplier_name and address must not be null,
# contact_email must match an e-mail-like pattern
expectations = [
    gx.expectations.ExpectColumnValuesToNotBeNull(column="supplier_name"),
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="contact_email", regex=r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
    ),
    gx.expectations.ExpectColumnValuesToNotBeNull(column="address")
]

# Get the batch
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

# Run every expectation against the batch
validation_results = [batch.validate(expectation) for expectation in expectations]

# Flatten each validation result into one row of the data-quality summary
dq_data = []
for result in validation_results:
    expectation_config = result["expectation_config"]
    result_details = result["result"]

    status = "PASSED" if result["success"] else "FAILED"

    # unexpected_percent may be None instead of a number (presumably when
    # there are no rows to evaluate - confirm against the GX documentation).
    # Guard the arithmetic so the summary is still produced in that case.
    unexpected_percent = result_details["unexpected_percent"]
    if unexpected_percent is None:
        expected_percent = None
    else:
        unexpected_percent = float(unexpected_percent)
        expected_percent = 100.0 - unexpected_percent

    dq_data.append((
        expectation_config["kwargs"]["column"],
        status,
        expectation_config["type"],
        result_details["unexpected_count"],
        result_details["element_count"],
        unexpected_percent,
        expected_percent,
    ))

dq_columns = ["column", "status", "expectation_type", "unexpected_count", "element_count", "unexpected_percent", "percent"]

# Declare the column types explicitly (same types that inference yields for
# str/int/float values) so that createDataFrame does not fail when a column
# contains only None values. Names are backtick-quoted because `column` is
# also an SQL keyword.
dq_types = ["string", "string", "string", "bigint", "bigint", "double", "double"]
dq_schema = ", ".join(
    f"`{column_name}` {column_type}"
    for column_name, column_type in zip(dq_columns, dq_types)
)
dq_df = spark.createDataFrame(data=dq_data, schema=dq_schema)

dq_df.show(truncate=False)

INFO:great_expectations.data_context.types.base:Created temporary directory '/tmp/tmpwia7ptxn' for ephemeral docs site


Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+
|column       |status|expectation_type                   |unexpected_count|element_count|unexpected_percent|percent|
+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+
|supplier_name|FAILED|expect_column_values_to_not_be_null|1               |5            |20.0              |80.0   |
|contact_email|FAILED|expect_column_values_to_match_regex|1               |5            |25.0              |75.0   |
|address      |FAILED|expect_column_values_to_not_be_null|1               |5            |20.0              |80.0   |
+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+



### Write data quality information to Delta

In [65]:
# Persist the data-quality summary in Delta format; "overwrite" mode replaces
# whatever was previously stored at this path.
(
    dq_df.write
    .format("delta")
    .mode("overwrite")
    .save("/home/jovyan/work/lab15/processed_data")
)

### Check and show data from DeltaTable

In [66]:
# Read the Delta table back from the path it was written to and display it
# without truncating column values.
processed_df = (
    spark.read
    .format("delta")
    .load("/home/jovyan/work/lab15/processed_data")
)
processed_df.show(truncate=False)

+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+
|column       |status|expectation_type                   |unexpected_count|element_count|unexpected_percent|percent|
+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+
|supplier_name|FAILED|expect_column_values_to_not_be_null|1               |5            |20.0              |80.0   |
|contact_email|PASSED|expect_column_values_to_match_regex|0               |5            |0.0               |100.0  |
|address      |FAILED|expect_column_values_to_not_be_null|1               |5            |20.0              |80.0   |
+-------------+------+-----------------------------------+----------------+-------------+------------------+-------+



In [77]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()