In [50]:
%env SPARK_VERSION=3.0.0 # TODO PUT YOUR VALUE

env: SPARK_VERSION=3.0.0 # TODO PUT YOUR VALUE


In [51]:
import getpass
import os

import pydeequ

from pyspark.sql import SparkSession, Row


# TODO create spark session with jdbc driver path
# Path to the Microsoft SQL Server JDBC driver jar that is put on the driver classpath.
jdbc_driver_path = "/home/jovyan/work/mssql-jdbc-12.2.0.jre8.jar"

# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("my-app-name")
    # Make the JDBC driver jar visible to the driver.
    .config("spark.driver.extraClassPath", jdbc_driver_path)
    # Pull in the Deequ package pydeequ points at, minus the excluded f2j artifact.
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)


In [54]:
# JDBC connection string for the AdventureWorks2012 database on the SQL Server
# instance reachable at host.docker.internal:1433 (encryption disabled).
url = "jdbc:sqlserver://host.docker.internal:1433;databaseName=AdventureWorks2012;encrypt=false"

table = "Production.Product"

# Credentials come from the environment so the password is never stored in the
# notebook. MSSQL_USER falls back to the previous default login; when
# MSSQL_PASSWORD is not set the password is asked for interactively (not echoed).
user = os.environ.get("MSSQL_USER", "NewLogin")
password = os.environ.get("MSSQL_PASSWORD") or getpass.getpass("SQL Server password: ")

# TODO connect to DB with Spark using JDBC connection to read the data
# Load the table as a Spark DataFrame through the JDBC data source.
df = (
    spark.read.format("jdbc")
    .option("url", url)
    .option("dbtable", table)
    .option("user", user)
    .option("password", password)
    .load()
)


In [55]:
### Data Analyzers section
# TODO analyze data here

# Import only the names this cell uses, so it is clear where each one comes
# from and nothing else in the notebook namespace gets overwritten.
from pydeequ.analyzers import (
    AnalysisRunner,
    AnalyzerContext,
    ApproxCountDistinct,
    Completeness,
    Mean,
    Size,
)

# Register every metric on one runner and compute them with a single run() call.
analysisResult = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())                        # size of the whole dataset
    .addAnalyzer(Completeness("Color"))         # completeness of the Color column
    .addAnalyzer(ApproxCountDistinct("Color"))  # approximate distinct count of Color
    .addAnalyzer(Size("Color=='Black'"))        # size restricted to Color == 'Black'
    .addAnalyzer(Mean("ReorderPoint"))          # mean of ReorderPoint
    .run()
)

# Turn the successfully computed metrics into a DataFrame for display.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
# truncate=False keeps long metric names (e.g. the filtered Size) fully readable.
analysisResult_df.show(truncate=False)

+-------+------------+--------------------+------------------+
| entity|    instance|                name|             value|
+-------+------------+--------------------+------------------+
|Dataset|           *|Size (where: Colo...|              93.0|
| Column|       Color|        Completeness|0.5079365079365079|
| Column|       Color| ApproxCountDistinct|               9.0|
|Dataset|           *|                Size|             504.0|
| Column|ReorderPoint|                Mean|401.36309523809524|
+-------+------------+--------------------+------------------+



In [56]:
### Data profiling section
# TODO profile data here

# Explicit import: only the profiler runner is needed in this cell.
from pydeequ.profiles import ColumnProfilerRunner

# Profile every column of the table.
result = ColumnProfilerRunner(spark).onData(df).run()

# Print each column's profile. Only the profile objects are needed, so iterate
# over the values and avoid leaving an unused `col` name in the notebook namespace.
for profile in result.profiles.values():
    print(profile)

StandardProfiles for column: ModifiedDate: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 2,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null
}
NumericProfiles for column: ReorderPoint: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 6,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "45",
            5,
            0.00992063492063492
        ],
        [
            "750",
            156,
            0.30952380952380953
        ],
        [
            "375",
            167,
            0.33134920634920634
        ],
        [
            "75",
            97,
            0.19246031746031747
        ],
        [
            "600",
            25,
            0.0496031746031746
        ],
        [
            "3",
            54,
            0.10714285714285714
        ]
    ],
    "kll": "None",
    "mean": 401.36309523809

In [57]:
### Constraint Suggestions section
# TODO find meaningful constraints here

# Explicit import: the suggestion runner and the DEFAULT rule set.
from pydeequ.suggestions import DEFAULT, ConstraintSuggestionRunner

# Run the default constraint-suggestion rules against the table.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(df)
    .addConstraintRule(DEFAULT())
    .run()
)

# Print every suggestion with the check code that would enforce it.
for sugg in suggestionResult['constraint_suggestions']:
    print(f"Constraint suggestion for '{sugg['column_name']}': {sugg['description']}")
    print(f"The corresponding Python code is: {sugg['code_for_constraint']}\n")

Constraint suggestion for 'ModifiedDate': 'ModifiedDate' is not null
The corresponding Python code is: .isComplete("ModifiedDate")

Constraint suggestion for 'ReorderPoint': 'ReorderPoint' has value range '375', '750', '75', '3', '600', '45'
The corresponding Python code is: .isContainedIn("ReorderPoint", ["375", "750", "75", "3", "600", "45"])

Constraint suggestion for 'ReorderPoint': 'ReorderPoint' is not null
The corresponding Python code is: .isComplete("ReorderPoint")

Constraint suggestion for 'ReorderPoint': 'ReorderPoint' has value range '375', '750', '75', '3' for at least 91.0% of values
The corresponding Python code is: .isContainedIn("ReorderPoint", ["375", "750", "75", "3"], lambda x: x >= 0.91, "It should be above 0.91!")

Constraint suggestion for 'ReorderPoint': 'ReorderPoint' has no negative values
The corresponding Python code is: .isNonNegative("ReorderPoint")

Constraint suggestion for 'ProductLine': 'ProductLine' has value range 'R ', 'M ', 'T ', 'S '
The correspo

In [67]:
### Constraint Verification section
# TODO check selected constraints here and beautify the report

# Explicit imports: only the names this cell uses.
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationResult, VerificationSuite

# One warning-level check that carries all the selected constraints.
check = Check(spark, CheckLevel.Warning, "Review Check")

checkResult = (
    VerificationSuite(spark)
    .onData(df)
    .addCheck(
        check.isUnique("rowguid")
        .isComplete("rowguid")
        .isUnique("ProductID")
        .isUnique("Name")
        .isComplete("Name")
        .isNonNegative("Weight")
        .isNonNegative("ListPrice")
        .isContainedIn("DaysToManufacture", ["0", "1", "2", "3", "4"])
    )
    .run()
)

# One row per constraint, with its status and message.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
# truncate=False keeps the long constraint descriptions fully readable.
checkResult_df.show(truncate=False)


+------------+-----------+------------+--------------------+-----------------+------------------+
|       check|check_level|check_status|          constraint|constraint_status|constraint_message|
+------------+-----------+------------+--------------------+-----------------+------------------+
+------------+-----------+------------+--------------------+-----------------+------------------+



In [79]:
import pandas as pd

# Background color for each status string; any other cell value is left unstyled.
STATUS_COLORS = {'Failure': 'red', 'Warning': 'yellow', 'Success': 'green'}


def status_css(value):
    """Return the CSS declaration for a single table cell.

    Cells equal to 'Failure', 'Warning' or 'Success' get a red, yellow or
    green background respectively; every other value returns '' (no styling).
    """
    color = STATUS_COLORS.get(value) if isinstance(value, str) else None
    return f'background-color : {color}' if color else ''


# Build a fresh pandas frame without the check_level column (no in-place
# mutation, so re-running the cell always starts from the Spark result).
pandasDF = checkResult_df.toPandas().drop(columns='check_level')

# Apply CSS styling to df
# Styler.applymap was renamed to Styler.map in pandas 2.1; use whichever exists.
styler = pandasDF.style
apply_cellwise = getattr(styler, 'map', None) or styler.applymap
formated_df = apply_cellwise(status_css)

display(formated_df)

# Convert the DataFrame into an HTML table and write it to a file
# Styler.render() was removed in pandas 2.0; to_html() replaces it (pandas >= 1.3).
# Fall back to render() only on older pandas versions that lack to_html().
to_html = getattr(formated_df, 'to_html', None) or formated_df.render
html_table = to_html()
with open("report.html", "w", encoding="utf-8") as f:
    f.write(html_table)

Unnamed: 0,check,check_status,constraint,constraint_status,constraint_message
0,Review Check,Success,"UniquenessConstraint(Uniqueness(List(rowguid),None))",Success,
1,Review Check,Success,"CompletenessConstraint(Completeness(rowguid,None))",Success,
2,Review Check,Success,"UniquenessConstraint(Uniqueness(List(ProductID),None))",Success,
3,Review Check,Success,"UniquenessConstraint(Uniqueness(List(Name),None))",Success,
4,Review Check,Success,"CompletenessConstraint(Completeness(Name,None))",Success,
5,Review Check,Success,"ComplianceConstraint(Compliance(Weight is non-negative,COALESCE(CAST(Weight AS DECIMAL(20,10)), 0.0) >= 0,None))",Success,
6,Review Check,Success,"ComplianceConstraint(Compliance(ListPrice is non-negative,COALESCE(CAST(ListPrice AS DECIMAL(20,10)), 0.0) >= 0,None))",Success,
7,Review Check,Success,"ComplianceConstraint(Compliance(DaysToManufacture contained in 0,1,2,3,4,`DaysToManufacture` IS NULL OR `DaysToManufacture` IN ('0','1','2','3','4'),None))",Success,
