In [0]:
# pip install pyyaml


In [0]:
# dbutils.library.restartPython() 

In [0]:
spark.conf.set("fs.s3a.access.key", "")
spark.conf.set("fs.s3a.secret.key", "")
spark.conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

In [0]:
import yaml
import time
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType

class Validator:
    def __init__(self, df, config_path):
        self.df = df.withColumn("validation_flag", F.lit(""))
        
        # Load config (Updated for Databricks)
        config_content = (
            spark.read.text(config_path).collect()
        )  # Read file as text and collect it as a list
        config_str = "\n".join([row[0] for row in config_content])  # Convert to string
        self.config = yaml.safe_load(config_str)  # Load YAML from string

        # Define expected data types
        dtype_map = {
            "string": StringType(),
            "double": DoubleType()
        }
        self.expected_dtypes = {
            col: dtype_map[self.config["validation"]["expected_dtypes"][col]]
            for col in self.config["validation"].get("expected_dtypes", {})
            if self.config["validation"]["expected_dtypes"][col] in dtype_map
        }

    def add_flag(self, condition, issue):
        """Appends an issue to the validation_flag column."""
        self.df = self.df.withColumn(
            "validation_flag",
            F.when(condition, F.concat_ws(", ", F.col("validation_flag"), F.lit(issue)))
            .otherwise(F.col("validation_flag"))
        )

    def check_missing_values(self):
        """Checks for missing values in required columns."""
        for col in self.config["validation"].get("required_columns", []):
            self.add_flag(F.col(col).isNull(), f"{col}_missing")

    def check_data_types(self):
        """Validates column data types."""
        for col, expected_type in self.expected_dtypes.items():
            if col in self.df.columns:
                self.add_flag(
                    F.col(col).cast(expected_type) != F.col(col), f"{col}_dtype_mismatch"
                )

    def fix_categorical(self):
        """Replaces invalid categorical values with a default value."""
        for col, settings in self.config["validation"].get("categorical", {}).items():
            valid_values = settings["valid_values"]
            default = settings["default"]
            valid_values_expr = F.when(F.col(col).isin(valid_values), F.col(col)).otherwise(default)
            self.df = self.df.withColumn(col, valid_values_expr)

    def fix_regex(self):
        """Validates a column against a regex pattern and replaces invalid values."""
        for col, settings in self.config["validation"].get("regex", {}).items():
            pattern = settings["pattern"]
            replacement = settings["replacement"]
            mask = F.col(col).rlike(pattern)
            self.df = self.df.withColumn(col, F.when(mask, F.col(col)).otherwise(replacement))

    def run_validations(self):
        """Executes all validation steps."""
        self.check_missing_values()
        self.check_data_types()
        self.fix_categorical()
        self.fix_regex()
        return self.df

# Usage Example:
start_time = time.time()
config_path = "dbfs:/FileStore/tables/validation_configurations.yaml"  # Path to the YAML file
input_data = spark.sql("SELECT * FROM global_temp.input_data_view")
validator = Validator(input_data, config_path)
validated_data = validator.run_validations()
validated_data.count()
print(validated_data.count())
validated_data.createOrReplaceGlobalTempView("validated_data_view")
end_time = time.time()

# Calculate execution time
execution_time = end_time - start_time
print(f"Execution Time for Validating data: {execution_time:.6f} seconds")


100000
Execution Time for Validating data: 8.654894 seconds


In [0]:
validated_data.display()

transaction_id,transaction_parent_id,transaction_timestamp,transaction_type,order_status,order_type,mic_code,exchange_code,side,symbol,isin,price,quantity,adv30,trader_id,broker_id,currency_name,_rescued_data,transaction_date,validation_flag
trx_2025-03-18_50001,trx_2025-03-18_50001,2025-03-18 12:19:54.214092,Executions,Filled,IceBerg,XNAS,NASDAQ,Sell,BRZE,US5007541060,36.53939177924437,105367,119029333.33333331,T43736,B3,Euro,,2025-03-18,
trx_2025-03-18_50002,trx_2025-03-18_50002,2025-03-18 12:19:54.267365,Orders,Amend,Stop,XNAS,NASDAQ,Sell,KELYA,US5007541060,13.127679700816522,43042,35841333.33333333,T36087,B36,Euro,,2025-03-18,
trx_2025-03-18_50003,trx_2025-03-18_50003,2025-03-18 12:19:54.504216,Executions,Filled,Market,XASE,NASDAQ,Buy,IZM,US5007541060,1.5880707842925796,9810,4662666.666666666,T1475,B35,Singapore Dollar,,2025-03-18,
trx_2025-03-18_50004,trx_2025-03-18_50004,2025-03-18 12:19:54.544834,Orders,Cancelled,IceBerg,BATS,BATS,Sell,FPRO,US5007541060,22.77994991448326,1007,315333.3333333334,T22148,B94,Dollar,,2025-03-18,
trx_2025-03-18_50005,trx_2025-03-18_50005,2025-03-18 12:19:54.709943,Orders,New,Limit,XNAS,NASDAQ,Buy,POCI,US5007541060,,3149,1359666.6666666665,,B18,Dollar,,2025-03-18,", price_missing, trader_id_missing"
trx_2025-03-18_50006,trx_2025-03-18_50006,2025-03-18 12:19:54.822813,Executions,Filled,Limit,ARCX,NYSE ARCA,Sell,INCM,US5007541060,26.174078217631276,34676,15086333.333333334,T28445,B15,Dollar,,2025-03-18,
trx_2025-03-18_50007,trx_2025-03-18_50007,2025-03-18 12:19:54.913399,Orders,Cancelled,Limit,ARCX,NYSE ARCA,Sell,AVSU,US5007541060,61.74979835510254,1740,1808333.3333333333,T47150,B48,Singapore Dollar,,2025-03-18,
trx_2025-03-18_50008,trx_2025-03-18_50008,2025-03-18 12:19:55.006516,Orders,Cancelled,IceBerg,XNYS,NYSE,Buy,CURV,US5007541060,5.514525935403613,40655,29647000.0,T8983,B55,Singapore Dollar,,2025-03-18,
trx_2025-03-18_50009,trx_2025-03-18_50009,2025-03-18 12:19:55.227062,Orders,Amend,Market,XNAS,NASDAQ,Sell,KELYA,US5007541060,,86313,35841333.33333333,T18365,B19,Dollar,,2025-03-18,", price_missing"
trx_2025-03-18_50010,trx_2025-03-18_50010,2025-03-18 12:19:55.359510,Executions,Partially-Filled,Limit,XASE,NASDAQ,Buy,SNY,US5007541060,57.81859408999971,987001,257758666.66666663,T41309,B17,Singapore Dollar,,2025-03-18,
