In [0]:
import dlt
from pyspark.sql.functions import col, lit, count
from pyspark.sql.types import IntegerType, FloatType, StringType, DateType

#FinalTable
@dlt.table(
    name="FinancialTransactions_Checks",
    comment="Final table with data quality checks."
)
@dlt.expect("valid_client_number", "ClientNumber IS NOT NULL AND ClientNumber > 0")
@dlt.expect("valid_original_amount", "OriginalAmount IS NOT NULL AND OriginalAmount >= 0")
@dlt.expect("valid_onboarding_date", "OnboardingDate IS NOT NULL")
@dlt.expect("valid_amount_eur", "AmountEUR IS NOT NULL AND AmountEUR >= 0")
@dlt.expect("check_client_number_type", "CAST(ClientNumber AS INTEGER) IS NOT NULL")
@dlt.expect("check_amount_eur_type", "CAST(AmountEUR AS FLOAT) IS NOT NULL")
@dlt.expect("enterprise_size_range", "EnterpriseSize IN ('S', 'M', 'L')")
@dlt.expect("secured_amount_range", "SecuredAmountEUR >= 0 AND SecuredAmountEUR <= 500000")
def read_final_table():
    return spark.read.table("hive_metastore.default.financialtransactions")

# Source1
@dlt.table(
    name="Source_1_Checks",
    comment="Final table with data quality checks."
)
@dlt.expect("non_null_DataSource", "DataSource IS NOT NULL")
@dlt.expect("non_null_ClientGroup", "ClientGroup IS NOT NULL")
@dlt.expect("non_null_ClientNumber", "ClientNumber IS NOT NULL")
@dlt.expect("valid_ClientAmount", "ClientAmount >= 0")
@dlt.expect("non_null_Currency", "Currency IS NOT NULL")
@dlt.expect("valid_NumberOfEmployees", "NumberOfEmployees >= 0")
@dlt.expect("non_null_Location", "Location IS NOT NULL")
@dlt.expect("valid_ClientSince_format", "ClientSince RLIKE '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'")
@dlt.expect("non_null_EligibleForDiscount", "EligibleForDiscount IS NOT NULL")
@dlt.expect("valid_SnapshotDate_format", "SnapshotDate RLIKE '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'")
def final_table():
    df = spark.read.table("source_1")
    return df

# Source2
@dlt.table(
    name="Source_2_Checks",
    comment="Validated data from the source."
)
@dlt.expect("valid_clientsince", "ClientSince RLIKE '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'")
@dlt.expect("valid_snapshotdate", "SnapshotDate RLIKE '^[0-9]{4}-[0-9]{2}-[0-9]{2}$'")
@dlt.expect("non_null_clientsince", "ClientSince IS NOT NULL")
@dlt.expect("non_null_snapshotdate", "SnapshotDate IS NOT NULL")
@dlt.expect("valid_hasloan", "HasLoan IN ('Yes', 'No')")
@dlt.expect("positive_clientamount", "ClientAmount > 0")
@dlt.expect("non_null_clientsystem", "SourceSystem IS NOT NULL")
@dlt.expect("positive_numberofemployees", "NumberOfEmployees > 0")
@dlt.expect("non_negative_clientnumber", "ClientNumber IS NOT NULL")
def source_data_validated():
    return dlt.read("source_2")

#Exchenge_Rates
@dlt.table(
    name="Exchange_Rates_Checks",
    comment="Exchange_Rates Table with data quality checks."
)
@dlt.expect_or_fail("non_null_currency", "Currency IS NOT NULL")
@dlt.expect_or_fail("valid_exchange_rate", "ExchangeRate > 0")
@dlt.expect_or_fail("non_null_snapshot_date", "SnapshotDate IS NOT NULL")
def exchange_rates_validated():
    return spark.read.table("hive_metastore.default.exchange_rates") # check the correct name

#Client_Secured
@dlt.table(
    name="Client_Secured_Ind_checks",
    comment="Table with data quality checks for Client_Secured_Ind."
)
@dlt.expect("valid_client_secured_ind", "ClientSecuredInd IN ('Y', 'N')")
@dlt.expect("non_null_client_number", "ClientNumber IS NOT NULL")
def client_secured_ind_validated():
    return spark.read.table("hive_metastore.default.client_secured_ind")

@dlt.view(
    name="Client_Secured_Ind_with_duplicates",
    comment="View to check for duplicates in Client_Secured_Ind."
)
def client_secured_ind_with_duplicates():
    df = dlt.read("Client_Secured_Ind_checks")
    duplicate_count = df.groupBy("ClientNumber").agg(count("*").alias("count"))
    duplicates = duplicate_count.filter(col("count") > 1)
    return df.join(duplicates, "ClientNumber", "left_outer").withColumn("is_duplicate", col("count").isNotNull())

@dlt.table(
    name="Final_Client_Secured_Ind",
    comment="Final table for Client_Secured_Ind with all validations."
)
@dlt.expect("no_duplicates", "is_duplicate = false")
def final_client_secured_ind():
    return dlt.read("Client_Secured_Ind_with_duplicates")


py4j.Py4JException: An exception was raised by the Python Proxy. Return Message: Traceback (most recent call last):
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 642, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
  File "/databricks/spark/python/dlt/helpers.py", line 31, in call
    res = self.func()
  File "/root/.ipykernel/1450/command-2020870890715496-3359285086", line 55, in source_data_validated
    return dlt.read("source_2")
  File "/databricks/spark/python/dlt/api.py", line 598, in read
    pipeline.instance.get_scala_pipeline().read(name),
  File "/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1355, in __call__
    return_value = get_return_value(
  File "/databricks/spark/python/pyspark/errors/exceptions/captured.py", line 230, in deco
    raise converted from None
pyspark.errors.exceptions.captured.AnalysisException: Failed to read dataset 'source_2'. Dataset is not defin