In [45]:
import sys

# Check if running in Fabric environment
if "notebookutils" in sys.modules:
    import sys

    notebookutils.fs.mount(
        "abfss://{{varlib:config_workspace_name}}@onelake.dfs.fabric.microsoft.com/{{varlib:config_lakehouse_name}}.Lakehouse/Files/",
        "/config_files",
    )  # type: ignore # noqa: F821
    mount_path = notebookutils.fs.getMountPath("/config_files")  # type: ignore # noqa: F821

    run_mode = "fabric"
    sys.path.insert(0, mount_path)

    # PySpark environment - spark session should be available

else:
    print("NotebookUtils not available, assumed running in local mode.")
    from ingen_fab.python_libs.pyspark.notebook_utils_abstraction import (
        NotebookUtilsFactory,
    )

    notebookutils = NotebookUtilsFactory.create_instance()

    spark = None

    mount_path = None
    run_mode = "local"

NotebookUtils not available, assumed running in local mode.


In [46]:


def load_python_modules_from_path(
    base_path: str, relative_files: list[str], max_chars: int = 1_000_000_000
):
    """
    Executes Python files from a Fabric-mounted file path using notebookutils.fs.head.

    Args:
        base_path (str): The root directory where modules are located.
        relative_files (list[str]): List of relative paths to Python files (from base_path).
        max_chars (int): Max characters to read from each file (default: 1,000,000).
    """
    success_files = []
    failed_files = []

    for relative_path in relative_files:
        full_path = f"file:{base_path}/{relative_path}"
        try:
            print(f"🔄 Loading: {full_path}")
            code = notebookutils.fs.head(full_path, max_chars)
            exec(code, globals())  # Use globals() to share context across modules
            success_files.append(relative_path)
        except Exception:
            failed_files.append(relative_path)
            print(f"❌ Error loading {relative_path}")

    print("\n✅ Successfully loaded:")
    for f in success_files:
        print(f" - {f}")

    if failed_files:
        print("\n⚠️ Failed to load:")
        for f in failed_files:
            print(f" - {f}")


def clear_module_cache(prefix: str):
    """Clear module cache for specified prefix"""
    for mod in list(sys.modules):
        if mod.startswith(prefix):
            print("deleting..." + mod)
            del sys.modules[mod]


# Always clear the module cache - We may remove this once the libs are stable
clear_module_cache("ingen_fab.python_libs")
clear_module_cache("ingen_fab")

deleting...ingen_fab.python_libs
deleting...ingen_fab.python_libs.common
deleting...ingen_fab.python_libs.common.config_utils
deleting...ingen_fab.python_libs.interfaces
deleting...ingen_fab.python_libs.interfaces.ddl_utils_interface
deleting...ingen_fab.python_libs.interfaces.data_store_interface
deleting...ingen_fab.python_libs.pyspark.lakehouse_utils
deleting...ingen_fab.python_libs.pyspark.ddl_utils
deleting...ingen_fab.python_libs.common.utils
deleting...ingen_fab.python_libs.common.utils.path_utils
deleting...ingen_fab.python_libs.interfaces.notebook_utils_interface
deleting...ingen_fab.python_libs.common.notebook_utils_base
deleting...ingen_fab.python_libs.pyspark.notebook_utils_abstraction
deleting...ingen_fab.python_libs.pyspark.parquet_load_utils
deleting...ingen_fab.python_libs.pyspark
deleting...ingen_fab.python_libs.common.spark_session_factory
deleting...ingen_fab
deleting...ingen_fab.notebook_utils.notebook_utils
deleting...ingen_fab.notebook_utils.base_notebook_compiler

In [47]:
if run_mode == "local":
    from ingen_fab.python_libs.common.config_utils import get_configs_as_object
    from ingen_fab.python_libs.pyspark.ddl_utils import ddl_utils
    from ingen_fab.python_libs.pyspark.lakehouse_utils import lakehouse_utils
    from ingen_fab.packages.sample_data.runtime.sample_data_manager import SampleDataManager
    from ingen_fab.python_libs.pyspark.notebook_utils_abstraction import (
        NotebookUtilsFactory,
    )

    notebookutils = NotebookUtilsFactory.create_instance()
else:
    files_to_load = [
        "ingen_fab/python_libs/common/config_utils.py",
        "ingen_fab/python_libs/pyspark/lakehouse_utils.py",
        "ingen_fab/python_libs/pyspark/ddl_utils.py",
        "ingen_fab/python_libs/pyspark/notebook_utils_abstraction.py",
    ]

    load_python_modules_from_path(mount_path, files_to_load)

In [48]:
import os
os.environ["LOCAL_SPARK_PROVIDER"] = "native"

In [49]:
target_lakehouse = lakehouse_utils(
    target_workspace_id=get_configs_as_object().config_workspace_id,
    target_lakehouse_id=get_configs_as_object().config_lakehouse_id,
)

Creating local Spark session with provider: native
Using Spark provider: native


ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
target_lakehouse.spark.conf.get("spark.sql.catalog.spark_catalog")

'org.apache.spark.sql.delta.catalog.DeltaCatalog'

In [None]:
from pathlib import Path
from ingen_fab.packages.dbt.runtime.dynamic.dag_executor import DynamicDAGExecutor
from ingen_fab.packages.dbt.runtime.dynamic.dag_utils import DAGAnalyzer
from ingen_fab.packages.dbt.runtime.dynamic.visualization import DAGVisualizer


dbt_project = Path("./sample_project/dbt_project/")
dg = DynamicDAGExecutor(target_lakehouse.spark, dbt_project)
dag_analyzer = DAGAnalyzer(dbt_project)
dag_visualizer = DAGVisualizer(dbt_project)


In [44]:
dg.execute_dag()

25/09/13 10:41:01 WARN OptimisticTransaction: Change in the table id detected in txn. Table id for txn on table at file:/workspaces/i4f/tmp/spark/Tables/stg_adventureworks_categories was e53fb535-03cc-492c-b473-f7f9799f2ab9 when the txn was created and is now changed to fda249ab-6c48-4b78-be0a-cc8b7feee400.
25/09/13 10:41:01 WARN OptimisticTransaction: Change in the table id detected in txn. Table id for txn on table at file:/workspaces/i4f/tmp/spark/Tables/stg_adventureworks_subcategories was d625e846-c6d9-4b02-953e-e7ff3dc609c7 when the txn was created and is now changed to d61c15f9-a40c-4eed-b640-00d67c16e028.
25/09/13 10:41:01 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=file:/workspaces/i4f/tmp/spark/Tables/stg_adventureworks_categories/_delta_log, version=7, metadata=Metadata(e53fb535-03cc-492c-b473-f7f9799f2ab9,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"category_id","type":"long","nullable":tr

{'executed': ['seed.testproj.sample',
  'model.testproj.stg_adventureworks_categories',
  'model.testproj.stg_adventureworks_subcategories',
  'model.testproj.my_first_dbt_model',
  'model.testproj.stg_adventureworks_customers',
  'model.testproj.stg_adventureworks_products',
  'model.testproj.stg_adventureworks_persons',
  'test.testproj.source_unique_adventureworks_raw_aw__productcategory_productcategoryid.f76d91bab2',
  'test.testproj.source_not_null_adventureworks_raw_aw__productcategory_productcategoryid.34d7fed1dd',
  'test.testproj.source_not_null_adventureworks_raw_aw__productcategory_name.f4f0b70365',
  'test.testproj.source_unique_adventureworks_raw_aw__productsubcategory_productsubcategoryid.843a94a157',
  'test.testproj.source_not_null_adventureworks_raw_aw__productsubcategory_productsubcategoryid.25e4b3c309',
  'test.testproj.source_not_null_adventureworks_raw_aw__productsubcategory_productcategoryid.dc29ce363b',
  'test.testproj.source_not_null_adventureworks_raw_aw__prod

25/09/14 06:20:10 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1775347 ms exceeds timeout 120000 ms
25/09/14 06:20:10 WARN SparkContext: Killing executors is not supported by current scheduler.
25/09/14 06:20:16 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [38]:
target_lakehouse.spark.sql("""
select
    productid as product_id,
    name as product_name,
    productnumber as product_number,
    color,
    standardcost as standard_cost,
    listprice as list_price,
    size,
    weight,
    productsubcategoryid as category_id,
    productmodelid as model_id,
    sellstartdate as sell_start_date,
    sellenddate as sell_end_date,
    discontinueddate as discontinued_date,
    case when sellenddate is not null or discontinueddate is not null 
         then true 
         else false 
    end as is_discontinued,
    modifieddate as modified_date
from aw__product                       
""").show()

+----------+-------------------+--------------+-----+-------------+----------+----+------+-----------+--------+--------------------+-------------+-----------------+---------------+--------------------+
|product_id|       product_name|product_number|color|standard_cost|list_price|size|weight|category_id|model_id|     sell_start_date|sell_end_date|discontinued_date|is_discontinued|       modified_date|
+----------+-------------------+--------------+-----+-------------+----------+----+------+-----------+--------+--------------------+-------------+-----------------+---------------+--------------------+
|         1|    Adjustable Race|       AR-5381|  NaN|          0.0|       0.0| NaN|   NaN|        NaN|     NaN|2008-04-30 00:00:...|          NaN|              NaN|           true|2014-02-08 10:01:...|
|         2|       Bearing Ball|       BA-8327|  NaN|          0.0|       0.0| NaN|   NaN|        NaN|     NaN|2008-04-30 00:00:...|          NaN|              NaN|           true|2014-02-08 1

In [None]:
target_lakehouse.spark.sql("""
create or replace table stg_adventureworks_customers
  as
select
    customerid as customer_id,
    personid as person_id,
    storeid as store_id,
    territoryid as territory_id,
    rowguid as row_guid,
    modifieddate as modified_date
from aw__customer""")

DataFrame[]

In [None]:
target_lakehouse.spark.sql("drop table stg_adventureworks_addresses")

DataFrame[]

In [None]:
target_lakehouse.spark.sql("""
   create or replace table stg_adventureworks_addresses  
  as
select
    1 as col1                    
"""
)

25/09/13 01:55:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/09/13 01:55:37 WARN OptimisticTransaction: Change in the table id detected in txn. Table id for txn on table at file:/workspaces/i4f/tmp/spark/Tables/stg_adventureworks_addresses was 22943c29-cfdb-4ef4-86fa-867d0bae05fe when the txn was created and is now changed to ba810d32-1a37-4c0e-9a89-99f6337dd328.
25/09/13 01:55:37 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=file:/workspaces/i4f/tmp/spark/Tables/stg_adventureworks_addresses/_delta_log, version=0, metadata=Metadata(22943c29-cfdb-4ef4-86fa-867d0bae05fe,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"col1","type":"integer","nullable":true,"metadata":{}}]},List(),Map(),Some(1757728508240)), logSegment=LogSegment(file:/workspaces/i4f/tmp/spark/Tables/stg_adventurew

DataFrame[]

In [None]:
target_lakehouse.spark.sql("DESCRIBE EXTENDED default.stg_adventureworks_addresses").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                col1|                 int|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|                Name|spark_catalog.def...|       |
|                Type|             MANAGED|       |
|            Location|file:/workspaces/...|       |
|            Provider|               delta|       |
|    Table Properties|[delta.minReaderV...|       |
+--------------------+--------------------+-------+



In [None]:
sdm = SampleDataManager(target_lakehouse)

In [None]:
sdm.load_relational_dataset(
    name="adventureworks",
    prefix="aw_",
    save_to_tables = True
)

25/09/13 01:56:55 WARN TaskSetManager: Stage 70 contains a task of very large size (1456 KiB). The maximum recommended task size is 1000 KiB.
25/09/13 01:56:55 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/13 01:56:57 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/13 01:56:59 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/13 01:57:02 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/09/13 01:57:07 WARN TaskSetManager: Stage 106 contains a task of very large size (1425 KiB). The maximum recommended task size is 1000 KiB.
25/09/13 01:57:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of 

{'ProductCategory':    ProductCategoryID         Name                               rowguid  \
 0                  1        Bikes  CFBDA25C-DF71-47A7-B81B-64EE161AA37C   
 1                  2   Components  C657828D-D808-4ABA-91A3-AF2CE02300E9   
 2                  3     Clothing  10A7C342-CA82-48D4-8A38-46A2EB089B74   
 3                  4  Accessories  2BE3BE36-D9A2-4EEE-B593-ED895D97C2A6   
 
               ModifiedDate  
 0  2008-04-30 00:00:00.000  
 1  2008-04-30 00:00:00.000  
 2  2008-04-30 00:00:00.000  
 3  2008-04-30 00:00:00.000  ,
 'ProductSubcategory':     ProductSubcategoryID  ProductCategoryID               Name  \
 0                      1                  1     Mountain Bikes   
 1                      2                  1         Road Bikes   
 2                      3                  1      Touring Bikes   
 3                      4                  2         Handlebars   
 4                      5                  2    Bottom Brackets   
 5                      

In [None]:
target_lakehouse.list_tables()

['aw__address',
 'aw__customer',
 'aw__person',
 'aw__product',
 'aw__productcategory',
 'aw__productdescription',
 'aw__productmodel',
 'aw__productmodelproductdescriptionculture',
 'aw__productsubcategory',
 'aw__salesorderdetail',
 'aw__salesorderheader',
 'stg_adventureworks_addresses']

In [None]:
df = target_lakehouse.execute_query("SELECT 1 as col1")

In [None]:
target_lakehouse.write_to_table(df=df, table_name="test_table", mode="overwrite")

25/09/12 11:57:32 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`default`.`test_table` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/09/12 11:57:32 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/09/12 11:57:32 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist


In [None]:
from ingen_fab.packages.data_profiling.runtime.persistence.enhanced_lakehouse_persistence import (
    EnhancedLakehousePersistence,
)

enhanced_lakehouse_persistence = EnhancedLakehousePersistence(
    lakehouse=target_lakehouse, spark=target_lakehouse.spark, table_prefix="tiered_profile"
)


In [None]:
table_facts_df.filter(f"table_name = '{table_name}'").collect()

In [None]:
enhanced_lakehouse_persistence.list_all_profiles()

enhanced_lakehouse_persistence.load_profile("synthetic_orders")  # type: ignore # noqa: F821
table_facts_df = enhanced_lakehouse_persistence.lakehouse.read_table(enhanced_lakehouse_persistence.fact_table_profiles_table)

In [None]:
table_facts_df.count()

2

25/09/09 10:21:58 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 8579576 ms exceeds timeout 120000 ms
25/09/09 10:21:58 WARN SparkContext: Killing executors is not supported by current scheduler.
25/09/09 10:22:06 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [None]:
target_lakehouse.execute_query("Update config_data_profiling set active_yn = 'Y' where table_name = 'synthetic_orders'").show()

+-----------------+
|num_affected_rows|
+-----------------+
|                1|
+-----------------+



25/09/05 08:07:46 WARN UpdateCommand: Could not validate number of records due to missing statistics.


In [None]:

target_lakehouse.spark.catalog.("tiered_profile_profiles").

SyntaxError: invalid syntax (1319013832.py, line 1)

In [None]:
target_lakehouse.spark.sql("drop table tiered_profile_schemas")

DataFrame[]

In [None]:
from delta.tables import DeltaTable
from pathlib import Path
delta_table = DeltaTable.forPath(target_lakehouse.spark, f"file:///{Path.cwd()}/tmp/spark/Tables/config_data_profiling")
details = delta_table.detail().collect()[0]

25/08/24 06:51:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [None]:
import lazy_import
deploy_commands = lazy_import.lazy_module("ingen_fab.cli_utils.deploy_commands")
deploy_commands.deploy_to_environment("")



AttributeError: 'str' object has no attribute 'obj'

In [None]:
from ingen_fab.python_libs.pyspark.tiered_profiler import TieredProfiler


configs = get_configs_as_object()
lakehouse = lakehouse_utils(
    target_workspace_id=configs.config_workspace_id,
    target_lakehouse_id=configs.config_lakehouse_id,
)


# Initialize profiler using lakehouse_utils
print("\n🔌 Initializing Tiered Profiler with lakehouse_utils...")

# Create profiler with lakehouse_utils (persistence now in lakehouse)
profiler = TieredProfiler(
    lakehouse=lakehouse,
    table_prefix="test_l3_profile"
)

x = profiler.get_explorer()
x.list_available_profiles()

Creating local Spark session with provider: native


:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-613e822e-d9d5-491f-9a0d-a0c97c6581b2;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 82ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| numb


🔌 Initializing Tiered Profiler with lakehouse_utils...
Delta table already exists at path: file:////workspaces/i4f/tmp/spark/Tables/test_l3_profile_metadata, skipping creation.
Delta table already exists at path: file:////workspaces/i4f/tmp/spark/Tables/test_l3_profile_schemas, skipping creation.
Delta table already exists at path: file:////workspaces/i4f/tmp/spark/Tables/test_l3_profile_progress, skipping creation.


25/09/05 06:55:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[{'table_name': 'test_fix_progress',
  'row_count': 0,
  'column_count': 12,
  'scan_timestamp': '2025-08-24 10:24:03'},
 {'table_name': 'test_fix_metadata',
  'row_count': 0,
  'column_count': 11,
  'scan_timestamp': '2025-08-24 10:24:01'},
 {'table_name': 'test_optimized_l3_schemas',
  'row_count': 2,
  'column_count': 6,
  'scan_timestamp': '2025-08-24 10:23:59'},
 {'table_name': 'test_optimized_l3_profiles',
  'row_count': 2,
  'column_count': 5,
  'scan_timestamp': '2025-08-24 10:23:56'},
 {'table_name': 'test_optimized_l3_progress',
  'row_count': 48,
  'column_count': 12,
  'scan_timestamp': '2025-08-24 10:23:54'},
 {'table_name': 'test_optimized_l3_metadata',
  'row_count': 48,
  'column_count': 11,
  'scan_timestamp': '2025-08-24 10:23:52'},
 {'table_name': 'test_complete_fix_profiles',
  'row_count': 1,
  'column_count': 5,
  'scan_timestamp': '2025-08-24 10:23:50'},
 {'table_name': 'test_complete_fix_progress',
  'row_count': 43,
  'column_count': 12,
  'scan_timestamp': '20

In [None]:
target_lakehouse.spark.sql("SELECT * FROM test_optimized_l3_progress").show()

+--------------------+--------------------+-------------------+-----------------+-------------------+-----------------+-------------------+-----------------+-------------------+----------+---------------+--------------------+
|          table_name|   level_1_completed|level_1_duration_ms|level_2_completed|level_2_duration_ms|level_3_completed|level_3_duration_ms|level_4_completed|level_4_duration_ms|last_error|last_error_time|        last_updated|
+--------------------+--------------------+-------------------+-----------------+-------------------+-----------------+-------------------+-----------------+-------------------+----------+---------------+--------------------+
| synthetic_customers|2025-08-24 10:19:...|               1189|             NULL|               NULL|             NULL|               NULL|             NULL|               NULL|      NULL|           NULL|2025-08-24 10:19:...|
|     profile_results|2025-08-24 10:19:...|               1125|             NULL|               

In [None]:
du = ddl_utils(
    target_workspace_id=get_configs_as_object().config_workspace_id,
    target_lakehouse_id=get_configs_as_object().config_lakehouse_id,
)

config_lakehouse = lakehouse_utils(
    target_workspace_id=get_configs_as_object().config_workspace_id,
    target_lakehouse_id=get_configs_as_object().config_lakehouse_id,
)

Found existing Spark session, reusing it.
Delta table already exists at path: file:////workspaces/i4f/tmp/spark/Tables/ddl_script_executions, skipping creation.
Skipping ddl_script_executions as it already exists
Found existing Spark session, reusing it.


In [None]:
# Sample configuration data for flat file ingestion testing - Universal schema (Lakehouse version)
from pyspark.sql import Row

# Import the universal schema definition
from pyspark.sql.types import (
    BooleanType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

schema = StructType(
    [
        StructField("config_id", StringType(), nullable=False),
        StructField("config_name", StringType(), nullable=False),
        StructField("source_file_path", StringType(), nullable=False),
        StructField(
            "source_file_format", StringType(), nullable=False
        ),  # csv, json, parquet, avro, xml
        # Source location fields (optional - defaults to target or raw workspace)
        StructField(
            "source_workspace_id", StringType(), nullable=True
        ),  # Source workspace (defaults to target if null)
        StructField(
            "source_datastore_id", StringType(), nullable=True
        ),  # Source lakehouse/warehouse (defaults to raw if null)
        StructField(
            "source_datastore_type", StringType(), nullable=True
        ),  # 'lakehouse' or 'warehouse' (defaults to lakehouse)
        StructField(
            "source_file_root_path", StringType(), nullable=True
        ),  # Root path override (e.g., "Files", "Tables")
        # Target location fields
        StructField(
            "target_workspace_id", StringType(), nullable=False
        ),  # Universal field for workspace
        StructField(
            "target_datastore_id", StringType(), nullable=False
        ),  # Universal field for lakehouse/warehouse
        StructField(
            "target_datastore_type", StringType(), nullable=False
        ),  # 'lakehouse' or 'warehouse'
        StructField("target_schema_name", StringType(), nullable=False),
        StructField("target_table_name", StringType(), nullable=False),
        StructField(
            "staging_table_name", StringType(), nullable=True
        ),  # For warehouse COPY INTO staging
        StructField("file_delimiter", StringType(), nullable=True),  # for CSV files
        StructField("has_header", BooleanType(), nullable=True),  # for CSV files
        StructField("encoding", StringType(), nullable=True),  # utf-8, latin-1, etc.
        StructField("date_format", StringType(), nullable=True),  # for date columns
        StructField(
            "timestamp_format", StringType(), nullable=True
        ),  # for timestamp columns
        StructField(
            "schema_inference", BooleanType(), nullable=False
        ),  # whether to infer schema
        StructField(
            "custom_schema_json", StringType(), nullable=True
        ),  # custom schema definition
        StructField(
            "partition_columns", StringType(), nullable=True
        ),  # comma-separated list
        StructField(
            "sort_columns", StringType(), nullable=True
        ),  # comma-separated list
        StructField(
            "write_mode", StringType(), nullable=False
        ),  # overwrite, append, merge
        StructField("merge_keys", StringType(), nullable=True),  # for merge operations
        StructField(
            "data_validation_rules", StringType(), nullable=True
        ),  # JSON validation rules
        StructField(
            "error_handling_strategy", StringType(), nullable=False
        ),  # fail, skip, log
        StructField("execution_group", IntegerType(), nullable=False),
        StructField("active_yn", StringType(), nullable=False),
        StructField("created_date", StringType(), nullable=False),
        StructField("modified_date", StringType(), nullable=True),
        StructField("created_by", StringType(), nullable=False),
        StructField("modified_by", StringType(), nullable=True),
        # Advanced CSV configuration fields
        StructField("quote_character", StringType(), nullable=True),  # Default: '"'
        StructField(
            "escape_character", StringType(), nullable=True
        ),  # Default: '"' (Excel style)
        StructField("multiline_values", BooleanType(), nullable=True),  # Default: True
        StructField(
            "ignore_leading_whitespace", BooleanType(), nullable=True
        ),  # Default: False
        StructField(
            "ignore_trailing_whitespace", BooleanType(), nullable=True
        ),  # Default: False
        StructField("null_value", StringType(), nullable=True),  # Default: ""
        StructField("empty_value", StringType(), nullable=True),  # Default: ""
        StructField("comment_character", StringType(), nullable=True),  # Default: None
        StructField("max_columns", IntegerType(), nullable=True),  # Default: 100
        StructField(
            "max_chars_per_column", IntegerType(), nullable=True
        ),  # Default: 50000
        # New fields for incremental synthetic data import support
        StructField(
            "import_pattern", StringType(), nullable=True
        ),  # 'single_file', 'date_partitioned', 'wildcard_pattern'
        StructField(
            "date_partition_format", StringType(), nullable=True
        ),  # Date partition format (e.g., 'YYYY/MM/DD')
        StructField(
            "table_relationship_group", StringType(), nullable=True
        ),  # Group for related table imports
        StructField(
            "batch_import_enabled", BooleanType(), nullable=True
        ),  # Enable batch processing
        StructField(
            "file_discovery_pattern", StringType(), nullable=True
        ),  # Pattern for automatic file discovery
        StructField(
            "import_sequence_order", IntegerType(), nullable=True
        ),  # Order for related table imports
        StructField(
            "date_range_start", StringType(), nullable=True
        ),  # Start date for batch import
        StructField(
            "date_range_end", StringType(), nullable=True
        ),  # End date for batch import
        StructField(
            "skip_existing_dates", BooleanType(), nullable=True
        ),  # Skip already imported dates
        StructField(
            "source_is_folder", BooleanType(), nullable=True
        ),  # True for folder with part files, False for single file
    ]
)

# Sample configuration records for testing - Using synthetic data generator files
sample_configs = [
    Row(
        config_id="synthetic_customers_folder_001",
        config_name="Synthetic Data - Customers Folder (Retail OLTP Small)",
        source_file_path="synthetic_data/csv/series/retail_oltp_small/test_data_feb_small/flat/snapshot_customers/snapshot_customers_20240201.csv",
        source_file_format="csv",
        source_workspace_id="{{varlib:sample_lh_workspace_id}}",
        source_datastore_id="{{varlib:sample_lh_lakehouse_id}}",
        source_datastore_type="lakehouse",
        source_file_root_path=None,  # No root path override needed
        target_workspace_id="{{varlib:sample_lh_workspace_id}}",
        target_datastore_id="{{varlib:sample_lh_lakehouse_id}}",
        target_datastore_type="lakehouse",
        target_schema_name="raw",
        target_table_name="synthetic_customers",
        staging_table_name=None,
        file_delimiter=",",
        has_header=True,
        encoding="utf-8",
        date_format="yyyy-MM-dd",
        timestamp_format="yyyy-MM-dd HH:mm:ss",
        schema_inference=True,
        custom_schema_json=None,
        partition_columns="",
        sort_columns="customer_id",
        write_mode="overwrite",
        merge_keys="",
        data_validation_rules=None,
        error_handling_strategy="log",
        execution_group=1,  # Folder-based snapshot processing (Group 1)
        active_yn="Y",
        created_date="2024-01-15",
        modified_date=None,
        created_by="system",
        modified_by=None,
        quote_character='"',
        escape_character='"',
        multiline_values=True,
        ignore_leading_whitespace=False,
        ignore_trailing_whitespace=False,
        null_value="",
        empty_value="",
        comment_character=None,
        max_columns=100,
        max_chars_per_column=50000,
        # New fields for incremental synthetic data import support
        import_pattern="single_file",
        date_partition_format=None,
        table_relationship_group="retail_oltp_single",
        batch_import_enabled=False,
        file_discovery_pattern=None,
        import_sequence_order=1,
        date_range_start=None,
        date_range_end=None,
        skip_existing_dates=None,
        source_is_folder=True,  # Read all part files from the folder
    ),
    Row(
        config_id="synthetic_orders_folder_002",
        config_name="Synthetic Data - Orders Folder (Retail OLTP Small)",
        source_file_path="synthetic_data/parquet/series/retail_oltp_small/high_volume_summer_data/flat/orders/",
        source_file_format="parquet",
        source_workspace_id="{{varlib:sample_lh_workspace_id}}",
        source_datastore_id="{{varlib:sample_lh_lakehouse_id}}",
        source_datastore_type="lakehouse",
        source_file_root_path=None,  # No root path override needed
        target_workspace_id="{{varlib:sample_lh_workspace_id}}",
        target_datastore_id="{{varlib:sample_lh_lakehouse_id}}",
        target_datastore_type="lakehouse",
        target_schema_name="raw",
        target_table_name="synthetic_orders",
        staging_table_name=None,
        file_delimiter=",",
        has_header=True,
        encoding="utf-8",
        date_format="yyyyMMdd",
        timestamp_format="yyyy-MM-dd HH:mm:ss",
        schema_inference=True,
        custom_schema_json=None,
        partition_columns="",
        sort_columns="order_id",
        write_mode="overwrite",
        merge_keys="",
        data_validation_rules=None,
        error_handling_strategy="log",
        execution_group=1,
        active_yn="Y",
        created_date="2024-01-15",
        modified_date=None,
        created_by="system",
        modified_by=None,
        quote_character='"',
        escape_character='"',
        multiline_values=True,
        ignore_leading_whitespace=False,
        ignore_trailing_whitespace=False,
        null_value="",
        empty_value="",
        comment_character=None,
        max_columns=100,
        max_chars_per_column=50000,
        # New fields for incremental synthetic data import support
        import_pattern="date_partitioned",
        date_partition_format="yyyyMMdd",
        table_relationship_group="retail_oltp_single",
        batch_import_enabled=False,
        file_discovery_pattern="orders_*.parquet",
        import_sequence_order=1,
        date_range_start=None,
        date_range_end=None,
        skip_existing_dates=None,
        source_is_folder=True,  # Read all part files from the folder
    ),
]

# Create DataFrame and insert records
print(len(schema.fields))
print(len(sample_configs[0].asDict()))
df = target_lakehouse.get_connection.createDataFrame(sample_configs, schema)
target_lakehouse.drop_table("config_flat_file_ingestion")
target_lakehouse.write_to_table(
    df=df, table_name="config_flat_file_ingestion", mode="overwrite"
)

53
53


                                                                                

⚠ Alert: Registering table 'config_flat_file_ingestion' in the Hive catalog for local Spark.


25/08/20 22:47:05 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`default`.`config_flat_file_ingestion` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
25/08/20 22:47:05 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
25/08/20 22:47:05 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist


In [None]:
config_lakehouse.execute_query("Select * from profile_results").show()

NameError: name 'config_lakehouse' is not defined

In [None]:
spark

In [None]:
target_lakehouse.spark.read.format("parquet").load(
    "tmp/spark/Files/synthetic_data/parquet/series/retail_oltp_small/high_volume_summer_data/flat/orders/orders_20240601.parquet/part-00023-a53a5973-8248-405a-9982-e270544646be-c000.snappy.parquet"
).show()

+--------+-----------+----------+----------+--------------+-----------+-------------+---------------+------------+--------------+
|order_id|customer_id|order_date|    status|payment_method|order_total|shipping_cost|discount_amount|shipped_date|delivered_date|
+--------+-----------+----------+----------+--------------+-----------+-------------+---------------+------------+--------------+
| 1150001|          2|2022-09-09|   Pending|    Debit Card|      170.1|          0.1|            0.1|        NULL|          NULL|
| 1150002|          3|2022-09-10|   Pending|        PayPal|      170.2|          0.2|            0.2|        NULL|          NULL|
| 1150003|          4|2022-09-11|   Pending| Bank Transfer|      170.3|          0.3|            0.0|        NULL|          NULL|
| 1150004|          5|2022-09-12|   Pending|          Cash|      170.4|          0.4|            0.0|        NULL|          NULL|
| 1150005|          6|2022-09-13|   Pending|   Credit Card|      170.5|          0.5|     

In [None]:
lu.execute_query("SELECT * from log_synapse_extract_run_log").show()

NameError: name 'lu' is not defined

In [None]:
lu.execute_query("SELECT * from config_extract_generation").show()

25/07/31 08:23:47 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------------+---------+------------+---------------------+------------------+--------------------+-----------------+-------------------+------------+------------------+------------+----------+-------------+-----------+
|        extract_name|is_active|trigger_name|extract_pipeline_name|extract_table_name|extract_table_schema|extract_view_name|extract_view_schema|is_full_load|   execution_group|created_date|created_by|modified_date|modified_by|
+--------------------+---------+------------+---------------------+------------------+--------------------+-----------------+-------------------+------------+------------------+------------+----------+-------------+-----------+
|SAMPLE_CUSTOMERS_...|     true|        NULL|                 NULL|         customers|             default|             NULL|               NULL|        true|LAKEHOUSE_EXTRACTS|  2024-01-15|    system|         NULL|       NULL|
|SAMPLE_PRODUCTS_L...|     true|        NULL|                 NULL|          products|  

In [None]:
lu.execute_query("SELECT * from config_extract_generation_details").show()

25/07/31 08:51:08 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=file:/workspaces/ingen_fab/tmp/spark/Tables/config_extract_generation_details/_delta_log, version=1, metadata=Metadata(82c2237b-0ce7-44e7-9f94-c5248df68ee6,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"extract_name","type":"string","nullable":true,"metadata":{}},{"name":"file_generation_group","type":"string","nullable":true,"metadata":{}},{"name":"extract_container","type":"string","nullable":true,"metadata":{}},{"name":"extract_directory","type":"string","nullable":true,"metadata":{}},{"name":"extract_file_name","type":"string","nullable":true,"metadata":{}},{"name":"extract_file_name_timestamp_format","type":"string","nullable":true,"metadata":{}},{"name":"extract_file_name_period_end_day","type":"integer","nullable":true,"metadata":{}},{"name":"extract_file_name_extension","type":"string","nullable":true,"metadata":{}},{"name":"extract_f

+--------------------+---------------------+-----------------+-----------------+-------------------+----------------------------------+--------------------------------+---------------------------+--------------------------+--------------------------------+-----------------------------+------------------------+-------------------------------+--------------------------------+----------------------+--------------------------+---------------------------------+-------------+---------------+----------------------+-------------+---------------+----------------+--------------------+--------------------+---------------------+-----------------+------------+----------+-------------+-----------+
|        extract_name|file_generation_group|extract_container|extract_directory|  extract_file_name|extract_file_name_timestamp_format|extract_file_name_period_end_day|extract_file_name_extension|extract_file_name_ordering|file_properties_column_delimiter|file_properties_row_delimiter|file_properties_encod

In [None]:
lu.execute_query("SELECT * FROM log_flat_file_ingestion").show()

+------+---------+------------+--------------+------------+------+----------------+----------------------+-------------------------+-----------------+-----------------+----------------+---------------+---------------+--------------+----------------+-----------------+-----------------------+----------------------+-------------------------------+--------------------+---------------------+-------------------------+-----------------+-----------------+-------------------+------------+------------------------+-------------+-------------+--------------------------+--------------------+------------+----------+
|log_id|config_id|execution_id|job_start_time|job_end_time|status|source_file_path|source_file_size_bytes|source_file_modified_time|target_table_name|records_processed|records_inserted|records_updated|records_deleted|records_failed|source_row_count|staging_row_count|target_row_count_before|target_row_count_after|row_count_reconciliation_status|row_count_difference|data_read_duration_ms|s

In [None]:
import os

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ConfigCheck").getOrCreate()

# Method 1: Check SPARK_HOME and config file locations
print(f"SPARK_HOME: {os.environ.get('SPARK_HOME', 'Not set')}")
print(
    f"Expected config file: {os.environ.get('SPARK_HOME', '')}/conf/spark-defaults.conf"
)
df = spark.sql("SELECT 1 as test")
df.show()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/14 06:03:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


SPARK_HOME: /opt/bitnami/spark
Expected config file: /opt/bitnami/spark/conf/spark-defaults.conf


25/07/14 06:03:23 ERROR JavaUtils: Failed to create directory artifacts/spark-bf33e5a4-c086-4102-a9b8-7e3faf685181
java.nio.file.AccessDeniedException: /workspaces/ingen_fab/artifacts/spark-bf33e5a4-c086-4102-a9b8-7e3faf685181
	at java.base/sun.nio.fs.UnixException.translateToIOException(Unknown Source)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(Unknown Source)
	at java.base/sun.nio.fs.UnixException.rethrowAsIOException(Unknown Source)
	at java.base/sun.nio.fs.UnixFileSystemProvider.createDirectory(Unknown Source)
	at java.base/java.nio.file.Files.createDirectory(Unknown Source)
	at java.base/java.nio.file.Files.createAndCheckIsDirectory(Unknown Source)
	at java.base/java.nio.file.Files.createDirectories(Unknown Source)
	at org.apache.spark.network.util.JavaUtils.createDirectory(JavaUtils.java:416)
	at org.apache.spark.util.SparkFileUtils.createDirectory(SparkFileUtils.scala:95)
	at org.apache.spark.util.SparkFileUtils.createDirectory$(SparkFileUtils.scala:94)
	at org.

Py4JJavaError: An error occurred while calling o35.showString.
: java.io.IOException: Failed to create a temp directory (under artifacts) after 10 attempts!
	at org.apache.spark.network.util.JavaUtils.createDirectory(JavaUtils.java:411)
	at org.apache.spark.util.SparkFileUtils.createDirectory(SparkFileUtils.scala:95)
	at org.apache.spark.util.SparkFileUtils.createDirectory$(SparkFileUtils.scala:94)
	at org.apache.spark.util.Utils$.createDirectory(Utils.scala:99)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:249)
	at org.apache.spark.sql.artifact.ArtifactManager$.artifactRootDirectory$lzycompute(ArtifactManager.scala:468)
	at org.apache.spark.sql.artifact.ArtifactManager$.artifactRootDirectory(ArtifactManager.scala:467)
	at org.apache.spark.sql.artifact.ArtifactManager.artifactRootPath(ArtifactManager.scala:60)
	at org.apache.spark.sql.artifact.ArtifactManager.<init>(ArtifactManager.scala:70)
	at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$artifactManager$2(BaseSessionStateBuilder.scala:395)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.sql.internal.BaseSessionStateBuilder.artifactManager(BaseSessionStateBuilder.scala:395)
	at org.apache.spark.sql.internal.BaseSessionStateBuilder.$anonfun$build$6(BaseSessionStateBuilder.scala:433)
	at org.apache.spark.sql.internal.SessionState.artifactManager$lzycompute(SessionState.scala:109)
	at org.apache.spark.sql.internal.SessionState.artifactManager(SessionState.scala:109)
	at org.apache.spark.sql.classic.SparkSession.artifactManager(SparkSession.scala:233)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:123)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:291)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:123)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:77)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:233)
	at org.apache.spark.sql.classic.Dataset.withAction(Dataset.scala:2232)
	at org.apache.spark.sql.classic.Dataset.head(Dataset.scala:1379)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2810)
	at org.apache.spark.sql.classic.Dataset.getRows(Dataset.scala:339)
	at org.apache.spark.sql.classic.Dataset.showString(Dataset.scala:375)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.base/java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Unknown Source)


In [None]:
builder = (
    SparkSession.builder.appName("MyApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)
from delta import configure_spark_with_delta_pip

configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
lu.read_file(
    file_path="synthetic_data/single/retail_oltp_small/customers.csv/",
    file_format="csv",
).count()

Reading file from: synthetic_data/single/retail_oltp_small/customers.csv/
First part of path: synthetic_data
Reading file from: file:////workspaces/ingen_fab/tmp/spark/Files/synthetic_data/single/retail_oltp_small/customers.csv/


1001