In [15]:

import sys

# Decide the execution mode: Fabric when `notebookutils` is already loaded in
# this interpreter, otherwise fall back to the local abstraction layer.
if "notebookutils" not in sys.modules:
    print("NotebookUtils not available, assumed running in local mode.")
    from ingen_fab.python_libs.pyspark.notebook_utils_abstraction import (
        NotebookUtilsFactory,
    )

    # Local stand-in bound to the same name the Fabric branch relies on.
    notebookutils = NotebookUtilsFactory.create_instance()

    # Local mode starts without a Spark handle and without a mounted config path.
    spark = None
    mount_path = None
    run_mode = "local"
else:
    # Mount the Files area of the config lakehouse at /config_files.
    # The {{varlib:...}} tokens are template placeholders — presumably
    # substituted before this notebook is executed; verify against the generator.
    notebookutils.fs.mount(  # type: ignore # noqa: F821
        "abfss://{{varlib:config_workspace_name}}@onelake.dfs.fabric.microsoft.com/{{varlib:config_lakehouse_name}}.Lakehouse/Files/",
        "/config_files",
    )
    mount_path = notebookutils.fs.getMountPath("/config_files")  # type: ignore # noqa: F821

    run_mode = "fabric"
    # Front of sys.path, so the mounted location is searched first on import.
    sys.path.insert(0, mount_path)

    # PySpark environment - `spark` is not assigned here; the session is
    # expected to be available already.

NotebookUtils not available, assumed running in local mode.


In [16]:
import traceback

def load_python_modules_from_path(base_path: str, relative_files: list[str], max_chars: int = 1_000_000_000):
    """
    Execute Python files from a Fabric-mounted file path in the notebook's global namespace.

    Each file is read as text with ``notebookutils.fs.head`` and run with ``exec``
    against ``globals()``, so names defined by one file are visible to the files
    loaded after it and to the rest of the notebook. A failure in one file is
    reported (exception type, message and traceback) and does not stop the
    remaining files from loading.

    Args:
        base_path (str): The root directory where modules are located.
        relative_files (list[str]): Relative paths to Python files (from base_path),
            loaded in the order given.
        max_chars (int): Upper bound passed to ``notebookutils.fs.head`` for each
            file (default: 1,000,000,000).

    Returns:
        None. A summary of loaded and failed files is printed.
    """
    success_files = []
    failed_files = []

    for relative_path in relative_files:
        full_path = f"file:{base_path}/{relative_path}"
        try:
            print(f"🔄 Loading: {full_path}")
            code = notebookutils.fs.head(full_path, max_chars)
            # NOTE: exec runs whatever the file contains with full notebook
            # privileges; only point this at trusted, project-owned sources.
            exec(code, globals())  # Use globals() to share context across modules
            success_files.append(relative_path)
        except Exception as e:
            failed_files.append(relative_path)
            # Surface the cause; a bare "error loading" line makes a broken
            # module impossible to diagnose from the notebook output.
            print(f"❌ Error loading {relative_path}: {type(e).__name__}: {e}")
            traceback.print_exc()

    print("\n✅ Successfully loaded:")
    for f in success_files:
        print(f" - {f}")

    if failed_files:
        print("\n⚠️ Failed to load:")
        for f in failed_files:
            print(f" - {f}")

def clear_module_cache(prefix: str):
    """Remove every ``sys.modules`` entry whose name starts with ``prefix``.

    Each removed module name is printed before it is dropped, so the next
    import of that module is loaded afresh instead of served from the cache.
    """
    # Snapshot the matching names first; sys.modules is mutated in the loop below.
    stale_names = [name for name in tuple(sys.modules) if name.startswith(prefix)]
    for name in stale_names:
        print("deleting..." + name)
        del sys.modules[name]

# Always clear the module cache - We may remove this once the libs are stable.
# A single call is enough: every "ingen_fab.python_libs*" module name also
# starts with "ingen_fab", so the broader prefix already covers it.
clear_module_cache("ingen_fab")

deleting...ingen_fab.python_libs
deleting...ingen_fab.python_libs.common
deleting...ingen_fab.python_libs.common.config_utils
deleting...ingen_fab.python_libs.interfaces
deleting...ingen_fab.python_libs.interfaces.ddl_utils_interface
deleting...ingen_fab.python_libs.interfaces.data_store_interface
deleting...ingen_fab.python_libs.pyspark.lakehouse_utils
deleting...ingen_fab.python_libs.pyspark.ddl_utils
deleting...ingen_fab.python_libs.pyspark.notebook_utils_abstraction
deleting...ingen_fab.python_libs.pyspark.parquet_load_utils
deleting...ingen_fab.python_libs.pyspark
deleting...ingen_fab


In [17]:


# Bring the shared library code into scope for whichever mode was detected.
if run_mode != "local":
    # Fabric: the sources sit under the mounted path and are executed into the
    # notebook globals, in the order listed here.
    files_to_load = [
        "ingen_fab/python_libs/common/config_utils.py",
        "ingen_fab/python_libs/pyspark/lakehouse_utils.py",
        "ingen_fab/python_libs/pyspark/ddl_utils.py",
        "ingen_fab/python_libs/pyspark/notebook_utils_abstraction.py",
    ]

    load_python_modules_from_path(mount_path, files_to_load)
else:
    # Local: import the same helpers from the installed package.
    from ingen_fab.python_libs.common.config_utils import get_configs_as_object
    from ingen_fab.python_libs.pyspark.ddl_utils import ddl_utils
    from ingen_fab.python_libs.pyspark.lakehouse_utils import lakehouse_utils
    from ingen_fab.python_libs.pyspark.notebook_utils_abstraction import (
        NotebookUtilsFactory,
    )

    # Rebuild the notebookutils stand-in from the freshly imported factory.
    notebookutils = NotebookUtilsFactory.create_instance()



In [18]:
# Build the lakehouse helper used by the remaining cells, pointed at the
# workspace and lakehouse IDs exposed by the config object.
lu = lakehouse_utils(
    target_workspace_id=get_configs_as_object().config_workspace_id,
    target_lakehouse_id=get_configs_as_object().config_lakehouse_id    
)

Found existing Spark session, reusing it.


In [34]:
# Show the tables reported by the lakehouse helper (rendered as the cell output).
lu.list_tables()

[Table(name='config_flat_file_ingestion', catalog='spark_catalog', namespace=['default'], description=None, tableType='EXTERNAL', isTemporary=False)]

In [24]:
# Preview the first rows of the ingestion config table. The limit is applied in
# Spark so only five rows are converted to pandas, instead of collecting the
# whole table to the driver and trimming it afterwards.
lu.read_table("config_flat_file_ingestion").limit(5).toPandas()

25/07/13 11:08:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Unnamed: 0,config_id,config_name,source_file_path,source_file_format,target_lakehouse_workspace_id,target_lakehouse_id,target_schema_name,target_table_name,file_delimiter,has_header,...,write_mode,merge_keys,data_validation_rules,error_handling_strategy,execution_group,active_yn,created_date,modified_date,created_by,modified_by
0,parquet_test_003,Parquet Customers Data Test,Files/sample_data/customers.parquet,parquet,{{varlib:config_workspace_id}},{{varlib:config_lakehouse_id}},raw,customers,,,...,overwrite,,,fail,2,Y,2024-01-15,,system,
1,json_test_002,JSON Products Data Test,Files/sample_data/products.json,json,{{varlib:config_workspace_id}},{{varlib:config_lakehouse_id}},raw,products,,,...,overwrite,,,log,1,Y,2024-01-15,,system,
2,csv_test_001,CSV Sales Data Test,Files/sample_data/sales_data.csv,csv,{{varlib:config_workspace_id}},{{varlib:config_lakehouse_id}},raw,sales_data,",",True,...,overwrite,,,fail,1,Y,2024-01-15,,system,


In [25]:
# Query the session's current database: take the first row, then its only column.
current_db_row = lu.execute_query("SELECT current_database()").collect()[0]
current_db = current_db_row[0]
print(f"Current database: {current_db}")

Current database: default


In [26]:
# Collect the SHOW DATABASES rows and report the first column of each one.
databases = lu.execute_query("SHOW DATABASES").collect()
database_names = [row[0] for row in databases]
print(f"Available databases: {database_names}")

Available databases: ['default']


In [33]:
# List the tables in the current database.
tables = lu.execute_query("SHOW TABLES").collect()
# Pick the name by column label instead of by position, so the lookup does not
# depend on where the `tableName` column sits in the SHOW TABLES result.
table_names = [row["tableName"] for row in tables]
print(f"Tables after creation: {table_names}")

Tables after creation: ['config_flat_file_ingestion']


In [29]:
# Print the session's current schema as a one-row table.
lu.execute_query("select current_schema()").show()

+----------------+
|current_schema()|
+----------------+
|         default|
+----------------+



In [32]:
# Register the Delta files at the given location as a catalog table, unless a
# table with this name already exists.
# NOTE(review): the LOCATION is an absolute path on the local dev container;
# confirm whether it should come from configuration instead.
lu.spark.sql(
    "CREATE TABLE IF NOT EXISTS config_flat_file_ingestion USING DELTA LOCATION '/workspaces/ingen_fab/tmp/spark/Tables/config_flat_file_ingestion'"
)

25/07/13 12:04:45 WARN DeltaLog: Change in the table id detected while updating snapshot. 
Previous snapshot = Snapshot(path=file:/workspaces/ingen_fab/tmp/spark/Tables/config_flat_file_ingestion/_delta_log, version=1, metadata=Metadata(8c982d3c-1281-408d-94a8-0e62b4616be0,null,null,Format(parquet,Map()),{"type":"struct","fields":[{"name":"config_id","type":"string","nullable":true,"metadata":{}},{"name":"config_name","type":"string","nullable":true,"metadata":{}},{"name":"source_file_path","type":"string","nullable":true,"metadata":{}},{"name":"source_file_format","type":"string","nullable":true,"metadata":{}},{"name":"target_lakehouse_workspace_id","type":"string","nullable":true,"metadata":{}},{"name":"target_lakehouse_id","type":"string","nullable":true,"metadata":{}},{"name":"target_schema_name","type":"string","nullable":true,"metadata":{}},{"name":"target_table_name","type":"string","nullable":true,"metadata":{}},{"name":"file_delimiter","type":"string","nullable":true,"metadata

DataFrame[]

In [31]:
# Refresh the catalog's cached state for the table in the `default` schema.
# NOTE(review): the recorded output is TABLE_OR_VIEW_NOT_FOUND, and this cell's
# execution count (31) precedes the CREATE TABLE cell (32) — presumably it was
# run before the table was registered; confirm it succeeds when run in order.
lu.spark.catalog.refreshTable("default.config_flat_file_ingestion")

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `default`.`config_flat_file_ingestion` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS. SQLSTATE: 42P01;
'UnresolvedRelation [default, config_flat_file_ingestion], [], false
