# Landing to raw 

This notebook creates **raw Delta tables** in Databricks from **Parquet files** generated by Azure Data Factory (ADF).  
It receives a **table list configuration** from the previous notebook and processes each table by:
- Loading the data from the landing zone.
- Renaming columns to meet Delta/SQL naming rules.
- Adding audit metadata fields.
- Enforcing schema consistency with existing Delta tables.
- Creating or overwriting the final raw table in the target schema.

Tables are processed **in parallel** to improve performance, and the notebook returns a JSON summary of the results.


In [None]:
import os
from datetime import datetime
import re
import json
from delta.tables import *
from pyspark.sql.functions import *
from multiprocessing.pool import ThreadPool
import threading

## Parameters

| Name              | Type     | Default | Description |
|-------------------|----------|---------|-------------|
| `load_id`         | string   | current timestamp | Load identifier for tracking. |
| `table_info_file` | string   | `table_info.csv` | CSV file with table names and optional column selection. |
| `n_threads`       | integer  | 5       | Number of threads for parallel table processing. |

In [None]:
# Notebook parameters. Every widget defaults to an empty string, and an empty
# value falls back to the default listed in the parameters table above.
dbutils.widgets.text("load_id", "", "")
load_id = dbutils.widgets.get("load_id") or datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

dbutils.widgets.text("table_info_file", "", "")
table_info_file = dbutils.widgets.get("table_info_file") or 'table_info.csv'

DEFAULT_N_THREADS = 5
dbutils.widgets.text("n_threads", "", "")
try:
    n_threads = int(dbutils.widgets.get("n_threads") or DEFAULT_N_THREADS)
except ValueError:
    # Non-numeric input: use the default rather than failing the run.
    n_threads = DEFAULT_N_THREADS
if n_threads < 1:
    # ThreadPool refuses a pool size below 1, so zero or a negative number is
    # treated like any other invalid value.
    n_threads = DEFAULT_N_THREADS

print( "table_info_file:", table_info_file )
print( "load_id:", load_id )
print( "n_threads:", n_threads )

## Functions

### `df_rename_cols(df)`
Renames all columns by replacing non-alphanumeric characters with `_`.

### `add_audit_fields(df)`
Adds:
- `aud_creationdate`
- `aud_modifieddate`
- `aud_load_id`
- `aud_operation` (set to `'I'`)

### `enforce_schema_delta(df, delta_table_path)`
If the Delta table exists:
- Matches column types to the existing schema.
- Parses date columns using the configured date format.

### `create_db_table(df, table_name, delta_path, delta_path_spark)`
Writes the DataFrame as Delta (overwriting both data and schema), creates the metastore schema if it does not exist, drops and re-creates the table so that it points at the Delta location, and runs `VACUUM`.

### `process_table(table_name, config, encoding='UTF-8')`
Full processing sequence:
1. Read Parquet file.
2. Apply column renaming.
3. Add audit fields.
4. Enforce schema.
5. Create/overwrite Delta table.

### `process_table_thread(table_name, target_schema=None)`
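Wrapper that runs `process_table` inside a worker thread: it assigns a sequential id, logs progress under a shared lock, and records the table as succeeded or failed instead of raising. `target_schema` is accepted but currently unused.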


In [None]:
def enforce_schema_delta(df, delta_table_path):
    """Align ``df`` with the schema of the Delta table at ``delta_table_path``.

    When the path holds a Delta table, each column of that table's schema is
    applied to ``df`` in turn: the configured date column (if any) is parsed
    with ``to_date`` and every other column is cast to the existing column
    type. A column that cannot be converted, for example because it is absent
    from ``df``, is reported and left as it was so the load carries on.

    When the path is not a Delta table, ``df`` is returned unchanged.

    Args:
        df: DataFrame to align.
        delta_table_path: Location of the existing Delta table.

    Returns:
        The DataFrame with the conversions applied.
    """
    if not DeltaTable.isDeltaTable(spark, delta_table_path):
        print("not a delta table")
        return df

    # `table_metadata` is an optional global that this notebook does not
    # define itself. It is resolved once, before the loop: when it was looked
    # up per column and was missing, each iteration raised inside a bare
    # `except` and no column was ever cast.
    try:
        date_col = re.sub('[^0-9a-zA-Z]', '_', table_metadata["date_col"])
    except (NameError, KeyError, ValueError, TypeError):
        date_col = None  # no usable date column configured: cast everything

    date_format = 'yyyy-MM-dd'
    if date_col is not None:
        try:
            if table_metadata['date_format'] != '':
                date_format = str(table_metadata['date_format'])
        except (KeyError, ValueError, TypeError):
            # Date column configured without a format: keep the default.
            date_format = 'yyyy-MM-dd'

    existing_schema = spark.read.format('delta').load(delta_table_path).schema
    for field in existing_schema:
        name = field.name
        try:
            if date_col is not None and name == date_col:
                # `name` is the date column here, so it is the one to parse.
                df = df.withColumn(name, to_date(df[name], date_format))
            else:
                df = df.withColumn(name, df[name].cast(field.dataType))
        except Exception as e:
            # Best effort per column, but no longer silent.
            print(f"[WARN] enforce_schema_delta: column '{name}' left unchanged: {e}")
    return df

def df_rename_cols(df):
    """Return ``df`` with every column name sanitized.

    Each character that is not an ASCII letter or digit is replaced by ``_``.
    Columns are renamed one at a time, in schema order.
    """
    original_names = [field.name for field in df.schema]
    renamed = df
    for original_name in original_names:
        sanitized = re.sub('[^0-9a-zA-Z]', '_', original_name)
        renamed = renamed.withColumnRenamed(original_name, sanitized)
    return renamed

def add_audit_fields(df):
    """Return ``df`` extended with the four audit columns.

    ``aud_creationdate`` and ``aud_modifieddate`` hold the current timestamp,
    ``aud_load_id`` holds the notebook-level ``load_id`` and ``aud_operation``
    is the literal ``'I'``.
    """
    audit_columns = [
        ('aud_creationdate', current_timestamp()),
        ('aud_modifieddate', current_timestamp()),
        ('aud_load_id', lit(load_id)),
        ('aud_operation', lit('I')),
    ]
    for column_name, value in audit_columns:
        df = df.withColumn(column_name, value)
    return df

def create_db_table(df, table_name, delta_path, delta_path_spark, schema_name='<your_schema>'):
    """Write ``df`` as a Delta table and register it in the metastore.

    The data is written to ``delta_path_spark`` in overwrite mode with
    ``overwriteSchema`` enabled. The schema is then created if missing, any
    existing table of the same name is dropped and re-created on top of the
    Delta location, and ``VACUUM`` is run on it.

    Args:
        df: DataFrame to persist.
        table_name: Name of the table inside ``schema_name``.
        delta_path: Local-style path of the table; not used by this function,
            kept so existing callers keep working.
        delta_path_spark: Location the Delta files are written to and the
            table is registered on.
        schema_name: Target schema. Defaults to the value that was previously
            hard-coded, so existing calls behave as before.
    """
    try:
        dbutils.fs.mkdirs(delta_path_spark)
    except Exception as e:
        # Still best effort, but visible: a real storage problem will surface
        # in the write below.
        print(f"[WARN] could not create directory {delta_path_spark}: {e}")

    df.write.format('delta').mode('overwrite').option('overwriteSchema', True).save(delta_path_spark)

    print(f"creating databricks table: {schema_name}.{table_name}")
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
    # Drop first so the table is re-registered against the data just written.
    spark.sql(f"DROP TABLE IF EXISTS {schema_name}.{table_name}")
    spark.sql(f'CREATE TABLE IF NOT EXISTS {schema_name}.{table_name} USING DELTA LOCATION "{delta_path_spark}"')
    spark.sql(f'VACUUM {schema_name}.{table_name}')

def process_table(table_name, config, encoding='UTF-8'):
    """Load one landing table and publish it as a raw Delta table.

    Steps, each of which logs and re-raises on failure:
      1. Fetch the table's row from ``config``.
      2. Derive the column list from its ``columns`` value (``;``-separated),
         or ``"*"`` when there is no row, no such column, or the value is
         empty.
      3. Read the Parquet data from the landing path.
      4. Rename columns, add audit fields, enforce the existing Delta schema.
      5. Create or overwrite the Delta table.

    Args:
        table_name: Name of the table; also the folder name under the landing
            and raw paths.
        config: DataFrame with a ``table_name`` column and, optionally, a
            ``columns`` column.
        encoding: Not used by this function.
    """
    print(f"[DEBUG] process_table START for table_name={table_name}")
    delta_path = f"/dbfs/mnt/dls/path/to/raw/{table_name}"
    delta_path_spark = f"dbfs:/mnt/dls/path/to/raw/{table_name}"

    print(f"[DEBUG] delta_path: {delta_path}")
    print(f"[DEBUG] delta_path_spark: {delta_path_spark}")

    # 1. Configuration row for this table (at most one).
    try:
        matching_rows = config.filter(col("table_name") == table_name)
        row = matching_rows.limit(1).collect()
        print(f"[DEBUG] config row: {row}")
    except Exception as e:
        print(f"[ERROR] Failed at config row extraction: {e}")
        raise

    # 2. Column selection: start from "everything" and narrow it down only
    #    when the row carries a non-blank `columns` value.
    try:
        cols = "*"
        if row and "columns" in row[0].asDict():
            cols_str = row[0]["columns"]
            if cols_str is not None and cols_str.strip() != "":
                cols = [c.strip() for c in cols_str.split(";") if c.strip() != ""]
        print(f"[DEBUG] columns to select: {cols}")
    except Exception as e:
        print(f"[ERROR] Failed at columns extraction: {e}")
        raise

    # 3. Read the landing Parquet data, projecting when a list was given.
    try:
        landing_df = spark.read.option("infer_schema", False).parquet(f"{topic_path_spark}/{table_name}")
        if cols == "*":
            df = landing_df
        else:
            df = landing_df.select(*cols)
        print(f"[DEBUG] DataFrame loaded for table {table_name}")
    except Exception as e:
        print(f"[ERROR] Failed at DataFrame loading: {e}")
        raise

    # 4a. Sanitize column names.
    try:
        df = df_rename_cols(df)
        print(f"[DEBUG] Columns renamed for table {table_name}")
    except Exception as e:
        print(f"[ERROR] Failed at df_rename_cols: {e}")
        raise

    # 4b. Audit metadata.
    try:
        df = add_audit_fields(df)
        print(f"[DEBUG] Audit fields added for table {table_name}")
    except Exception as e:
        print(f"[ERROR] Failed at add_audit_fields: {e}")
        raise

    # 4c. Align with the schema of the existing Delta table, if there is one.
    try:
        df = enforce_schema_delta(df, delta_path_spark)
        print(f"[DEBUG] enforce_schema_delta done for table {table_name}")
    except Exception as e:
        print(f"[ERROR] Failed at enforce_schema_delta: {e}")
        raise

    # 5. Persist and register the table.
    try:
        create_db_table(df, table_name, delta_path, delta_path_spark)
        print(f"[DEBUG] create_db_table done for table {table_name}")
    except Exception as e:
        print(f"[ERROR] Failed at create_db_table: {e}")
        raise

def process_table_thread(table_name, target_schema=None):
    """Run ``process_table`` for one table and record the outcome.

    A sequential id is taken from the shared ``thread_id_count`` counter and
    used as a prefix in the log lines. On success the table name is appended
    to ``ok_tables``; on any exception the table name and the error text are
    appended to ``ko_tables`` and the exception is not propagated. The
    counter, both lists and the log output are touched only while holding
    ``mx``.

    Args:
        table_name: Table to process.
        target_schema: Accepted but not used by this function.
    """
    global thread_id_count

    with mx:
        thread_id_count += 1
        worker_id = thread_id_count
        print(f"[{worker_id}] table: {table_name} ... ")

    try:
        process_table(table_name, df_table_info_ext)
        with mx:
            print(f"[{worker_id}] table: {table_name} ... OK")
            ok_tables.append(table_name)
    except Exception as e:
        err = str(e)
        with mx:
            print( f"[{worker_id}] table: {table_name} ... ERROR -> {err}" )
            ko_tables.append({'table': table_name, 'error': err})

## Execution Flow

1. **Load configuration**  
   - Read `table_info.csv` from the landing zone.  
   - Join with extra CSV for column selection (optional).

2. **Start threaded processing**  
   - Initialize thread pool with `n_threads`.  
   - Process each table with `process_table_thread`.

3. **Collect results**  
   - Track successful and failed tables.  
   - Print JSON summary.

4. **Exit or raise error**  
   - If all tables succeed → return JSON.  
   - If any fail → raise exception with details.

In [None]:
# Landing-zone root, once as a local-style path and once as a Spark URI.
topic_path = "/dbfs/mnt/landing/path/to"
topic_path_spark = "dbfs:/mnt/landing/path/to"

table_info_path = f"{topic_path}/{table_info_file}"
table_info_path_spark = f"{topic_path_spark}/{table_info_file}"

print( f"table info file path: '{table_info_path}'" )

# Table list produced upstream, de-duplicated.
df_table_info = (
    spark.read
    .option("header", True)
    .option("infer_schema", False)
    .csv(table_info_path_spark)
    .distinct()
)

# Extra CSV carrying the per-table column selection.
df_from_csv = (
    spark.read.format("csv")
    .option("header", "true")
    .option("encoding", "utf-8")
    .load("file:/path/to/dataverse_table_info.csv")
    .select("table_name", "columns")
)

# Left join: tables without an entry in the extra CSV get a null `columns`.
df_table_info_ext = df_table_info.join(df_from_csv, on="table_name", how="left")

In [None]:
# State shared with process_table_thread, which touches it only while
# holding `mx`.
thread_id_count = 0
ok_tables = list()
ko_tables = list()
mx = threading.Lock()

# Whether the optional column exists is a property of the DataFrame, so it is
# checked once here rather than once per row.
has_custom_target_schema = 'custom_target_schema' in df_table_info_ext.columns

# The context manager releases the worker threads when the block ends.
# `map` blocks until every table has been processed, so no work is cut short.
with ThreadPool(n_threads) as pool:
    _ = pool.map(
        lambda row: process_table_thread(
            row['table_name'],
            row['custom_target_schema'] if has_custom_target_schema else None,
        ),
        df_table_info_ext.collect(),
    )

In [None]:
# Summary of the run, built from the lists filled by the worker threads.
has_failures = len(ko_tables) > 0
to_return = {
    'status': 'error' if has_failures else 'success',
    'ok_count': len(ok_tables),
    'ko_count': len(ko_tables),
    'ok_tables': ok_tables,
    'ko_tables': ko_tables,
    'status_detail': 'failed tables during the loading process!' if has_failures else '',
}

print( json.dumps(to_return, indent=4) )

if to_return['status'] == 'success':
    # The exit value is a string: serialize the summary so the caller gets
    # JSON it can parse, as the execution-flow notes describe.
    dbutils.notebook.exit( json.dumps(to_return) )
else:
    # Failing the notebook lets the calling pipeline see the run as failed;
    # the summary travels in the exception.
    raise Exception( to_return )