In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import (
    concat_ws,
    md5,
    col,
    current_date,
    lit,
    current_timestamp,
    when,
    current_user,
)
from pyspark.sql.types import TimestampType

In [0]:
%run ./CreateOrReplaceTables

In [0]:
%run ./SCD_Strategy

In [0]:
def get_mnt(blobContainerName):
    """Return the mount point path associated with a blob container name.

    Args:
        blobContainerName: One of "landing", "bronze", "silver" or "gold".

    Returns:
        The mount point path (a string starting with "/mnt/").

    Raises:
        ValueError: If ``blobContainerName`` is not one of the known
            containers. Previously this case surfaced as an
            ``UnboundLocalError`` from the ``return`` statement.
    """
    mount_points = {
        "landing": "/mnt/mithiadls",
        "bronze": "/mnt/mithiadls_bronze",
        "silver": "/mnt/mithiadls_silver",
        "gold": "/mnt/mithiadls_gold",
    }
    try:
        return mount_points[blobContainerName]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments so every invalid input is
        # reported the same way.
        raise ValueError(
            f"Unknown blob container {blobContainerName!r}; "
            f"expected one of {sorted(mount_points)}"
        ) from None

In [0]:
def generate_clean_file(mountPoint,entity_name,df,ingestion_layer):
    """Write ``df`` as one parquet file at a fixed, predictable path.

    The frame is coalesced to a single partition and written (overwrite mode)
    to the staging directory
    ``{mountPoint}/{entity_name}/{ingestion_layer}_raw_{entity_name}``.
    The first listed entry whose path ends in ``.parquet`` is then copied to
    ``{mountPoint}/{entity_name}/{entity_name}.snappy.parquet`` and the
    staging directory is removed recursively.

    Relies on the notebook-provided ``dbutils`` global for file operations.

    Args:
        mountPoint: Base path under which the entity folder lives.
        entity_name: Entity name; used for the folder and the output file name.
        df: DataFrame to write.
        ingestion_layer: Layer name, used only to build the staging directory name.

    Returns:
        None.
    """
    staging_dir = f"{mountPoint}/{entity_name}/{ingestion_layer}_raw_{entity_name}"
    df.coalesce(1).write.mode("overwrite").parquet(staging_dir)

    # Default to "" so an empty listing (or one without any .parquet entry)
    # skips the copy instead of failing on an unassigned variable.
    parquet_file_path = next(
        (
            entry.path
            for entry in dbutils.fs.ls(staging_dir)
            if entry.path.endswith(".parquet")
        ),
        "",
    )

    if parquet_file_path != "":
        dbutils.fs.cp(parquet_file_path,f"{mountPoint}/{entity_name}/{entity_name}.snappy.parquet")
    # The staging directory is always cleaned up, whether or not a file was copied.
    dbutils.fs.rm(staging_dir,True)
        

In [0]:
def get_catalog_name(entity_name,ingestion_layer):
    """Build the schema-style name for an entity within an ingestion layer.

    Args:
        entity_name: Entity name; a trailing ``_incremental`` selects the
            incremental variant.
        ingestion_layer: Layer prefix placed in front of the suffix.

    Returns:
        ``"<ingestion_layer>_incremental_schema"`` when ``entity_name`` ends
        with ``_incremental``, otherwise ``"<ingestion_layer>_schema"``.
    """
    if entity_name.endswith('_incremental'):
        suffix = "_incremental_schema"
    else:
        suffix = "_schema"
    return ingestion_layer + suffix

In [0]:
# def add_audit_columns(df,ingestion_layer,entity_name):
#     catalog_name = get_catalog_name(entity_name,ingestion_layer)
#     is_update = spark.sql(f"show tables in {catalog_name}").select('tableName').filter(f"tableName = '{entity_name.split('_inc')[0]}'").count() == 1
#     if is_update:
#         df = df.withColumn("modified_by",current_user())
#         df = df.withColumn("modified_date",current_timestamp())
#     else:
#         df = df.withColumn("created_by",current_user())
#         df = df.withColumn("created_date",current_timestamp())
#         df = df.withColumn("modified_by",lit(None).cast(TimestampType()))
#         df = df.withColumn("modified_date",lit(None).cast(TimestampType()))
#     return df

In [0]:
# def insert_(mountPoint,entity_name, ingestion_layer):
#     ingestion_layer = ingestion_layer+"_dev"
#     spark.sql(
#     f"""CREATE or REPLACE TABLE {ingestion_layer}.{entity_name}
# USING DELTA
# LOCATION 'dbfs:{mountPoint}/{entity_name}/{entity_name}_delta'
# AS
# SELECT * FROM parquet.`dbfs:{mountPoint}/{entity_name}/{entity_name}.snappy.parquet`;
# """
# )

In [0]:
def return_gold_query(entity):
    """Return the SQL text that builds the gold view for a sales entity.

    Each query left-joins the entity's table in
    ``hive_metastore.silver_incremental_schema`` to its ``<entity>_products``
    table on ``Title``, ``Vintage`` and ``Variety``.

    Args:
        entity: Either ``"arancione"`` or ``"celeste"``.

    Returns:
        The SQL query as a string.

    Raises:
        ValueError: If ``entity`` is not a supported name. Previously this
            case surfaced as an ``UnboundLocalError`` from the ``return``.
    """
    if entity == "arancione":
        query = '''select
    a.ArancioneID, a.OnlineRetailer, a.SalesMonth, a.Title, a.Vintage, a.Variety, a.Score, a.ListPrice, a.Quantity,
    ap.ProductId, ap.Country, ap.DealerPrice, ap.Markup, ap.Province, ap.Region_1, ap.Region_2, ap.Winery, ap.Year

    from hive_metastore.silver_incremental_schema.arancione a left join hive_metastore.silver_incremental_schema.arancione_products ap on a.Title = ap.Title and a.Vintage = ap.Vintage and a.Variety = ap.Variety'''
    elif entity == "celeste":
        query = '''select
    c.TransactionId, c.TransactionDate, c.OnlineRetailer, c.SalesMonth, c.SalesRegion, c.SalesCurrency, c.Title, c.Vintage, c.Variety, c.Score, c.ListPrice, c.Quantity,
    cp.ProductId, cp.Country, cp.DealerPrice, cp.Markup, cp.Province, cp.Region_1, cp.Region_2, cp.Winery, cp.Year

    from hive_metastore.silver_incremental_schema.celeste c left join hive_metastore.silver_incremental_schema.celeste_products cp on c.Title = cp.Title and c.Vintage = cp.Vintage and c.Variety = cp.Variety'''
    else:
        # Fail with a clear message rather than an unbound-variable error.
        raise ValueError(
            f"No gold query defined for entity {entity!r}; "
            "expected 'arancione' or 'celeste'"
        )
    return query

In [0]:
def pk_map_bronze(entity_name):
    """Look up the primary-key column name for a bronze entity.

    Args:
        entity_name: One of "arancione", "celeste", "celeste_products" or
            "arancione_products".

    Returns:
        The primary-key column name as a string.

    Raises:
        KeyError: If ``entity_name`` has no entry in the table.
    """
    pk_by_entity = dict([
        ("arancione", "ArancioneID"),
        ("celeste", "TransactionId"),
        ("celeste_products", "ProductID"),
        ("arancione_products", "ProductID"),
    ])
    return pk_by_entity[entity_name]

def pk_map_gold(entity_name):
    """Look up the comma-separated key column list for a gold entity.

    Args:
        entity_name: Either "arancione" or "celeste".

    Returns:
        A string of key column names joined by commas.

    Raises:
        KeyError: If ``entity_name`` has no entry in the table.
    """
    pk_by_entity = dict([
        ("arancione", "ArancioneID,ProductID"),
        ("celeste", "TransactionId,ProductID"),
    ])
    return pk_by_entity[entity_name]