In [2]:
#
#  -- Learn Fabric Spark --
#
#  lesson 4 - dynamic full load
#    - rebuild tables
#    - show sample rows?
#    - show sample data
#


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 4, Finished, Available)

In [3]:

# -- notebook parameters --

# source: folder below 'Files/' and the layout of the CSV files found there
var_path = 'raw/saleslt'
var_delimiter = '|'
var_header = 'false'

# optional schema handed to the csv reader; blank means load without one
var_schema = ""

# target: base name used to build the tmp_ / bronze_ / silver_ object names
var_table = 'dim_currency'


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 5, Finished, Available)

In [4]:
#
#  F1 - remove existing tables
#

# Drop the bronze table first, then the silver table, so both can be
# rebuilt from scratch by the cells that follow.
for layer in ('bronze', 'silver'):
    stmt = f'drop table if exists {layer}_{var_table}_full;'
    ret = spark.sql(stmt)


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 6, Finished, Available)

In [5]:
#
#  F2 - load data frame
#
#  Reads every CSV file found (recursively) under 'Files/<var_path>' into a
#  data frame and registers it as the temporary view 'tmp_<var_table>_full'.
#
#  Reads  : var_path, var_table, var_delimiter, var_header, var_schema
#  Writes : path, table, schema_flag, df
#

# root path
path = 'Files/' + var_path

# tmp table
# NOTE(review): the bronze cell builds this view name without strip(), so
# var_table is expected to carry no surrounding whitespace.
table = 'tmp_' + var_table.strip() + '_full'

# schema flag - False when no schema was supplied (None or a blank string).
# Any other value is handed to the reader unchanged.
schema_flag = not (
    var_schema is None
    or (isinstance(var_schema, str) and not var_schema.strip())
)

# reader settings shared by both load variants
reader = spark.read.format("csv") \
    .option("header", var_header) \
    .option("delimiter", var_delimiter) \
    .option("recursiveFileLookup", "true")

# apply the schema only when one was supplied
if schema_flag:
    reader = reader.schema(var_schema)

# load all files
df = reader.load(path)

# convert to view
df.createOrReplaceTempView(table)

# debugging
print(f"data lake path - {path}")
print(f"temporary view - {table}")


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 7, Finished, Available)

data lake path - Files/raw/saleslt
temporary view - tmp_dim_currency_full


In [6]:
#
#  F3 - create bronze table (all files)
#
#  Materialises the temporary view tmp_<var_table>_full as the table
#  bronze_<var_table>_full, adding three audit columns:
#    _load_date   - current_timestamp() at the time the statement runs
#    _folder_name - 9th '/'-separated part of input_file_name()
#    _file_name   - 10th '/'-separated part of input_file_name(), keeping
#                   only the text before the first '?'
#
#  The statement is a plain "create table", so it expects the table not to
#  exist yet (the F1 cell drops it).
#
  
# spark sql - assume 1 level nesting on dir
# NOTE(review): positions 9 and 10 are hard coded; they presumably match the
# depth of the current lakehouse path plus var_path - confirm before reusing
# this cell with a different var_path.
stmt = f"""
  create table bronze_{var_table}_full as
  select 
    *, 
    current_timestamp() as _load_date,
    split_part(input_file_name(), '/', 9) as _folder_name,
    split_part(split_part(input_file_name(), '/', 10), '?', 1) as _file_name
  from 
    tmp_{var_table}_full 
"""

# create table
ret = spark.sql(stmt)

# debugging - echo the statement that was just executed
print(f"execute spark sql - \n {stmt}")


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 8, Finished, Available)

execute spark sql - 
 
  create table bronze_dim_currency_full as
  select 
    *, 
    current_timestamp() as _load_date,
    split_part(input_file_name(), '/', 9) as _folder_name,
    split_part(split_part(input_file_name(), '/', 10), '?', 1) as _file_name
  from 
    tmp_dim_currency_full 



In [7]:
#
#  F4 - create silver table (latest file)
#
#  Copies into silver_<var_table>_full only those bronze rows whose
#  _folder_name equals the maximum _folder_name found in
#  bronze_<var_table>_full.
#
#  NOTE(review): presumably folder names sort chronologically (for example
#  date stamped), so the maximum is the most recent load - confirm.
#
  
# spark sql - the cte keeps the rows of the max folder; the outer select
# returns them unchanged
stmt = f"""
  create table silver_{var_table}_full as
  with cte_{var_table} as
  (
    select * 
    from bronze_{var_table}_full as l
    where l._folder_name = (select max(_folder_name) from bronze_{var_table}_full)
  )
  select 
    *
  from 
    cte_{var_table}
"""

# create table
ret = spark.sql(stmt)

# debugging - echo the statement that was just executed
print(f"execute spark sql - \n {stmt}")


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 9, Finished, Available)

execute spark sql - 
 
  create table silver_dim_currency_full as
  with cte_dim_currency as
  (
    select * 
    from bronze_dim_currency_full as l
    where l._folder_name = (select max(_folder_name) from bronze_dim_currency_full)
  )
  select 
    *
  from 
    cte_dim_currency



In [8]:
#
#  Grab bronze count
#
#  Writes : sql_stmt, bronze_recs (row count, or 0 when the query fails)
#

# Best effort: a failing query (for example a missing table) is reported
# and the count falls back to 0 instead of stopping the notebook.
try:
    sql_stmt = f"select count(*) from bronze_{var_table}_full"
    bronze_recs = spark.sql(sql_stmt).first()[0]
except Exception as err:
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit still
    # propagate, and the reason for the fallback is visible in the output.
    print(f"Unable to count bronze_{var_table}_full - {err}")
    bronze_recs = 0

# show values
print(f"The bronze_{var_table}_full record count is {bronze_recs}")  


StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 10, Finished, Available)

The bronze_dim_currency_full record count is 1458224


In [9]:
#
#  Grab silver count
#
#  Writes : sql_stmt, silver_recs (row count, or 0 when the query fails)
#

# Best effort: a failing query (for example a missing table) is reported
# and the count falls back to 0 instead of stopping the notebook.
try:
    sql_stmt = f"select count(*) from silver_{var_table}_full"
    silver_recs = spark.sql(sql_stmt).first()[0]
except Exception as err:
    # Exception (not a bare except) so KeyboardInterrupt / SystemExit still
    # propagate, and the reason for the fallback is visible in the output.
    print(f"Unable to count silver_{var_table}_full - {err}")
    silver_recs = 0

# show values
print(f"The silver_{var_table}_full record count is {silver_recs}")  

StatementMeta(, 0aa7c35b-d609-41b7-98ab-4ed8307a390d, 11, Finished, Available)

The silver_dim_currency_full record count is 64515
