## Read Me
### This notebook reads files from an S3 location and saves their contents to the ingestion-layer storage as external tables

## Importing Required Packages

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
import logging
import time
import datetime

## Getting Helper Functions from Helper Notebook

In [0]:
%run ./helper_notebook

## Defining Variables Used in the Loops Below
### The variables could also be read from the helper notebook, but they are hard-coded here for readability

In [0]:
# --- Source side (S3) ---
# Name of the S3 storage the files are read from.
storage_location_name = "merkle-de-interview-case-study"
# File names are listed by hand: dbutils.fs.ls cannot be run against the
# storage location because of missing access.
file_list = ["item.csv", "event.csv"]

# --- Target side (metastore / schema) ---
location_name = "hive_metastore"  # metastore name
schema_name = "bronze_layer"      # schema the tables are to be saved into

# --- Target side (Azure Blob Storage) ---
external_storage = "merkletaskstorage"  # Azure storage account holding the external files
sas_key_scope = "BlobStorage4"          # secret scope
sas_key_name = "BLB_Strg_Access_KEY"    # secret key name


## Mapping Source File Names to Their Designated DataFrame Names in a Dict

In [0]:
# Lookup used by the write step: source file name -> DataFrame name.
# The DataFrame name is the text before the first "." followed by "dfraw".
df_file_dict = {src_file: src_file.split(".")[0] + "dfraw" for src_file in file_list}

print(df_file_dict)

## Reading Source Files into DataFrames and Saving Them as External Tables in Azure Blob Storage

In [0]:
# For every (source file, DataFrame name) pair: read the CSV, cast all columns to string,
# make sure the matching Azure Blob container is mounted, then write the data to that
# mount as a CSV-backed table.
for source_name, df_name in df_file_dict.items():
    # read_csv_file / cast_to_string are expected to come from the helper notebook
    # loaded via %run (their implementation is not visible in this notebook).
    raw_df = cast_to_string(read_csv_file(storage_location_name, df_name, [source_name]))

    # Keep exposing the DataFrame as a notebook-level variable named after df_name,
    # exactly as before, so any later cell referring to that name still works.
    # globals() is the explicit form of what vars() resolves to at notebook top level.
    globals()[df_name] = raw_df

    # Debug output: (row count, column count). Note that count() triggers a Spark job.
    shape_df = (raw_df.count(), len(raw_df.columns))
    print('{} files read from s3 bucket completed. Total {} rows & columns loaded into dataframe for {} bronze layer schema write'.format("Bronze_layer_pipeline", shape_df, df_name))

    # Create the mount point, or reuse it when it is already present.
    mount_point = f"/mnt/{df_name}_raw_2"
    already_mounted = any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts())
    if not already_mounted:
        # Build the config key and fetch the secret once, then reuse both for the
        # session config and for the mount itself.
        sas_conf_key = f"fs.azure.sas.{df_name}.{external_storage}.blob.core.windows.net"
        sas_token = dbutils.secrets.get(scope=sas_key_scope, key=sas_key_name)
        spark.conf.set(sas_conf_key, sas_token)

        dbutils.fs.mount(
            # df_name is used as the container part of the wasbs URL.
            source=f"wasbs://{df_name}@{external_storage}.blob.core.windows.net/",
            mount_point=mount_point,
            extra_configs={sas_conf_key: sas_token},
        )

    # Write the DataFrame to the mounted Azure storage as pipe-delimited CSV with a header
    # row, registered as a table named after df_name whose data lives at mount_point.
    # NOTE(review): schema_name / location_name are defined in the config cell but are not
    # used here, so the table is saved under an unqualified name (current schema) —
    # confirm whether it should be qualified with the bronze-layer schema.
    # NOTE(review): "mergeSchema" looks like a Parquet/Delta option and is presumably
    # ignored by the CSV writer; an empty "quote" presumably disables quoting — confirm.
    (
        raw_df.write.format("csv")
        .mode("overwrite")
        .option("mergeSchema", "true")
        .option("quote", "")
        .option("header", "true")
        .options(delimiter='|')
        .option("path", mount_point)
        .saveAsTable(df_name)
    )
    