## Read Me
### This notebook reads files from an S3 location and saves the file contents into the ingestion layer

## Importing Required Packages

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
import logging
import time
import datetime

## Getting helper functions from helper notebook

In [0]:
%run ./helper_notebook

## Defining variables to be used in the loops below

In [0]:
# --- Source side -------------------------------------------------------------
# Name of the S3 storage the source files are read from.
storage_location_name = "merkle-de-interview-case-study"

# Source files to ingest. The list is hard-coded because dbutils.fs.ls cannot
# be run against the storage location (no access).
file_list = [
    "item.csv",
    "event.csv",
]

# --- Target side -------------------------------------------------------------
# Metastore name.
location_name = "hive_metastore"

# Schema the tables are meant to be saved into.
schema_name = "bronze_layer"

# External file location name; used later as the account part of the
# "<name>.blob.core.windows.net" host.
external_storage = "merkletaskstorage"


## Mapping source file names in the storage location to dataframe names

In [0]:
# Map every source file name to the dataframe name used by the write step:
# the text before the first "." followed by the "dfraw" suffix
# (e.g. "item.csv" -> "itemdfraw").
df_file_dict = {
    source_file: source_file.partition(".")[0] + "dfraw"
    for source_file in file_list
}

print(df_file_dict)

## Reading source files into dataframes and saving them as external tables in Azure Blob Storage

In [0]:
# Ingestion loop: for every (source file, dataframe name) pair, read the csv,
# cast all columns to string, mount the matching blob container, append the
# data as a csv-backed table and unmount again.

# The credential is identical for every source file, so fetch it once up front.
# NOTE(review): the secret key is named "...Access_KEY" but it is applied as a
# SAS token below - confirm which kind of credential the secret holds.
blob_credential = dbutils.secrets.get(scope = "BlobStorage4", key = "BLB_Strg_Access_KEY")

for key,value in df_file_dict.items():
    print(key,value)

    # Read the csv file and cast every column to string.
    raw_df = cast_to_string(read_csv_file(storage_location_name,value,[key]))
    # Keep publishing the dataframe under its generated name (e.g. itemdfraw)
    # so that any code referring to it by that name keeps working.
    vars()[value] = raw_df

    # Print the loaded size for debugging (count() triggers a full read).
    shape_df = (raw_df.count(),len(raw_df.columns))
    print('{} files read from s3 bucket completed. Total {} rows & columns loaded into dataframe for {} first layer schema write'.format("First_layer_pipeline",shape_df,value))

    # One container and one mount point per source table.
    sas_conf_key = f"fs.azure.sas.{value}.{external_storage}.blob.core.windows.net"
    mount_point = f"/mnt/{value}_raw_2"

    # Switch the session to SAS authentication for this container.
    spark.conf.set(sas_conf_key,blob_credential)

    # A previously failed run can leave the mount point behind, and
    # dbutils.fs.mount raises when the mount point already exists, so clear
    # any stale mount first to keep the cell re-runnable.
    if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
        dbutils.fs.unmount(mount_point)

    dbutils.fs.mount(
        source = f"wasbs://{value}@{external_storage}.blob.core.windows.net/",
        mount_point = mount_point,
        extra_configs = {sas_conf_key: blob_credential}
    )
    try:
        # Write the dataframe to the external Azure storage in csv format.
        # NOTE(review): the table is saved under its bare name; schema_name and
        # location_name defined earlier are not applied here - confirm intent.
        # NOTE(review): the table path is the mount point, which is unmounted
        # right after the write - confirm readers do not need the mount.
        raw_df.write.format("csv").mode("append").option("mergeSchema", "true").option("header", "true").option("path", mount_point).saveAsTable(f"{value}")
    finally:
        # Always release the mount, even when the write fails, so a failure
        # does not leak the mount point into the next run.
        dbutils.fs.unmount(mount_point)
    