# NOTEBOOK 2/6: BRONZE TABLE

## 1. DEFINING CONFIGURATION AND READING DATA

In [0]:
# Notebook-wide settings: catalog name, per-layer schema names, and the
# fully qualified names of the tables this notebook refers to.
CATALOG = "healthcare_analytics"

# Schema names inside CATALOG.
SCHEMA_RAW = "raw_data"
SCHEMA_BRONZE = "bronze"
SCHEMA_SILVER = "silver"
SCHEMA_GOLD = "gold"
SCHEMA_ML = "ml_features"

# Three-part name (catalog.schema.table) of the bronze events table,
# assembled from the settings above.
BRONZE_TABLE = ".".join((CATALOG, SCHEMA_BRONZE, "bronze_events"))

# Schema-qualified name of the event log table.
EVENT_LOGS = "information_schema.event_logs"


In [0]:
# Load the source CSV into a DataFrame. The first line of the file is
# treated as the header row and column types are inferred from the data.
raw_df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("/Volumes/healthcare_analytics/raw_data/source_files/diabetic_data.csv")
)

In [0]:
# Render raw_df in the notebook output for a visual check of the rows that
# were just loaded (display is presumably the notebook-provided helper; it
# is not defined in this file).
display(raw_df)

## 2. SELECTING SCHEMA AND IMPORTING FUNCTIONS

In [0]:
# Make the configured catalog and bronze schema the session defaults.
# The names come from the CATALOG / SCHEMA_BRONZE settings defined in the
# configuration cell so this cell cannot drift from the rest of the notebook.
spark.sql(f"USE CATALOG {CATALOG}")
spark.sql(f"USE SCHEMA {SCHEMA_BRONZE}")

# Print the active catalog and schema to confirm the switch took effect.
spark.sql("SELECT current_catalog(), current_schema()").show()



In [0]:
from pyspark.sql.functions import current_timestamp, input_file_name, lit
from delta.tables import DeltaTable
from datetime import datetime

## 3. CREATING AND WRITING BRONZE TABLE

In [0]:
# Add ingestion metadata to every raw row:
#   ingestion_timestamp - Spark's current_timestamp() column expression
#   ingestion_batch_id  - a constant string (YYYYMMDD_HHMMSS) taken from the
#                         Python clock at the moment this cell runs, so all
#                         rows of one run share the same id
bronze_df = (
    raw_df
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("ingestion_batch_id", lit(f"{datetime.now():%Y%m%d_%H%M%S}"))
)

In [0]:
# Target table for the bronze layer. Taken from the BRONZE_TABLE setting in
# the configuration cell instead of repeating the literal name; the
# lower-case name is kept so any later cell that refers to it still works.
bronze_table = BRONZE_TABLE

# Write bronze_df as a Delta table.
#  - mode("overwrite") + overwriteSchema: each run replaces the existing
#    table contents and allows the table schema to be replaced as well.
#  - delta.enableChangeDataFeed: requests the change data feed property for
#    the table. NOTE(review): confirm the property is actually applied when
#    passed as a writer option on an already existing table.
(
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(bronze_table)
)

print("Bronze table created successfully!")

In [0]:
%sql
-- Read sample data: preview up to 20 rows of the bronze events table.
-- NOTE(review): there is no ORDER BY, so which 20 rows are returned is not
-- guaranteed to be the same between runs.
SELECT * FROM healthcare_analytics.bronze.bronze_events
LIMIT 20

In [0]:
# Run the OPTIMIZE command on the bronze table. The table name comes from
# the BRONZE_TABLE setting in the configuration cell rather than a repeated
# literal, so it always targets the table this notebook writes.
spark.sql(f"OPTIMIZE {BRONZE_TABLE}")
