# Read CSV File & Write it to Bronze Layer

## load cust_info file

In [0]:
from pyspark.sql.functions import current_timestamp

# Ingest the CRM customer-info CSV into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data instead of reading everything as string.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_crp/cust_info.csv')
)

# withColumn returns a NEW DataFrame (DataFrames are immutable), so the result
# must be re-assigned; otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format, which
# otherwise rejects a schema change on overwrite).
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable("workspace.bronze.crm_cust_info")
)


In [0]:
%sql
-- Cleanup helper: every DROP statement below is commented out, so running
-- this cell as-is drops nothing. Uncomment a line to drop that bronze table.
-- NOTE(review): the erp_* names listed here (erp_cust_az12, erp_loc_a101,
-- erp_px_cat_g1v2) differ from the tables the workspace loader cells write
-- (workspace.bronze.erp_cust / erp_loc / erp_cat) — confirm which are intended.
--DROP TABLE IF EXISTS workspace.bronze.crm_cust_info;
--DROP TABLE IF EXISTS workspace.bronze.crm_prd_info;
--DROP TABLE IF EXISTS workspace.bronze.crm_sales_details;
--DROP TABLE IF EXISTS workspace.bronze.erp_cust_az12;
--DROP TABLE IF EXISTS workspace.bronze.erp_loc_a101;
--DROP TABLE IF EXISTS workspace.bronze.erp_px_cat_g1v2;

## load prd_info file

In [0]:
# Ingest the CRM product-info CSV into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_crp/prd_info.csv')
)

# withColumn returns a NEW DataFrame, so the result must be re-assigned;
# otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format).
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable("workspace.bronze.crm_prd_info")
)

## load sales_details file

In [0]:
# Ingest the CRM sales-details CSV into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_crp/sales_details.csv')
)

# withColumn returns a NEW DataFrame, so the result must be re-assigned;
# otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format).
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable("workspace.bronze.crm_sales_details")
)

## load erp cust info

In [0]:
# Ingest the ERP customer CSV (CUST_AZ12) into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_erp/CUST_AZ12.csv')
)

# withColumn returns a NEW DataFrame, so the result must be re-assigned;
# otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format).
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable('workspace.bronze.erp_cust')
)

## load cust location details

In [0]:
# Ingest the ERP customer-location CSV (LOC_A101) into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_erp/LOC_A101.csv')
)

# withColumn returns a NEW DataFrame, so the result must be re-assigned;
# otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format).
(
    df.write
      .mode("overwrite")
      .option("overwriteSchema", "true")
      .saveAsTable('workspace.bronze.erp_loc')
)

## load category details

In [0]:
# Ingest the ERP product-category CSV (PX_CAT_G1V2) into the bronze table.
# header: first CSV row supplies the column names; inferSchema: Spark derives
# the column types from the data.
df = (
    spark.read
         .option('header', 'True')
         .option('inferSchema', 'True')
         .csv('/Volumes/workspace/bronze/source_system_raw/source_erp/PX_CAT_G1V2.csv')
)

# withColumn returns a NEW DataFrame, so the result must be re-assigned;
# otherwise _ingest_time never reaches the table.
df = df.withColumn("_ingest_time", current_timestamp())

# overwriteSchema allows the overwrite to replace a table that was previously
# written without _ingest_time (assumes the default Delta table format).
(
    df.write
      .mode('overwrite')
      .option("overwriteSchema", "true")
      .saveAsTable('workspace.bronze.erp_cat')
)


# Improved code: config-driven ingestion into dev_project

In [0]:
# One row per bronze source file: (source system, CSV path, target table name).
_BRONZE_SOURCES = (
    ("crm", "/Volumes/dev_project/bronze/source_system/source_crp/cust_info.csv", "crm_cust_info"),
    ("crm", "/Volumes/dev_project/bronze/source_system/source_crp/prd_info.csv", "crm_prd_info"),
    ("crm", "/Volumes/dev_project/bronze/source_system/source_crp/sales_details.csv", "crm_sales_details"),
    ("erp", "/Volumes/dev_project/bronze/source_system/source_erp/CUST_AZ12.csv", "erp_cust_az12"),
    ("erp", "/Volumes/dev_project/bronze/source_system/source_erp/LOC_A101.csv", "erp_loc_a101"),
    ("erp", "/Volumes/dev_project/bronze/source_system/source_erp/PX_CAT_G1V2.csv", "erp_px_cat_g1v2"),
)

# Ingestion config: a list of dicts, each with the keys "source", "path" and
# "table", in the same order as the rows above.
INGESTION_CONFIG = [
    {"source": source_system, "path": csv_path, "table": table_name}
    for source_system, csv_path, table_name in _BRONZE_SOURCES
]


In [0]:
from pyspark.sql.functions import current_timestamp

# Config-driven bronze ingestion: for every entry in INGESTION_CONFIG, read the
# CSV at item["path"] and overwrite the table dev_project.bronze.<item["table"]>.
for item in INGESTION_CONFIG:

    # Build the fully qualified table name once so the log line and the write
    # target can never drift apart.
    target_table = f"dev_project.bronze.{item['table']}"

    # The arrow was previously stored as mojibake ("â†’", a UTF-8 "→" decoded
    # with the wrong codec); restored to the intended character.
    print(f"Ingesting {item['source']} → {target_table}")

    # header: first CSV row supplies the column names; inferSchema: Spark
    # derives the column types from the data.
    df = (
        spark.read
             .option("header", True)
             .option("inferSchema", True)
             .csv(item["path"])
    )

    # Add ingestion metadata (withColumn returns a new DataFrame, hence the
    # re-assignment).
    df = df.withColumn("_ingest_time", current_timestamp())

    # Full refresh: replace both the data and the schema of the target table.
    (
        df.write
          .mode("overwrite")
          .format("delta")
          .option("overwriteSchema", "true")
          .saveAsTable(target_table)
    )
     

In [0]:
%sql
-- Preview all columns and rows of the CRM customer-info bronze table.
SELECT *
FROM dev_project.bronze.crm_cust_info