In [0]:
# Source locations of the two raw inputs that the bronze load cells read.
# NOTE(review): these values look like three-level identifiers
# (catalog.schema.table) rather than DBFS file paths, yet the load cells pass
# them to spark.read.csv(), which expects a file path — confirm which is intended.
products_csv_path = "big_data_project.bronze.amazon_products"
# Previously identical to products_csv_path (copy-paste), which made the
# categories load read the products source with the two-column categories schema.
# NOTE(review): the name "amazon_categories" is assumed — verify the real location.
categories_csv_path = "big_data_project.bronze.amazon_categories"

# Names of the bronze / silver / gold databases (schemas) used by this notebook.
# NOTE(review): the %sql setup cell creates its schemas from ${var.*_schema_name}
# placeholders; confirm those resolve to the same names as the constants below.
bronze_db_name = "bronze_db"
silver_db_name = "silver_db"
gold_db_name = "gold_db"

In [0]:
%sql
-- Namespace setup: create the project catalog, switch to it, then create the
-- bronze, silver and gold schemas inside it. The CREATE statements all use
-- IF NOT EXISTS, so they do not fail when the objects already exist.
-- NOTE(review): the ${var.*} placeholders are not defined in any cell visible
-- here — confirm where catalog_name and the *_schema_name values are supplied.
-- NOTE(review): the Python cells write to bronze_db_name ("bronze_db"); verify
-- that ${var.bronze_schema_name} resolves to the same schema name.

-- Create your new Catalog (the main project folder)
CREATE CATALOG IF NOT EXISTS ${var.catalog_name};

-- Select your catalog so all new schemas are created inside it
USE CATALOG ${var.catalog_name};

-- Create your three schemas (databases) inside the catalog
CREATE SCHEMA IF NOT EXISTS ${var.bronze_schema_name};
CREATE SCHEMA IF NOT EXISTS ${var.silver_schema_name};
CREATE SCHEMA IF NOT EXISTS ${var.gold_schema_name};

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, BooleanType

# Explicit schema applied when reading the products CSV.
# Eleven columns, each declared nullable.
products_schema = (
    StructType()
    .add("asin", StringType(), True)
    .add("title", StringType(), True)
    .add("imgUrl", StringType(), True)
    .add("productURL", StringType(), True)
    .add("stars", DoubleType(), True)
    .add("reviews", IntegerType(), True)
    .add("price", DoubleType(), True)
    .add("listPrice", DoubleType(), True)
    .add("category_id", IntegerType(), True)
    .add("isBestSeller", BooleanType(), True)
    .add("boughtInLastMonth", IntegerType(), True)
)

# Explicit schema applied when reading the categories CSV:
# an integer id plus a string name, both nullable.
categories_schema = (
    StructType()
    .add("category_id", IntegerType(), True)
    .add("category_name", StringType(), True)
)

In [0]:
# Load the products source as CSV: the first line is treated as a header and
# products_schema is applied instead of inferring column types.
products_df = (
    spark.read
    .schema(products_schema)
    .option("header", True)
    .csv(products_csv_path)
)

# Persist the frame as the Delta table "<bronze_db_name>.products",
# replacing any existing contents.
products_df.write.saveAsTable(
    f"{bronze_db_name}.products",
    format="delta",
    mode="overwrite",
)

# Preview the first 10 rows of the table that was just written.
display(spark.sql(f"SELECT * FROM {bronze_db_name}.products LIMIT 10"))

In [0]:
# Load the categories source as CSV: the first line is treated as a header and
# categories_schema is applied instead of inferring column types.
categories_df = (
    spark.read
    .schema(categories_schema)
    .option("header", True)
    .csv(categories_csv_path)
)

# Persist the frame as the Delta table "<bronze_db_name>.categories",
# replacing any existing contents.
categories_df.write.saveAsTable(
    f"{bronze_db_name}.categories",
    format="delta",
    mode="overwrite",
)

# Preview the first 10 rows of the table that was just written.
display(spark.sql(f"SELECT * FROM {bronze_db_name}.categories LIMIT 10"))