Create Schemas for tables

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from datetime import date

# Column layout of the customer table, assembled one field at a time.
# Every field keeps the default nullability, as with a bare StructField.
customer_schema = (
    T.StructType()
    .add("customer_id", T.IntegerType())
    .add("full_name", T.StringType())
    .add("address", T.StringType())
    .add("state", T.StringType())
    .add("country", T.StringType())
)

# Column layout of the order table, assembled one field at a time.
# customer_id carries the id of the customer that placed the order.
order_schema = (
    T.StructType()
    .add("order_id", T.IntegerType())
    .add("customer_id", T.IntegerType())
    .add("order_date", T.DateType())
    .add("amount", T.DoubleType())
)

# Schema for the order line items (one row per product within an order).
# unique_price is a DoubleType because the sample rows supply Python floats
# (e.g. 15.00); declaring it as IntegerType makes createDataFrame reject the
# rows during schema verification.
item_schema = T.StructType(
    [
        T.StructField("item_id", T.IntegerType()),
        T.StructField("order_id", T.IntegerType()),
        T.StructField("product_id", T.IntegerType()),
        T.StructField("unique_price", T.DoubleType()),
        T.StructField("quantity", T.IntegerType()),
    ]
)

# Schema for the product catalogue.
# price is a DoubleType because the sample rows supply Python floats
# (e.g. 15.00); declaring it as IntegerType makes createDataFrame reject the
# rows during schema verification.
product_schema = T.StructType(
    [
        T.StructField("product_id", T.IntegerType()),
        T.StructField("name", T.StringType()),
        T.StructField("price", T.DoubleType()),
        T.StructField("category", T.StringType()),
    ]
)

Datos

In [0]:
# Sample rows for each table. Tuple positions follow the field order of the
# matching schema.

# (customer_id, full_name, address, state, country)
customers = [
    (1, "John Smith", "123 Palm Ave", "Florida", "USA"),
    (2, "Alice Johnson", "456 Ocean Dr", "Florida", "USA"),
    (3, "Bob Brown", "789 Sunset Blvd", "California", "USA"),
    (4, "Maria Lopez", "321 Main St", "Texas", "USA"),
]

# (product_id, name, price, category)
products = [
    (1, "Toy Car", 15.0, "Toys"),
    (2, "Lego Set", 50.0, "Toys"),
    (3, "Doll", 25.0, "Toys"),
    (4, "Laptop", 1000.0, "Electronics"),
    (5, "Headphones", 200.0, "Electronics"),
    (6, "Book", 20.0, "Books"),
]

# (order_id, customer_id, order_date, amount)
orders = [
    (1, 1, date(2018, 3, 15), 100.0),
    (2, 1, date(2019, 7, 22), 200.0),
    (3, 2, date(2018, 11, 5), 300.0),
    (4, 2, date(2019, 2, 14), 150.0),
    (5, 3, date(2019, 5, 10), 500.0),
    (6, 4, date(2018, 8, 19), 250.0),
]

# (item_id, order_id, product_id, unique_price, quantity)
items = [
    (1, 1, 1, 15.0, 2),
    (2, 1, 6, 20.0, 1),
    (3, 2, 2, 50.0, 2),
    (4, 2, 3, 25.0, 1),
    (5, 3, 2, 50.0, 3),
    (6, 3, 4, 1000.0, 1),
    (7, 4, 1, 15.0, 5),
    (8, 4, 5, 200.0, 1),
    (9, 5, 4, 1000.0, 1),
    (10, 5, 6, 20.0, 2),
    (11, 6, 3, 25.0, 2),
    (12, 6, 6, 20.0, 3),
]

Create DataFrames

In [0]:
# Build one DataFrame per table from the sample rows and their schemas, and
# print each one untruncated as a quick sanity check.
df_customers = spark.createDataFrame(data=customers, schema=customer_schema)
df_customers.show(n=10, truncate=False)

df_orders = spark.createDataFrame(data=orders, schema=order_schema)
df_orders.show(n=10, truncate=False)

# items holds 12 rows, so ask for 12; a limit of 10 would hide the last two.
df_items = spark.createDataFrame(data=items, schema=item_schema)
df_items.show(n=12, truncate=False)

df_products = spark.createDataFrame(data=products, schema=product_schema)
df_products.show(n=10, truncate=False)

Registrar las tablas temporales (SQL opcional)

In [0]:
# Register every DataFrame as a temporary view so it can be queried by name
# from spark.sql(...).
for view_name, frame in (
    ("Customer", df_customers),
    ("Order", df_orders),
    ("Product", df_products),
    ("Item", df_items),
):
    frame.createOrReplaceTempView(view_name)

Crear tablas en el Metastore (schema y volumen bronze de Ansira)

In [0]:
# Root folder of the bronze volume and the qualified name of the customer table.
ansira_path_bronze = "/Volumes/workspace/ansira/bronze/"
bronze_table_customer = "ansira.customer"

# Save the customer rows as Delta files under the bronze volume.
customer_delta_path = ansira_path_bronze + "customer"
customer_writer = df_customers.write.format("delta").mode("overwrite")
customer_writer.save(customer_delta_path)

# Recreate the table from the Customer temp view.
spark.sql(f"DROP TABLE IF EXISTS {bronze_table_customer}")
spark.sql(f"CREATE TABLE {bronze_table_customer} AS SELECT * FROM Customer")

# Display the freshly created table.
customer_rows = spark.sql(f"SELECT * FROM {bronze_table_customer}")
customer_rows.show()

In [0]:
bronze_table_order = "ansira.order"

# Save the order rows as Delta files under the bronze volume.
# The source must be df_orders: writing df_customers here would store customer
# data under the "order" path.
df_orders.write.format("delta").mode("overwrite").save(ansira_path_bronze + "order")

# Recreate the table from the order temp view, then display it.
# NOTE(review): ORDER is a SQL keyword; if the workspace enforces ANSI reserved
# keywords, the view and table names may need back-ticks - confirm.
spark.sql(f"DROP TABLE IF EXISTS {bronze_table_order}")
spark.sql(f"CREATE TABLE {bronze_table_order} AS SELECT * FROM order")

spark.sql(f"SELECT * FROM {bronze_table_order}").show()

In [0]:
bronze_table_item = "ansira.item"

# Save the item rows as Delta files under the bronze volume.
# The source must be df_items: writing df_customers here would store customer
# data under the "item" path.
df_items.write.format("delta").mode("overwrite").save(ansira_path_bronze + "item")

# Recreate the table from the item temp view, then display it.
spark.sql(f"DROP TABLE IF EXISTS {bronze_table_item}")
spark.sql(f"CREATE TABLE {bronze_table_item} AS SELECT * FROM item")

spark.sql(f"SELECT * FROM {bronze_table_item}").show()

In [0]:
bronze_table_product = "ansira.product"

# Save the product rows as Delta files under the bronze volume.
# The source must be df_products: writing df_customers here would store
# customer data under the "product" path.
df_products.write.format("delta").mode("overwrite").save(ansira_path_bronze + "product")

# Recreate the table from the product temp view, then display it.
spark.sql(f"DROP TABLE IF EXISTS {bronze_table_product}")
spark.sql(f"CREATE TABLE {bronze_table_product} AS SELECT * FROM product")

spark.sql(f"SELECT * FROM {bronze_table_product}").show()