In [0]:
# Importação de bibliotecas:

from pyspark.sql.functions import current_timestamp, expr
from pyspark.sql.types import StructType, StructField, StringType


In [0]:
# Expected schema of the raw Bronze CSV: every column is declared as a
# nullable string. Column order below is the order of the schema fields.
schema_bronze = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "id_envio",
        "corredor_de_armazenagem",
        "metodo_de_envio",
        "ligações_do_cliente",
        "avaliação_do_cliente",
        "preço",
        "qtd_itens",
        "importancia",
        "genero",
        "desconto",
        "peso_g",
        "Chegou_no_tempo",
        "Destino",
        "DataEnvio",
        "dataEntrega",
        "avaliacaoEntrega",
    )
])

In [0]:
# Load the raw data from the Amazon S3 bucket.
# The file is a ';'-separated CSV with a header row. The explicit Bronze schema
# (schema_bronze, all string columns) is applied instead of letting Spark infer
# types. The previous code passed an undefined name `schema`, which raises
# NameError on a fresh kernel.

df_bronze = spark.read.csv(
    "s3a://grao-direto-mmk/raw/grain_logistic_shipping.csv",
    header=True,
    sep=";",
    schema=schema_bronze,
)


In [0]:
# Report whether the schema of the loaded DataFrame equals the expected
# Bronze schema; this only prints a message and does not stop the notebook.
print(
    "Schema da Bronze OK!"
    if df_bronze.schema == schema_bronze
    else "Schema da Bronze NÃO bate com o esperado!"
)

In [0]:
# Add the load date/time column: the current timestamp shifted back by three
# hours via the SQL expression below.
# NOTE(review): the fixed 3-hour shift presumably converts UTC to UTC-3 — confirm.
df_bronze = df_bronze.withColumn(
    "data_hora_carga",
    expr("current_timestamp() - INTERVAL 3 HOURS"),
)

# Show the resulting schema, which now includes the load-timestamp column.
print("Schema da Bronze:")
df_bronze.printSchema()

In [0]:
# (Optional) Save the DataFrame to the bronze layer on S3 as a Delta dataset.
# NOTE: enabling this performs one extra save on top of the table write.
# The snippet is wrapped in a string literal, so it is not executed as written;
# remove the surrounding triple quotes to enable it.

'''
df_bronze.write \
    .format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .save("s3a://grao-direto-mmk/bronze/grain_logistic_shipping")
'''

In [0]:
# Create the Bronze database (if it does not exist yet) and write the Bronze
# Delta table in the Databricks workspace.

# saveAsTable does not create the target database, so make sure it exists
# first; IF NOT EXISTS keeps this a no-op when the notebook is re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS bronze")

# Overwrite the table contents on every run, with Delta schema merging enabled.
(
    df_bronze.write.format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
    .saveAsTable("bronze.grain_logistic_shipping")
)


In [0]:
%sql

-- Test query: read back all columns and rows from the Bronze table

SELECT * FROM bronze.grain_logistic_shipping
