In [2]:
from data_processing import Sparkinit
from data_setup.configuracoes import formatar_sql
from pyspark.sql.types import StructType, StructField, TimestampType, LongType, StringType, DecimalType, BinaryType
# Instantiate the project's Spark bootstrap helper and fetch the session from it.
spark_start = Sparkinit()
spark = spark_start.buscar_sessao_spark()

# Print the address of the Spark web UI attached to this session's SparkContext.
print(
    f"WebUI SparkJobs: {spark.sparkContext.uiWebUrl}"
)

# Final expression of the cell: the active session is the cell's displayed value.
spark.getActiveSession()


WebUI SparkJobs: http://BHS-NOTE188:4040


# Vendas

In [5]:
import os
# Base directory of the transient layer. It defaults to the original local checkout
# location; set the DATALAKE_TRANSIENT_DIR environment variable to run this cell on
# another machine without editing the notebook.
TRANSIENT_DIR = os.environ.get(
    "DATALAKE_TRANSIENT_DIR",
    r"C:\Users\gustavo.lopes\Documentos\GitHub\desafio_panvel-data_engineer\datalake\transient",
)

# Load the VENDAS parquet dataset to inspect its column structure and data types.
dados = spark.read.format("parquet").load(os.path.abspath(os.path.join(TRANSIENT_DIR, "VENDAS")))

# Schema as inferred from the parquet metadata.
dados.printSchema()

# One sample record, printed vertically so every column stays readable.
dados.show(n=1, vertical=True)

# Last expression of the cell: the list of column names.
dados.columns

root
 |-- d_dt_vd: timestamp (nullable = true)
 |-- n_id_fil: long (nullable = true)
 |-- n_id_vd_fil: long (nullable = true)
 |-- v_cli_cod: string (nullable = true)
 |-- n_vlr_tot_vd: decimal(18,6) (nullable = true)
 |-- n_vlr_tot_desc: decimal(14,4) (nullable = true)
 |-- v_cpn_eml: string (nullable = true)
 |-- tp_pgt: string (nullable = true)

-RECORD 0------------------------------
 d_dt_vd        | 2023-10-12 21:00:00  
 n_id_fil       | 2356284              
 n_id_vd_fil    | 34366442231          
 v_cli_cod      | 016E6FCC4F98832719BC 
 n_vlr_tot_vd   | 55.960000            
 n_vlr_tot_desc | 13.9900              
 v_cpn_eml      | NAO                  
 tp_pgt         | A VISTA              
only showing top 1 row



['d_dt_vd',
 'n_id_fil',
 'n_id_vd_fil',
 'v_cli_cod',
 'n_vlr_tot_vd',
 'n_vlr_tot_desc',
 'v_cpn_eml',
 'tp_pgt']

In [6]:
# Register the loaded frame as a temporary view and run the query that will later be
# used to transform the raw layer.
dados.createOrReplaceTempView("vendas_tmp")

# Notes on the projection:
# - date_format() returns a string and already propagates NULL, so the previous
#   COALESCE(..., NULL) wrapper around it was a no-op and has been removed.
# - enviado_email is true only when v_cpn_eml is exactly 'SIM'; every other value,
#   NULL included, falls through to the ELSE branch and becomes false.
# - codigo_cliente and tipo_pagamento fall back to '' when the source value is NULL.
# - Monetary columns are cast to DECIMAL(38, 2), i.e. reduced to two decimal places.
query = formatar_sql("""SELECT 
    date_format(vt.d_dt_vd, 'yyyy-MM-dd HH:mm:ss') AS data_emissao,
    CAST(vt.n_id_fil AS BIGINT) AS codigo_filial,
    CAST(vt.n_id_vd_fil AS BIGINT) AS id_venda_filial,
    COALESCE(CAST(vt.v_cli_cod AS STRING), '') AS codigo_cliente,
    CAST(vt.n_vlr_tot_vd AS DECIMAL(38, 2)) AS valor_total_venda,
    CAST(vt.n_vlr_tot_desc AS DECIMAL(38, 2)) AS valor_total_desconto,
    CASE 
        WHEN vt.v_cpn_eml  = 'SIM' THEN True
        ELSE False
    END AS enviado_email,
    COALESCE(CAST(vt.tp_pgt AS STRING), '') AS tipo_pagamento
FROM vendas_tmp as vt""")

# Preview the transformed rows (show() prints the first 20 by default).
spark.sql(query).show()

+-------------------+-------------+---------------+--------------------+-----------------+--------------------+-------------+--------------+
|       data_emissao|codigo_filial|id_venda_filial|      codigo_cliente|valor_total_venda|valor_total_desconto|enviado_email|tipo_pagamento|
+-------------------+-------------+---------------+--------------------+-----------------+--------------------+-------------+--------------+
|2023-10-12 21:00:00|      2356284|    34366442231|016E6FCC4F98832719BC|            55.96|               13.99|        false|       A VISTA|
|2023-10-12 21:00:00|      2221184|    35550863931|035D148EADC74B6C6D2F|            31.48|               25.21|        false|              |
|2023-10-27 21:00:00|      2188984|    37392732531|030C1011214A3317E850|             6.49|               13.14|        false|       A VISTA|
|2023-10-08 21:00:00|      2608284|     3672652731|04710AFAF1FD9C48EBC3|            52.99|               40.27|        false|              |
|2023-10-09 2

# Pedidos

In [11]:
import os
# Base directory of the transient layer. It defaults to the original local checkout
# location; set the DATALAKE_TRANSIENT_DIR environment variable to run this cell on
# another machine without editing the notebook.
TRANSIENT_DIR = os.environ.get(
    "DATALAKE_TRANSIENT_DIR",
    r"C:\Users\gustavo.lopes\Documentos\GitHub\desafio_panvel-data_engineer\datalake\transient",
)

# Load the PEDIDOS parquet dataset to inspect its column structure and data types.
dados = spark.read.format("parquet").load(os.path.abspath(os.path.join(TRANSIENT_DIR, "PEDIDOS")))

# Schema as inferred from the parquet metadata.
dados.printSchema()

# One sample record, printed vertically so every column stays readable.
dados.show(n=1, vertical=True)

# Last expression of the cell: the list of column names.
dados.columns

root
 |-- n_id_pdd: long (nullable = true)
 |-- d_dt_eft_pdd: date (nullable = true)
 |-- d_dt_entr_pdd: timestamp (nullable = true)
 |-- v_cnl_orig_pdd: string (nullable = true)
 |-- v_uf_entr_pdd: string (nullable = true)
 |-- v_lc_ent_pdd: string (nullable = true)
 |-- n_vlr_tot_pdd: decimal(38,2) (nullable = true)

-RECORD 0-----------------------------
 n_id_pdd       | 1187021679777       
 d_dt_eft_pdd   | 2023-09-13          
 d_dt_entr_pdd  | 2023-09-13 21:49:15 
 v_cnl_orig_pdd | L                   
 v_uf_entr_pdd  | RS                  
 v_lc_ent_pdd   | VIAMAO              
 n_vlr_tot_pdd  | 19.99               
only showing top 1 row



['n_id_pdd',
 'd_dt_eft_pdd',
 'd_dt_entr_pdd',
 'v_cnl_orig_pdd',
 'v_uf_entr_pdd',
 'v_lc_ent_pdd',
 'n_vlr_tot_pdd']

In [13]:
# Register the loaded frame as a temporary view and run the query that will later be
# used to transform the raw layer.
dados.createOrReplaceTempView("pedidos_tmp")

# Notes on the projection:
# - The previous COALESCE(..., NULL) wrappers around the two date columns were no-ops
#   (coalescing with NULL returns the first argument unchanged) and have been removed.
# - data_entrega is produced by date_format(), so it is a formatted string.
# - canal_origem_pedido maps 'L'/'A'/'S' to 'Loja'/'App'/'Site'. The CASE has no ELSE,
#   so any other code yields NULL.
#   NOTE(review): confirm NULL is the intended result for unmapped codes, given that
#   the other string columns default to ''.
# - UF_pedido and destino_entrega fall back to '' when the source value is NULL.
query = formatar_sql("""SELECT 
    CAST(pd.n_id_pdd AS BIGINT) AS id_pedido,
    CAST(pd.d_dt_eft_pdd AS DATE) AS data_realizacao_pedido,
    date_format(pd.d_dt_entr_pdd, 'yyyy-MM-dd HH:mm:ss') AS data_entrega,
    CASE 
        WHEN pd.v_cnl_orig_pdd = 'L' THEN 'Loja'
        WHEN pd.v_cnl_orig_pdd = 'A' THEN 'App'
        WHEN pd.v_cnl_orig_pdd = 'S' THEN 'Site'
    END AS canal_origem_pedido,
    COALESCE(CAST(pd.v_uf_entr_pdd AS STRING), '') AS UF_pedido,
    COALESCE(CAST(pd.v_lc_ent_pdd AS STRING), '') AS destino_entrega,
    CAST(pd.n_vlr_tot_pdd AS DECIMAL(38,2)) AS valor_total_pedido
FROM pedidos_tmp as pd""")

# Preview the transformed rows (show() prints the first 20 by default).
spark.sql(query).show()

+-------------+----------------------+-------------------+-------------------+---------+---------------+------------------+
|    id_pedido|data_realizacao_pedido|       data_entrega|canal_origem_pedido|UF_pedido|destino_entrega|valor_total_pedido|
+-------------+----------------------+-------------------+-------------------+---------+---------------+------------------+
|1187021679777|            2023-09-13|2023-09-13 21:49:15|               Loja|       RS|         VIAMAO|             19.99|
|1187888931657|            2023-09-29|2023-09-29 21:11:38|               Loja|       RS|   PORTO ALEGRE|             36.98|
|1187806295857|            2023-09-28|2023-09-28 21:00:10|               Loja|       RS|   PORTO ALEGRE|             29.56|
|1187490058337|            2023-09-22|2023-09-22 20:59:02|               Loja|       RS|   PORTO ALEGRE|             37.97|
|1186976289657|            2023-09-12|2023-09-12 21:40:09|               Loja|       RS|   PORTO ALEGRE|             39.93|
|1187367

# Pedidos Vendas

In [14]:
import os
# Base directory of the transient layer. It defaults to the original local checkout
# location; set the DATALAKE_TRANSIENT_DIR environment variable to run this cell on
# another machine without editing the notebook.
TRANSIENT_DIR = os.environ.get(
    "DATALAKE_TRANSIENT_DIR",
    r"C:\Users\gustavo.lopes\Documentos\GitHub\desafio_panvel-data_engineer\datalake\transient",
)

# Load the PEDIDO_VENDA parquet dataset to inspect its column structure and data types.
dados = spark.read.format("parquet").load(os.path.abspath(os.path.join(TRANSIENT_DIR, "PEDIDO_VENDA")))

# Schema as inferred from the parquet metadata.
dados.printSchema()

# One sample record, printed vertically so every column stays readable.
dados.show(n=1, vertical=True)

# Last expression of the cell: the list of column names.
dados.columns

root
 |-- n_id_fil: long (nullable = true)
 |-- n_id_vd_fil: long (nullable = true)
 |-- n_id_pdd: long (nullable = true)

-RECORD 0--------------------
 n_id_fil    | 2408784       
 n_id_vd_fil | 34262326131   
 n_id_pdd    | 1189183932027 
only showing top 1 row



['n_id_fil', 'n_id_vd_fil', 'n_id_pdd']

In [16]:
# Expose the loaded frame to Spark SQL under a temporary view name, then build the
# query that will later be used to transform the raw layer.
dados.createOrReplaceTempView("pedido_venda_tmp")

# The projection only renames the three identifier columns and casts each to BIGINT.
query = formatar_sql(
    """SELECT 
    CAST(pv.n_id_fil AS BIGINT) AS codigo_filial,
    CAST(pv.n_id_vd_fil AS BIGINT) AS id_venda_filial,
    CAST(pv.n_id_pdd AS BIGINT) AS id_pedido
FROM pedido_venda_tmp as pv"""
)

# Run the query and print a preview of the result.
spark.sql(query).show()

+-------------+---------------+-------------+
|codigo_filial|id_venda_filial|    id_pedido|
+-------------+---------------+-------------+
|      2408784|    34262326131|1189183932027|
|       242984|    35434048331|1189596350677|
|      2657984|     3474198431|1192614919827|
|      2517984|      409543431|1188810414807|
|      2457084|     4339980931|1192522099777|
|       229684|    35465235531|1188385018407|
|      2620184|     3789478231|1187171860557|
|      2174984|    35255133331|1186640698547|
|      2662184|     3752751431|1188256482257|
|       249284|    35437182231|1185665835127|
|      2626484|     3573693431|1192236782787|
|      2462684|    34129728831|1193157735357|
|      2471784|     4054827931|1191233879657|
|      2310784|    34374246731|1187244935557|
|      2453584|    34547578131|1184629304547|
|      2593584|     4082528131|1184834489897|
|       236684|    39168868431|1191362606727|
|      2454284|    34532390431|1193198429067|
|      2480184|     3964413431|118