In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.sql import functions as F


In [2]:
# Start (or reuse an already running) Spark session for the sales ETL job.
spark = (
    SparkSession.builder
    .appName("VendasETL")
    .getOrCreate()
)


In [3]:
# Disable Spark's vectorized Parquet reader. Per the original author's note, this is
# what allows the decimal(18,6) data to be read.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")


In [4]:
# Load the raw sales dataset from its Parquet file.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to a
# configurable data directory so the notebook can run on other machines.
df_vendas = (
    spark.read
    .format("parquet")
    .load('C:\\Users\\Theuzao\\Desktop\\panvel_datalake\\data\\raw\\vendas.parquet')
)

In [5]:
# Inspect the column names and types exactly as they were read from the Parquet file.
df_vendas.printSchema()

root
 |-- d_dt_vd: timestamp (nullable = true)
 |-- n_id_fil: long (nullable = true)
 |-- n_id_vd_fil: long (nullable = true)
 |-- v_cli_cod: string (nullable = true)
 |-- n_vlr_tot_vd: decimal(18,6) (nullable = true)
 |-- n_vlr_tot_desc: decimal(14,4) (nullable = true)
 |-- v_cpn_eml: string (nullable = true)
 |-- tp_pgt: string (nullable = true)



In [6]:
# Cast every column to string right after loading, keeping each original column name.
# NOTE(review): the timestamp-to-string cast presumably renders d_dt_vd in the Spark
# session time zone; confirm the resulting date/time text is the intended one.
df_vendas = df_vendas.select(
    *(
        F.col(nome).cast("string").alias(nome)
        for nome in df_vendas.columns
    )
)


In [7]:
# Inspect the schema again to check the result of the cast to string.
df_vendas.printSchema()

root
 |-- d_dt_vd: string (nullable = true)
 |-- n_id_fil: string (nullable = true)
 |-- n_id_vd_fil: string (nullable = true)
 |-- v_cli_cod: string (nullable = true)
 |-- n_vlr_tot_vd: string (nullable = true)
 |-- n_vlr_tot_desc: string (nullable = true)
 |-- v_cpn_eml: string (nullable = true)
 |-- tp_pgt: string (nullable = true)



In [8]:
# Count the null values in each column.
# `when` without `otherwise` yields null for rows that do not match, and `count`
# skips nulls, so each aggregate counts only the rows where the column is null.
df_vendas.select(
    *[
        F.count(F.when(F.col(nome).isNull(), nome)).alias(nome)
        for nome in df_vendas.columns
    ]
).show()

+-------+--------+-----------+---------+------------+--------------+---------+-------+
|d_dt_vd|n_id_fil|n_id_vd_fil|v_cli_cod|n_vlr_tot_vd|n_vlr_tot_desc|v_cpn_eml| tp_pgt|
+-------+--------+-----------+---------+------------+--------------+---------+-------+
|      0|       0|          0|  5206154|           0|             0|        0|5268260|
+-------+--------+-----------+---------+------------+--------------+---------+-------+



In [9]:
# Keep only the sales whose customer code (v_cli_cod) is null.
vendas_sem_cliente = df_vendas.filter(
    df_vendas["v_cli_cod"].isNull()
)

# Display the sales without a customer, without truncating the cell values.
vendas_sem_cliente.show(truncate=False)


+-------------------+--------+-------------+---------+------------+--------------+---------+-------+
|d_dt_vd            |n_id_fil|n_id_vd_fil  |v_cli_cod|n_vlr_tot_vd|n_vlr_tot_desc|v_cpn_eml|tp_pgt |
+-------------------+--------+-------------+---------+------------+--------------+---------+-------+
|2023-08-05 21:00:00|2189684 |37557459731  |null     |49.990000   |0.0000        |NAO      |null   |
|2023-08-20 21:00:00|2593584 |4090431631   |null     |37.970000   |17.9000       |NAO      |null   |
|2023-08-14 21:00:00|2388484 |36624197731  |null     |35.020000   |0.0000        |NAO      |null   |
|2023-08-03 21:00:00|260484  |36544360831  |null     |18.600000   |0.0000        |NAO      |null   |
|2023-08-28 21:00:00|2349284 |34160952331  |null     |9.990000    |7.9800        |NAO      |null   |
|2023-08-03 21:00:00|269584  |35963337531  |null     |12.990000   |0.0000        |NAO      |null   |
|2023-08-12 21:00:00|2209984 |36340815731  |null     |37.480000   |0.0000        |NAO      

In [10]:
# Replace null values with the placeholder "Desconhecido" to build the final DataFrame.
# A string fill value only applies to string columns; this covers every column
# provided the cast-to-string cell above has been run.
df_vendas_final = df_vendas.na.fill("Desconhecido")

# Note: the null v_cli_cod values are also replaced by "Desconhecido"; these rows
# could be investigated further.

In [11]:
# Preview the final DataFrame after the null replacement.
df_vendas_final.show()

+-------------------+--------+-----------+--------------------+------------+--------------+---------+------------+
|            d_dt_vd|n_id_fil|n_id_vd_fil|           v_cli_cod|n_vlr_tot_vd|n_vlr_tot_desc|v_cpn_eml|      tp_pgt|
+-------------------+--------+-----------+--------------------+------------+--------------+---------+------------+
|2023-10-12 21:00:00| 2356284|34366442231|016E6FCC4F98832719BC|   55.960000|       13.9900|      NAO|     A VISTA|
|2023-10-12 21:00:00| 2221184|35550863931|035D148EADC74B6C6D2F|   31.480000|       25.2100|      NAO|Desconhecido|
|2023-10-27 21:00:00| 2188984|37392732531|030C1011214A3317E850|    6.490000|       13.1400|      NAO|     A VISTA|
|2023-10-08 21:00:00| 2608284| 3672652731|04710AFAF1FD9C48EBC3|   52.990000|       40.2700|      NAO|Desconhecido|
|2023-10-09 21:00:00|  238084|37059405031|028DBA5BBB05DDF47E4C|   18.540000|        8.3800|      NAO|     A VISTA|
|2023-10-23 21:00:00| 2687384|  403806931|012F0A4B6592C311F2ED|   31.900000|    

In [13]:
# Save the final DataFrame as CSV with a header row, overwriting any previous output.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to a
# configurable data directory so the notebook can run on other machines.
(
    df_vendas_final.write
    .format("csv")
    .mode("overwrite")
    .option("header", "true")
    .save("C:\\Users\\Theuzao\\Desktop\\panvel_datalake\\data\\transformed\\vendas_transformed")
)

In [14]:
# Stop the Spark session now that the output has been written.
spark.stop()