In [8]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, when, count
from pyspark.sql import functions as F

In [2]:
# Build the Spark session for this ETL job (getOrCreate reuses an active session if there is one)
session_builder = SparkSession.builder.appName("ItensVendasETL")
spark = session_builder.getOrCreate()

In [3]:
# Read the raw "itens_vendas" Parquet file.
# The data-lake root defaults to the original local folder; set the
# PANVEL_DATALAKE_DIR environment variable to run the notebook on another machine
# without editing this cell.
datalake_dir = os.environ.get(
    "PANVEL_DATALAKE_DIR",
    "C:\\Users\\Theuzao\\Desktop\\panvel_datalake",
)
# os.path.join uses the platform separator, so on Windows the default resolves
# to exactly the path that was previously hard-coded here.
raw_itens_vendas_path = os.path.join(datalake_dir, "data", "raw", "itens_vendas.parquet")
df_itens_vendas = spark.read.parquet(raw_itens_vendas_path)

In [4]:
# Print the DataFrame schema (column names, types and nullability) as read from the Parquet file
df_itens_vendas.printSchema()

root
 |-- n_id_fil: long (nullable = true)
 |-- n_id_vd_fil: long (nullable = true)
 |-- n_id_it: long (nullable = true)
 |-- v_rc_elt: string (nullable = true)
 |-- v_it_vd_conv: string (nullable = true)
 |-- n_vlr_pis: decimal(38,2) (nullable = true)
 |-- n_vlr_vd: decimal(38,2) (nullable = true)
 |-- n_vlr_desc: decimal(38,2) (nullable = true)
 |-- n_qtd: decimal(38,4) (nullable = true)



In [5]:
# Preview the DataFrame: first 20 rows, long cell values truncated (the show() defaults, spelled out)
df_itens_vendas.show(n=20, truncate=True)

+--------+-------------+---------+--------+------------+---------+--------+----------+------+
|n_id_fil|  n_id_vd_fil|  n_id_it|v_rc_elt|v_it_vd_conv|n_vlr_pis|n_vlr_vd|n_vlr_desc| n_qtd|
+--------+-------------+---------+--------+------------+---------+--------+----------+------+
| 2326184|   4104333431|433914026|     NAO|         NAO|     0.00|   38.49|      8.50|1.0000|
|  238084|  37050030831|405284776|     NAO|         NAO|     0.14|    8.49|      0.00|1.0000|
| 2439584|   4272478331|403805896|     NAO|         NAO|     0.00|   17.99|      0.00|1.0000|
| 2618784|   3678370531|502317426|     NAO|         NAO|     0.00|    3.10|      0.00|1.0000|
|  253484|  37104903231|403469456|     NAO|         NAO|     0.00|   17.99|      8.00|1.0000|
| 2433984|  34687630131|405275806|     NAO|         NAO|     0.21|   12.49|      0.00|1.0000|
|  220584|3467614187631|403614016|     NAO|         NAO|     0.00|    4.49|      0.00|1.0000|
| 2212084|  36067773731|411572226|     NAO|         NAO|    

In [9]:
# Cast every column to string after reading, keeping the original column names
string_columns = []
for column_name in df_itens_vendas.columns:
    string_columns.append(F.col(column_name).cast("string").alias(column_name))
df_itens_vendas = df_itens_vendas.select(string_columns)


In [24]:
# Count null values per column: when() yields a non-null marker only for null cells,
# and count() counts just those non-null markers.
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_itens_vendas.columns
]
df_itens_vendas.select(null_count_exprs).show()

+--------+-----------+-------+--------+------------+---------+--------+----------+-----+
|n_id_fil|n_id_vd_fil|n_id_it|v_rc_elt|v_it_vd_conv|n_vlr_pis|n_vlr_vd|n_vlr_desc|n_qtd|
+--------+-----------+-------+--------+------------+---------+--------+----------+-----+
|       0|          0|      0|       0|           0|        0|       0|         0|    0|
+--------+-----------+-------+--------+------------+---------+--------+----------+-----+



In [10]:
# Save the DataFrame as CSV with a header row, overwriting any previous output.
# The data-lake root defaults to the original local folder; set the
# PANVEL_DATALAKE_DIR environment variable to write somewhere else without
# editing this cell.
datalake_dir = os.environ.get(
    "PANVEL_DATALAKE_DIR",
    "C:\\Users\\Theuzao\\Desktop\\panvel_datalake",
)
# os.path.join uses the platform separator, so on Windows the default resolves
# to exactly the path that was previously hard-coded here.
transformed_itens_vendas_path = os.path.join(datalake_dir, "data", "transformed", "itens_vendas")
(
    df_itens_vendas.write
    .mode("overwrite")
    .option("header", "true")
    .csv(transformed_itens_vendas_path)
)

In [11]:
# Stop the Spark session at the end of the notebook
spark.stop()