In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count
from pyspark.sql.types import *
from pyspark.sql import functions as F


In [2]:
# Create the Spark session (create it only once)
spark = (
    SparkSession.builder
    .appName("PedidoslETL")
    .getOrCreate()
)

# Read the raw orders Parquet file with Spark
df_pedidos = spark.read.parquet(
    'C:\\Users\\Theuzao\\Desktop\\panvel_datalake\\data\\raw\\pedidos.parquet'
)

In [None]:
# Print the schema (column names, types and nullability) of the orders DataFrame
df_pedidos.printSchema()

root
 |-- n_id_pdd: long (nullable = true)
 |-- d_dt_eft_pdd: date (nullable = true)
 |-- d_dt_entr_pdd: timestamp (nullable = true)
 |-- v_cnl_orig_pdd: string (nullable = true)
 |-- v_uf_entr_pdd: string (nullable = true)
 |-- v_lc_ent_pdd: string (nullable = true)
 |-- n_vlr_tot_pdd: decimal(38,2) (nullable = true)



Tratamento dos dados

In [4]:
# Count the missing (null) values in each column.
# `when` yields a non-null literal only for rows where the column is null, and
# `count` ignores nulls, so each aggregate is the number of null rows.
df_pedidos.select(
    *(
        F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
        for column_name in df_pedidos.columns
    )
).show()

+--------+------------+-------------+--------------+-------------+------------+-------------+
|n_id_pdd|d_dt_eft_pdd|d_dt_entr_pdd|v_cnl_orig_pdd|v_uf_entr_pdd|v_lc_ent_pdd|n_vlr_tot_pdd|
+--------+------------+-------------+--------------+-------------+------------+-------------+
|      61|        8139|         8139|             0|            0|           0|            0|
+--------+------------+-------------+--------------+-------------+------------+-------------+



In [5]:
# Fill null order IDs with the "Desconhecido" placeholder.
# n_id_pdd is read as a long, so it is cast to string explicitly before the
# text placeholder is applied. Mixing a string literal with a numeric column
# relies on implicit type coercion, whose result depends on
# spark.sql.ansi.enabled: with ANSI mode on, the placeholder is coerced to a
# number and the query fails at runtime.
df_pedidos = df_pedidos.withColumn(
    "n_id_pdd",
    F.coalesce(F.col("n_id_pdd").cast("string"), F.lit("Desconhecido")),
)

# Fill null dates with '2001-01-01'.
# The date/timestamp columns are likewise cast to string explicitly so the
# resulting column type does not depend on the session's coercion rules.
# NOTE(review): the placeholder has no time component, so filled rows of
# d_dt_entr_pdd read '2001-01-01' while real values read 'yyyy-MM-dd HH:mm:ss'.
df_pedidos = df_pedidos.withColumn(
    "d_dt_eft_pdd",
    F.coalesce(F.col("d_dt_eft_pdd").cast("string"), F.lit("2001-01-01")),
).withColumn(
    "d_dt_entr_pdd",
    F.coalesce(F.col("d_dt_entr_pdd").cast("string"), F.lit("2001-01-01")),
)


In [6]:
# Cast every column to string, keeping the original column names
df_pedidos = df_pedidos.select(
    *(
        F.col(name).cast("string").alias(name)
        for name in df_pedidos.columns
    )
)


In [7]:
# Preview the DataFrame (first 20 rows, long cell values truncated)
df_pedidos.show(n=20, truncate=True)

+-------------+------------+-------------------+--------------+-------------+--------------+-------------+
|     n_id_pdd|d_dt_eft_pdd|      d_dt_entr_pdd|v_cnl_orig_pdd|v_uf_entr_pdd|  v_lc_ent_pdd|n_vlr_tot_pdd|
+-------------+------------+-------------------+--------------+-------------+--------------+-------------+
|1187021679777|  2023-09-13|2023-09-13 21:49:15|             L|           RS|        VIAMAO|        19.99|
|1187888931657|  2023-09-29|2023-09-29 21:11:38|             L|           RS|  PORTO ALEGRE|        36.98|
|1187806295857|  2023-09-28|2023-09-28 21:00:10|             L|           RS|  PORTO ALEGRE|        29.56|
|1187490058337|  2023-09-22|2023-09-22 20:59:02|             L|           RS|  PORTO ALEGRE|        37.97|
|1186976289657|  2023-09-12|2023-09-12 21:40:09|             L|           RS|  PORTO ALEGRE|        39.93|
|1187367744477|  2023-09-19|2023-09-19 21:05:09|             L|           RS|   SANTA MARIA|        14.89|
|1187489749387|  2023-09-22|2023-09-2

In [8]:
# Save the DataFrame as CSV with a header row, overwriting any previous output
(
    df_pedidos.write
    .mode("overwrite")
    .option("header", "true")
    .csv("C:\\Users\\Theuzao\\Desktop\\panvel_datalake\\data\\transformed\\pedidos")
)

In [9]:
# Stop the Spark session
spark.stop()