In [1]:
import os
import pandas as pd

# Layer directories: tables are read from the source layer and written to the destination layer
camada_origem, camada_destino = "bronze", "silver"
# Only the source directory is created here; an already existing directory is not an error
os.makedirs(camada_origem, exist_ok=True)

In [2]:
# Normalize a zip-code column to zero-padded, 5-character strings
def tratar_zip_code(df, column_name):
    """Left-pad the values of a zip-code column with zeros up to 5 characters.

    The column is converted to text and written back onto ``df`` (the frame is
    modified in place and also returned).

    Compared with a plain ``astype(str).str.zfill(5)``:

    * missing values stay missing instead of becoming the text ``"00nan"``;
    * whole-number floats (what pandas produces when an integer column
      contains missing values) are rendered without the trailing ``.0`` so
      that ``1037.0`` becomes ``"01037"`` rather than ``"1037.0"``.

    Args:
        df: DataFrame that holds the column; modified in place.
        column_name: Name of the zip-code column.

    Returns:
        The same ``df`` object, with ``column_name`` overwritten.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    valores = df[column_name]
    ausentes = valores.isna()

    # Float columns whose non-null values are all whole numbers are routed
    # through the nullable integer dtype so the decimal part is not printed.
    if pd.api.types.is_float_dtype(valores) and (valores.dropna() % 1 == 0).all():
        valores = valores.astype("Int64")

    texto = valores.astype(str).str.zfill(5)
    # astype(str) turns nulls into text; put the nulls back.
    df[column_name] = texto.where(~ausentes)
    return df

In [3]:
# Text normalization: strip accents, upper-case, drop punctuation
def tratamento_dados(df, columns):
    """Normalize the given text columns of ``df`` in place.

    Each value goes through these steps, in order:

    1. Unicode NFKD decomposition, so an accented letter becomes its base
       letter followed by a combining mark (``"ã"`` -> ``"a"`` + mark).
    2. Upper-casing.
    3. Underscores replaced by a single space.
    4. Removal of every character that is not an ASCII letter, a digit or
       whitespace. This also removes the combining marks from step 1, so
       ``"são_paulo"`` ends up as ``"SAO PAULO"``.

    Step 1 is what keeps the base letter. Upper-casing first and then
    filtering to ASCII deletes accented letters entirely
    (``"SÃO PAULO"`` -> ``"SO PAULO"``).

    Missing values are left as missing by the ``.str`` accessors.

    Args:
        df: DataFrame to clean; modified in place.
        columns: Iterable of column names holding text. May be empty.

    Returns:
        The same ``df`` object with the listed columns overwritten.

    Raises:
        KeyError: If a listed column does not exist.
        AttributeError: If a listed column does not hold string values.
    """
    for column in columns:
        texto = df[column].str.normalize("NFKD").str.upper()
        # Literal replacement; no regex needed for a single character.
        texto = texto.str.replace("_", " ", regex=False)
        # Raw string so that \s reaches the regex engine as written.
        df[column] = texto.str.replace(r"[^a-zA-Z0-9\s]", "", regex=True)
    return df

In [4]:
# Read a table from the source layer, clean it, cast its columns and write it to the destination layer
def processar_tabela(nome_tabela, colunas_tratamento, colunas_tipos, colunas_zip_code=None):
    """Process one table from ``camada_origem`` into ``camada_destino``.

    Steps:

    1. Read ``{camada_origem}/{nome_tabela}/{nome_tabela}.parquet``.
    2. Normalize the text columns listed in ``colunas_tratamento`` with
       ``tratamento_dados``.
    3. Zero-pad the zip-code columns with ``tratar_zip_code``.
    4. Cast every column in ``colunas_tipos`` to its target type.
    5. Drop fully duplicated rows.
    6. Write ``{camada_destino}/{nome_tabela}/{nome_tabela}.parquet``
       (the directory is created if needed), without the index.

    Args:
        nome_tabela: Table name; used as both the folder and the file name.
        colunas_tratamento: Text columns to normalize. May be empty.
        colunas_tipos: Mapping ``column name -> target type``. ``int`` and
            ``str`` get special handling (see below); any other value is
            passed straight to ``Series.astype``.
        colunas_zip_code: Columns to zero-pad to 5 characters. When ``None``
            (the default), every column mapped to ``str`` whose name ends in
            ``"zip_code_prefix"`` is padded. Pass an empty list to disable.
            NOTE(review): this assumes the source files may store the
            prefixes as numbers, which drops leading zeros; confirm against
            the bronze layer.

    Returns:
        None. The result is written to disk.

    Type handling:
        * ``int``: missing values are replaced by 0 before the cast.
        * ``str``: values are converted to text but missing values stay
          missing, instead of being stored as the literal ``"nan"`` /
          ``"None"``.
    """
    df = pd.read_parquet(f"{camada_origem}/{nome_tabela}/{nome_tabela}.parquet")
    df = tratamento_dados(df, colunas_tratamento)

    if colunas_zip_code is None:
        colunas_zip_code = [
            coluna
            for coluna, tipo in colunas_tipos.items()
            if tipo is str and coluna.endswith("zip_code_prefix")
        ]

    # Pad before the generic cast so numeric prefixes are formatted by the helper.
    for coluna in colunas_zip_code:
        df = tratar_zip_code(df, coluna)

    for coluna, tipo in colunas_tipos.items():
        if tipo == int:
            # Replace NaN with 0 before converting, since int cannot hold NaN.
            df[coluna] = df[coluna].fillna(0).astype(int)
        elif tipo is str:
            valores = df[coluna]
            # Keep nulls as nulls rather than the text "nan".
            df[coluna] = valores.astype(str).where(valores.notna())
        else:
            df[coluna] = df[coluna].astype(tipo)

    df = df.drop_duplicates()
    os.makedirs(f"{camada_destino}/{nome_tabela}", exist_ok=True)
    df.to_parquet(f"{camada_destino}/{nome_tabela}/{nome_tabela}.parquet", index=False)


In [5]:
# Table processing.
# Each entry is (table name, text columns to normalize, target type per column);
# the tables are processed in the order they are listed.
ESPECIFICACOES_TABELAS = [
    (
        "customers",
        ["customer_city", "customer_state"],
        {
            "customer_id": str,
            "customer_unique_id": str,
            "customer_zip_code_prefix": str,
            "customer_city": str,
            "customer_state": str,
        },
    ),
    (
        "geolocation",
        ["geolocation_city", "geolocation_state"],
        {
            "geolocation_zip_code_prefix": str,
            "geolocation_lat": float,
            "geolocation_lng": float,
            "geolocation_city": str,
            "geolocation_state": str,
        },
    ),
    (
        "order_items",
        [],
        {
            "order_id": str,
            "order_item_id": int,
            "product_id": str,
            "seller_id": str,
            "shipping_limit_date": "datetime64[ns]",
            "price": float,
            "freight_value": float,
        },
    ),
    (
        "order_payments",
        ["payment_type"],
        {
            "order_id": str,
            "payment_sequential": int,
            "payment_type": str,
            "payment_installments": int,
            "payment_value": float,
        },
    ),
    (
        "order_reviews",
        ["review_comment_title", "review_comment_message"],
        {
            "order_id": str,
            "review_id": str,
            "review_score": int,
            "review_comment_title": str,
            "review_comment_message": str,
            "review_creation_date": "datetime64[ns]",
            "review_answer_timestamp": "datetime64[ns]",
        },
    ),
    (
        "orders",
        ["order_status"],
        {
            "order_id": str,
            "customer_id": str,
            "order_status": str,
            "order_purchase_timestamp": "datetime64[ns]",
            "order_approved_at": "datetime64[ns]",
            "order_delivered_carrier_date": "datetime64[ns]",
            "order_delivered_customer_date": "datetime64[ns]",
            "order_estimated_delivery_date": "datetime64[ns]",
        },
    ),
    (
        "product_category_name_translation",
        ["product_category_name", "product_category_name_english"],
        {
            "product_category_name": str,
            "product_category_name_english": str,
        },
    ),
    (
        "products",
        ["product_category_name"],
        {
            "product_id": str,
            "product_category_name": str,
            "product_name_lenght": int,
            "product_description_lenght": int,
            "product_photos_qty": int,
            "product_weight_g": int,
            "product_length_cm": int,
            "product_height_cm": int,
            "product_width_cm": int,
        },
    ),
    (
        "sellers",
        ["seller_city", "seller_state"],
        {
            "seller_id": str,
            "seller_zip_code_prefix": str,
            "seller_city": str,
            "seller_state": str,
        },
    ),
]

for tabela, colunas_texto, tipos_colunas in ESPECIFICACOES_TABELAS:
    processar_tabela(tabela, colunas_texto, tipos_colunas)