Importação inicial

In [22]:
import os
import sys
import time
import warnings
from datetime import datetime
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, text
warnings.filterwarnings('ignore')

Configurações do banco de dados

In [23]:
# Connection settings for the PostgreSQL instance used by this notebook.
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook itself; the
# fallbacks are the original local-development values.
DB_CONFIG = {
    'host': os.environ.get('SBD2_DB_HOST', 'localhost'),
    'port': os.environ.get('SBD2_DB_PORT', '5433'),
    'database': os.environ.get('SBD2_DB_NAME', 'sbd2_vehicle'),
    'user': os.environ.get('SBD2_DB_USER', 'sbd2_vehicle'),
    'password': os.environ.get('SBD2_DB_PASSWORD', 'sbd2_vehicle'),
}

Caminhos dos arquivos

In [24]:
# Relative paths: step out of the Transformer folder and into DataLayer.
CSV_PATH = '../DataLayer/raw/vehicle_price_prediction.csv'
DDL_PATH = '../DataLayer/silver/ddl.sql'

# Probe each path once and reuse the answer for both reports below.
path_checks = [
    ('CSV', CSV_PATH, os.path.exists(CSV_PATH)),
    ('DDL', DDL_PATH, os.path.exists(DDL_PATH)),
]

for label, path, found in path_checks:
    print(f"{label} existe? {found}")

# For the files that exist, also show the resolved absolute path.
for label, path, found in path_checks:
    if found:
        print(f"‚úì {label}: {os.path.abspath(path)}")

CSV existe? True
DDL existe? True
‚úì CSV: /mnt/d/temp/SBD2/SBD2---Engenharia-de-Dados---2025.2/DataLayer/raw/vehicle_price_prediction.csv
‚úì DDL: /mnt/d/temp/SBD2/SBD2---Engenharia-de-Dados---2025.2/DataLayer/silver/ddl.sql


Funções auxiliares

In [25]:
def get_engine():
    """Build a SQLAlchemy engine for the PostgreSQL database in DB_CONFIG.

    The user name and password are percent-encoded before being placed in
    the URL, so credentials containing reserved characters such as ``@``,
    ``:`` or ``/`` do not corrupt the connection string. Values made only of
    letters, digits, ``_``, ``.``, ``-`` and ``~`` are left unchanged.

    Returns:
        Whatever ``create_engine`` returns for the assembled URL.
    """
    # safe='' makes quote() escape '/' as well, which it would otherwise keep.
    user = quote(DB_CONFIG['user'], safe='')
    password = quote(DB_CONFIG['password'], safe='')
    connection_string = (
        f"postgresql://{user}:{password}"
        f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
    )
    return create_engine(connection_string)

# Smoke test: build an engine, run a few trivial queries and report what the
# server answers. Any failure is printed together with the connection
# settings that should be double-checked.
try:
    print("üîß Criando engine...")
    engine = get_engine()
    print(f"‚úì Engine criada: {engine}")

    print("\nüîå Testando conex√£o...")
    with engine.connect() as conn:
        valor = conn.execute(text("SELECT 1 as test")).scalar()
        print(f"‚úì Conex√£o OK! Resultado do teste: {valor}")

        # PostgreSQL version string (only the first 60 characters are shown)
        version = conn.execute(text("SELECT version()")).scalar()
        print(f"‚úì PostgreSQL: {version[:60]}...")

        # Database the session is currently connected to
        db = conn.execute(text("SELECT current_database()")).scalar()
        print(f"‚úì Database: {db}")

    print("\n‚úÖ Fun√ß√£o get_engine() est√° funcionando perfeitamente!")

except Exception as e:
    print(f"\n‚ùå ERRO: {e}")
    print("\nVerifique:")
    # The password is deliberately left out of the troubleshooting list.
    for label, key in (
        ("Host", "host"),
        ("Port", "port"),
        ("Database", "database"),
        ("User", "user"),
    ):
        print(f"  ‚Ä¢ {label}: {DB_CONFIG[key]}")


üîß Criando engine...
‚úì Engine criada: Engine(postgresql://sbd2_vehicle:***@localhost:5433/sbd2_vehicle)

üîå Testando conex√£o...
‚úì Conex√£o OK! Resultado do teste: 1
‚úì PostgreSQL: PostgreSQL 14.19 (Debian 14.19-1.pgdg13+1) on x86_64-pc-linu...
‚úì Database: sbd2_vehicle

‚úÖ Fun√ß√£o get_engine() est√° funcionando perfeitamente!


In [26]:
def wait_for_database(max_attempts=30, delay=5):
    """Poll the database until a trivial query succeeds.

    Each attempt builds a fresh engine through ``get_engine()``, runs
    ``SELECT 1`` and then disposes that engine, so the probe does not leave
    pooled connections behind regardless of the outcome.

    Args:
        max_attempts: Maximum number of connection attempts.
        delay: Seconds to sleep between two failed attempts.

    Returns:
        True as soon as one attempt succeeds; False when every attempt
        failed (or when ``max_attempts`` is 0).
    """
    print("Aguardando banco de dados...")
    for attempt in range(max_attempts):
        probe_engine = None
        last_error = None
        try:
            probe_engine = get_engine()
            with probe_engine.connect() as conn:
                conn.execute(text("SELECT 1"))
            print(f"‚úì Banco dispon√≠vel ap√≥s {attempt + 1} tentativa(s)")
            return True
        except Exception as e:
            # Keep a reference: the name bound by `except` is cleared on exit.
            last_error = e
        finally:
            # Release the probe engine's pool before sleeping or returning.
            if probe_engine is not None:
                probe_engine.dispose()

        if attempt < max_attempts - 1:
            print(f"Tentativa {attempt + 1}/{max_attempts}: Aguardando...")
            time.sleep(delay)
        else:
            # Only the final failure is reported in detail.
            print(f"ERRO: {last_error}")

    print("ERRO: Timeout - banco n√£o ficou dispon√≠vel")
    return False

# Build the shared engine only after the database has answered a probe.
if not wait_for_database():
    print("‚úó Falha ao conectar ao banco")
else:
    engine = get_engine()
    print("‚úì Engine criada com sucesso")

Aguardando banco de dados...
‚úì Banco dispon√≠vel ap√≥s 1 tentativa(s)
‚úì Engine criada com sucesso


Criação da tabela bronze

In [27]:
def create_bronze_table(engine):
    """Create the Bronze table that stores the raw CSV rows.

    The ``bronze`` schema is created when missing and any existing
    ``bronze.vehicle_prices`` table is dropped (CASCADE) before being
    created again, so previously loaded rows are discarded.

    The DDL runs inside ``engine.begin()``: the transaction is committed
    when the block finishes and rolled back if a statement fails, without
    an explicit ``commit()`` call.

    Args:
        engine: SQLAlchemy engine connected to the target database.

    Returns:
        True when the DDL ran successfully, False when any error occurred
        (the error is printed, not raised).
    """
    print("\n[1/4] Criando tabela Bronze...")
    
    ddl = """
    CREATE SCHEMA IF NOT EXISTS bronze;
    DROP TABLE IF EXISTS bronze.vehicle_prices CASCADE;
    CREATE TABLE IF NOT EXISTS bronze.vehicle_prices (
        id SERIAL PRIMARY KEY,
        make VARCHAR(100),
        model VARCHAR(100),
        year INTEGER,
        mileage INTEGER,
        engine_hp FLOAT,
        transmission VARCHAR(50),
        fuel_type VARCHAR(50),
        drivetrain VARCHAR(50),
        body_type VARCHAR(50),
        exterior_color VARCHAR(50),
        interior_color VARCHAR(50),
        owner_count INTEGER,
        accident_history VARCHAR(50),
        seller_type VARCHAR(50),
        condition VARCHAR(50),
        trim VARCHAR(50),
        vehicle_age INTEGER,
        mileage_per_year FLOAT,
        brand_popularity FLOAT,
        price FLOAT,
        _ingestion_timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        _source_file VARCHAR(255)
    );
    """
    
    try:
        # begin() commits on success and rolls back on error.
        with engine.begin() as conn:
            conn.execute(text(ddl))
        print("  ‚úì Tabela Bronze criada")
        return True
    except Exception as e:
        print(f"  ‚úó Erro: {e}")
        return False

In [28]:
# Drop and recreate bronze.vehicle_prices; the cell displays the returned success flag.
create_bronze_table(engine)


[1/4] Criando tabela Bronze...
  ‚úì Tabela Bronze criada


True

Carregar dados bronze

In [29]:
def load_bronze_data(engine):
    """Load the CSV at CSV_PATH into bronze.vehicle_prices.

    The load is skipped when the table already contains rows. Otherwise the
    CSV is read, missing ``accident_history`` values become the string
    'None', the two metadata columns are added and the frame is appended to
    the table in chunks of 10,000 rows.

    Args:
        engine: SQLAlchemy engine connected to the target database.

    Returns:
        True when the data was loaded or the table was already populated,
        False when any error occurred (the error is printed, not raised).
    """
    print("\n[3/4] Carregando dados Bronze...")
    print("="*60)

    started_at = datetime.now()

    try:
        # Find out whether a previous run already populated the table.
        with engine.connect() as conn:
            existing_rows = conn.execute(
                text("SELECT COUNT(*) FROM bronze.vehicle_prices")
            ).scalar()
        if existing_rows > 0:
            print(f"  ‚ö† Bronze j√° cont√©m {existing_rows:,} registros. Pulando.")
            return True

        # Read the raw CSV
        print("  üìÅ Lendo CSV...")
        df = pd.read_csv(CSV_PATH)
        row_total = len(df)
        print(f"  ‚úì {row_total:,} linhas carregadas")

        # Null handling plus ingestion metadata, applied in a single step
        print("  üîß Tratando valores nulos...")
        df = df.assign(
            accident_history=df['accident_history'].fillna('None'),
            _ingestion_timestamp=datetime.now(),
            _source_file='vehicle_price_prediction.csv',
        )

        # Append to the Bronze table
        print("  üíæ Inserindo no banco...")
        to_sql_options = {
            'schema': 'bronze',
            'if_exists': 'append',
            'index': False,
            'chunksize': 10000,
        }
        df.to_sql('vehicle_prices', engine, **to_sql_options)

        elapsed = (datetime.now() - started_at).total_seconds()
        print(f"  ‚úì Bronze carregado: {row_total:,} registros em {elapsed:.1f}s")

        return True

    except Exception as e:
        print(f"  ‚úó Erro: {e}")
        return False

# Run the Bronze load; the cell displays the returned success flag.
load_bronze_data(engine)


[3/4] Carregando dados Bronze...
  üìÅ Lendo CSV...


  ‚úì 1,000,000 linhas carregadas
  üîß Tratando valores nulos...
  üíæ Inserindo no banco...
  ‚úì Bronze carregado: 1,000,000 registros em 312.2s


True

Verificação de dados

In [30]:
# Row count of the Bronze table, fetched on a short-lived connection.
count_query = text("SELECT COUNT(*) FROM bronze.vehicle_prices")
with engine.connect() as conn:
    bronze_count = conn.execute(count_query).scalar()

print(f"\nüìä CONTAGENS:")
print(f"  Bronze:            {bronze_count:>10,} registros")


üìä CONTAGENS:
  Bronze:             1,000,000 registros
