# 🚀 Setup Inicial - DataSUS Analytics Project

## **Objetivo**: Configurar ambiente Unity Catalog e validar configuração serverless
 
## **Stack**: Databricks Free Edition (Serverless) + Unity Catalog + Delta Lake

## 1️⃣ Validar Ambiente Serverless

In [0]:
# Environment report: print the Spark version and the catalog/schema the
# session currently points at.
env_rule = "=" * 60
print(env_rule)
print("üîç INFORMA√á√ïES DO AMBIENTE")
print(env_rule)

# Spark version exposed by the active session.
print(f"‚úÖ Spark Version: {spark.version}")

# Ask Unity Catalog which catalog and schema are active; only the first
# column of the first row of each result is used.
current_catalog = spark.sql("SELECT current_catalog()").first()[0]
current_schema = spark.sql("SELECT current_schema()").first()[0]
print(f"‚úÖ Current Catalog: {current_catalog}")
print(f"‚úÖ Current Schema: {current_schema}")

print("\nüéâ Ambiente serverless configurado corretamente!")

## 2️⃣ Instalar Bibliotecas Necessárias


In [0]:
# Install the additional libraries (requests, pandas, openpyxl); --quiet
# reduces pip's output.
%pip install requests pandas openpyxl --quiet

In [0]:
# Restart Python after the installation above.
dbutils.library.restartPython()

In [0]:
# Importar bibliotecas principais
import requests
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import *
from datetime import datetime, timedelta
import json

# Confirm the imports resolved, then report the pandas and requests versions.
import_report = (
    "‚úÖ Bibliotecas importadas com sucesso!",
    f"üì¶ Pandas version: {pd.__version__}",
    f"üì¶ Requests version: {requests.__version__}",
)
for report_line in import_report:
    print(report_line)

## 3️⃣ Criar Estrutura Unity Catalog

In [0]:
# Project catalog; created only when it does not exist yet.
catalog_name = "datasus_project"

try:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
except Exception as create_err:
    # Creation failed: report why and fall back to whatever catalog the
    # session is currently using.
    print(f"‚ÑπÔ∏è Catalog j√° existe ou voc√™ n√£o tem permiss√£o de criar: {create_err}")
    print("   Usando catalog padr√£o do workspace")
    catalog_name = spark.sql("SELECT current_catalog()").first()[0]
else:
    print(f"‚úÖ Catalog '{catalog_name}' criado/verificado")

# Make the selected catalog the session default.
spark.sql(f"USE CATALOG {catalog_name}")

In [0]:
# One schema per medallion layer.
schemas = ["bronze", "silver", "gold"]

for schema in schemas:
    try:
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema}")
    except Exception as schema_err:
        # Report the failure and move on to the next layer.
        print(f"‚ö†Ô∏è Erro ao criar schema '{schema}': {schema_err}")
    else:
        print(f"‚úÖ Schema '{schema}' criado com sucesso")

In [0]:
# Display every schema that now exists in the project catalog.
print("\nüìä ESTRUTURA UNITY CATALOG CRIADA:")
print("=" * 60)
schemas_df = spark.sql(f"SHOW SCHEMAS IN {catalog_name}")
schemas_df.show(truncate=False)

## 4️⃣ Criar Volumes para Armazenamento


In [0]:
# One volume per layer; the two lists are paired positionally
# (bronze -> raw_data, silver -> processed_data, gold -> analytics_data).
volumes = ["raw_data", "processed_data", "analytics_data"]
schemas_for_volumes = ["bronze", "silver", "gold"]

for schema, volume in zip(schemas_for_volumes, volumes):
    try:
        create_volume_sql = f"""
            CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema}.{volume}
        """
        spark.sql(create_volume_sql)
    except Exception as volume_err:
        # Report the failure and continue with the remaining volumes.
        print(f"‚ö†Ô∏è Erro ao criar volume '{schema}.{volume}': {volume_err}")
    else:
        print(f"‚úÖ Volume '{schema}.{volume}' criado com sucesso")

In [0]:
# List the volumes registered under each medallion schema.
print("\nüìÅ VOLUMES CRIADOS:")
print("="*60)
for schema in schemas:
    try:
        spark.sql(f"SHOW VOLUMES IN {catalog_name}.{schema}").show(truncate=False)
    except Exception as e:
        # Best-effort listing: keep going with the remaining schemas, but
        # report the failure instead of hiding it. The previous bare
        # `except: pass` also swallowed KeyboardInterrupt/SystemExit, so the
        # cell could not be interrupted cleanly.
        print(f"‚ö†Ô∏è Erro ao listar volumes em '{schema}': {e}")
    

## 5️⃣ Criar Tabela de Controle de Execução


In [0]:
# Make the bronze schema current; the control table lives there.
spark.sql(f"USE {catalog_name}.bronze")

# Column layout of the pipeline execution log. The first three columns are
# declared non-nullable; the metrics and the error message are optional.
schema_control = StructType([
    StructField("pipeline_name", StringType(), False),
    StructField("execution_date", TimestampType(), False),
    StructField("status", StringType(), False),
    StructField("records_processed", LongType(), True),
    StructField("execution_time_seconds", DoubleType(), True),
    StructField("error_message", StringType(), True)
])

# Empty DataFrame that only carries the schema.
df_control = spark.createDataFrame([], schema_control)

# Create the Delta table in Unity Catalog only when it does not exist yet.
# mode("ignore") makes this step idempotent like the CREATE ... IF NOT EXISTS
# statements used for the catalog, schemas and volumes: the previous
# mode("overwrite") erased the whole execution history every time the setup
# notebook was re-run.
df_control.write \
    .format("delta") \
    .mode("ignore") \
    .saveAsTable(f"{catalog_name}.bronze.pipeline_control")

print("‚úÖ Tabela de controle 'pipeline_control' criada!")


In [0]:
# Print the column layout of the control table to confirm it exists.
control_columns_df = spark.sql(f"DESCRIBE TABLE {catalog_name}.bronze.pipeline_control")
control_columns_df.show(truncate=False)

## 6️⃣ Testar Acesso à API do DATASUS


In [0]:
# Connectivity check: a single GET against the OpenDataSUS dataset page.
print("üîç Testando conex√£o com OpenDataSUS...")
print("=" * 60)

url_test = "https://opendatasus.saude.gov.br/dataset"

try:
    # The 10-second timeout keeps the cell from hanging on an unreachable host.
    response = requests.get(url_test, timeout=10)
    if response.status_code != 200:
        print(f"‚ö†Ô∏è Status code inesperado: {response.status_code}")
    else:
        print("‚úÖ Conex√£o com OpenDataSUS funcionando!")
        print(f"   Status Code: {response.status_code}")
except Exception as conn_err:
    # Any failure is reported, not raised, so the notebook keeps running.
    print(f"‚ùå Erro na conex√£o: {conn_err}")
    print("   Verifique sua conex√£o de internet")

## 7️⃣ Criar Funções Utilitárias

In [0]:
def log_pipeline_execution(pipeline_name: str, status: str, records: int = 0,
                           exec_time: float = 0, error=None) -> None:
    """
    Append one execution record to the Unity Catalog control table.

    Builds a single-row DataFrame stamped with ``datetime.now()`` and appends
    it to ``{catalog_name}.bronze.pipeline_control``. ``spark`` and
    ``catalog_name`` are read from the notebook's global scope.

    Args:
        pipeline_name (str): Name of the pipeline that was executed.
        status (str): Execution status (SUCCESS, FAILED, RUNNING).
        records (int): Number of records processed. Converted with ``int()``;
            ``None`` is stored as NULL.
        exec_time (float): Execution time in seconds. Converted with
            ``float()``; ``None`` is stored as NULL.
        error (str | Exception | None): Error message, if any. Converted with
            ``str()`` so an exception object can be passed directly.

    Raises:
        ValueError, TypeError: If ``records`` or ``exec_time`` cannot be
            converted to a number.
    """
    # Coerce to the exact Python types the schema declares. Classic PySpark
    # schema verification rejects an int for a DoubleType field, so the
    # default exec_time=0 (and any whole-number duration) could fail without
    # the float() conversion; the same applies to a non-str `error`.
    log_data = [(
        pipeline_name,
        datetime.now(),
        status,
        None if records is None else int(records),
        None if exec_time is None else float(exec_time),
        None if error is None else str(error),
    )]

    # Same column layout as the pipeline_control table.
    schema_control = StructType([
        StructField("pipeline_name", StringType(), False),
        StructField("execution_date", TimestampType(), False),
        StructField("status", StringType(), False),
        StructField("records_processed", LongType(), True),
        StructField("execution_time_seconds", DoubleType(), True),
        StructField("error_message", StringType(), True)
    ])

    df_log = spark.createDataFrame(log_data, schema_control)

    # Append to the Unity Catalog table; existing rows are kept.
    df_log.write \
        .format("delta") \
        .mode("append") \
        .saveAsTable(f"{catalog_name}.bronze.pipeline_control")

    print(f"‚úÖ Log registrado: {pipeline_name} - {status}")

# Smoke-test the helper by logging this setup run.
log_pipeline_execution("setup_inicial", "SUCCESS", records=0, exec_time=1.2)

In [0]:
# Display every row currently stored in the control table.
control_log_df = spark.sql(f"SELECT * FROM {catalog_name}.bronze.pipeline_control")
control_log_df.show(truncate=False)

## 8️⃣ Configurar Widget de Parâmetros (Opcional)

In [0]:
# Notebook widgets that make it easier to parameterize test runs:
# a free-text "environment" and a "log_level" dropdown defaulting to INFO.
log_level_choices = ["DEBUG", "INFO", "WARNING", "ERROR"]
dbutils.widgets.text("environment", "development", "Ambiente")
dbutils.widgets.dropdown("log_level", "INFO", log_level_choices)

# Read the values back to confirm the widgets were registered.
print("‚úÖ Widgets de configura√ß√£o criados!")
print(f"   Ambiente: {dbutils.widgets.get('environment')}")
print(f"   Log Level: {dbutils.widgets.get('log_level')}")

## 9️⃣ Resumo da Configuração

In [0]:
# Final recap of what the setup notebook configured.
summary_rule = "=" * 70
print(summary_rule)
print("üéâ SETUP CONCLU√çDO COM SUCESSO - DATABRICKS FREE EDITION!")
print(summary_rule)

# The counts are the lengths of the configured lists, not a check of what
# actually exists in the catalog.
print("\nüìä RESUMO DA CONFIGURA√á√ÉO:")
summary_lines = (
    f"‚úÖ Catalog: {catalog_name}",
    f"‚úÖ Schemas criados: {len(schemas)}",
    f"‚úÖ Volumes criados: {len(volumes)}",
    f"‚úÖ Tabelas de controle: 1",
    f"‚úÖ Fun√ß√µes utilit√°rias: 1",
    f"‚úÖ Compute: Serverless (autom√°tico)",
)
for summary_line in summary_lines:
    print(summary_line)

# Fully qualified name of each medallion schema.
print("\nüìÅ ESTRUTURA UNITY CATALOG:")
for schema in schemas:
    print(f"   {catalog_name}.{schema}/")

# /Volumes paths of the three data volumes.
print("\nüóÇÔ∏è VOLUMES PARA DADOS:")
volume_path_lines = (
    f"   Raw Data: /Volumes/{catalog_name}/bronze/raw_data/",
    f"   Processed: /Volumes/{catalog_name}/silver/processed_data/",
    f"   Analytics: /Volumes/{catalog_name}/gold/analytics_data/",
)
for volume_path_line in volume_path_lines:
    print(volume_path_line)

print("\n" + summary_rule)

## 🔟 Comandos Úteis Unity Catalog

In [0]:
# Cheat sheet of Unity Catalog commands for future reference. The commands
# are only printed here, never executed.
print("üìö COMANDOS √öTEIS UNITY CATALOG:")
print("=" * 60)
reference_commands = (
    f"USE CATALOG {catalog_name};",
    f"USE SCHEMA {catalog_name}.bronze;",
    f"SHOW TABLES IN {catalog_name}.bronze;",
    f"SHOW VOLUMES IN {catalog_name}.bronze;",
    f"DESCRIBE TABLE {catalog_name}.bronze.pipeline_control;",
)
for reference_command in reference_commands:
    print(reference_command)

# Example of reading a file stored in a volume.
print("\n# Acessar arquivo em Volume:")
print(f"df = spark.read.csv('/Volumes/{catalog_name}/bronze/raw_data/arquivo.csv')")