# Setup DBT

In [None]:
import os

from google.colab import auth
from google.colab import drive
from google.cloud import bigquery

In [None]:
# Mount Google Drive (the dbt project is stored under /content/drive/MyDrive)
print("Montando o Google Drive...")
drive.mount('/content/drive')

Montando o Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# 3️⃣ Authenticate the Colab user manually via OAuth
auth.authenticate_user()
print("🔐 Autenticado com sucesso!")

🔐 Autenticado com sucesso!


In [None]:
# Location on Drive where the dbt project is saved
DBT_DIR = "/content/drive/MyDrive/dbt_projects"
PROJECT_NAME = "dbt_meu_projeto"
# Full project path: <DBT_DIR>/<PROJECT_NAME>
PROJECT_PATH = "/".join((DBT_DIR, PROJECT_NAME))

In [None]:
# Create the project directory (and DBT_DIR along with it) if it does not exist yet.
# NOTE(review): the message below reports DBT_DIR although PROJECT_PATH is what gets
# created; creating PROJECT_PATH before `dbt init` may also be why init reports that the
# project already exists — confirm.
os.makedirs(PROJECT_PATH, exist_ok=True)
print(f"📁 Diretório do dbt criado em: {DBT_DIR}")

📁 Diretório do dbt criado em: /content/drive/MyDrive/dbt_projects


In [None]:
# 2️⃣ Instalar o dbt para BigQuery
!pip install dbt-bigquery --quiet

In [None]:
# 5️⃣ Change to the base directory and initialize the dbt project
%cd {DBT_DIR}
!dbt init {PROJECT_NAME}

/content/drive/MyDrive/dbt_projects
[0m01:04:23  Running with dbt=1.9.4
[0m01:04:24  A project called dbt_meu_projeto already exists here.


In [None]:
# 6️⃣ Write profiles.yml: the BigQuery connection used by dbt, authenticated via OAuth
profiles_content = """
meu_projeto_dbt:
  target: dev
  outputs:
    dev:
      type: bigquery
      method: oauth
      project: "meu-projeto-dbt"
      dataset: "meu_dataset"
      threads: 4
      location: US
"""

# Make sure the ~/.dbt directory exists before writing into it
dbt_home = "/root/.dbt"
os.makedirs(dbt_home, exist_ok=True)

# Primary copy of profiles.yml
home_profiles_path = f"{dbt_home}/profiles.yml"
with open(home_profiles_path, "w") as profiles_file:
    profiles_file.write(profiles_content)
print("✅ profiles.yml configurado para BigQuery via OAuth")

# 2️⃣ Second copy inside the project folder on Drive, kept as a backup
backup_path = f"{PROJECT_PATH}/profiles.yml"
with open(backup_path, "w") as backup_file:
    backup_file.write(profiles_content)
print(f"✅ Cópia de backup salva em: {backup_path}")

✅ profiles.yml configurado para BigQuery via OAuth
✅ Cópia de backup salva em: /content/drive/MyDrive/dbt_projects/dbt_meu_projeto/profiles.yml


In [None]:
# Path where dbt_project.yml is saved
dbt_project_yml_path = f"{PROJECT_PATH}/dbt_project.yml"

# Contents of dbt_project.yml.
# clean-targets lists "dbt_packages" (the directory where dbt >= 1.0 installs packages)
# instead of the legacy "dbt_modules", so that `dbt clean` also removes installed packages.
dbt_project_content = """
name: "meu_projeto_dbt"
version: "1.0"
profile: "meu_projeto_dbt"
config-version: 2

model-paths: ["models"]
macro-paths: ["macros"]

target-path: "target"
clean-targets: ["target", "dbt_packages"]
"""

# Write the file into the project directory
with open(dbt_project_yml_path, "w") as f:
    f.write(dbt_project_content)

print(f"✅ dbt_project.yml criado em: {dbt_project_yml_path}")


✅ dbt_project.yml criado em: /content/drive/MyDrive/dbt_projects/dbt_meu_projeto/dbt_project.yml


In [None]:
# Contents of packages.yml: dbt_utils constrained to >=0.8.0 and <1.0.0
packages_content = """
packages:
  - package: dbt-labs/dbt_utils
    version: [">=0.8.0", "<1.0.0"]
"""

# Save it at the root of the project
packages_yml_path = f"{PROJECT_PATH}/packages.yml"
with open(packages_yml_path, "w") as packages_file:
    packages_file.write(packages_content)

print("✅ packages.yml criado com sucesso")

✅ packages.yml criado com sucesso


In [None]:
# Install the dbt packages declared in packages.yml (runs `dbt deps` inside the project directory)
!cd {PROJECT_PATH} && dbt deps

[0m01:04:28  Running with dbt=1.9.4
[0m01:04:30  Installing dbt-labs/dbt_utils
[0m01:04:34  Installed from version 0.9.6
[0m01:04:34  Updated version available: 1.3.0
[0m01:04:34  
[0m01:04:34  Updates available for packages: ['dbt-labs/dbt_utils']                 
Update your versions in packages.yml, then run dbt deps


In [None]:
# 7️⃣ Test the dbt connection to BigQuery
%cd {PROJECT_PATH}
!dbt debug

/content/drive/MyDrive/dbt_projects/dbt_meu_projeto
[0m01:04:41  Running with dbt=1.9.4
[0m01:04:41  dbt version: 1.9.4
[0m01:04:41  python version: 3.11.12
[0m01:04:41  python path: /usr/bin/python3
[0m01:04:41  os info: Linux-6.1.85+-x86_64-with-glibc2.35
[0m01:04:44  Using profiles dir at /content/drive/MyDrive/dbt_projects/dbt_meu_projeto
[0m01:04:45  Using profiles.yml file at /content/drive/MyDrive/dbt_projects/dbt_meu_projeto/profiles.yml
[0m01:04:45  Using dbt_project.yml file at /content/drive/MyDrive/dbt_projects/dbt_meu_projeto/dbt_project.yml
[0m01:04:45  adapter type: bigquery
[0m01:04:45  adapter version: 1.9.1
[0m01:04:45  Configuration:
[0m01:04:45    profiles.yml file [[32mOK found and valid[0m]
[0m01:04:45    dbt_project.yml file [[32mOK found and valid[0m]
[0m01:04:45  Required dependencies:
[0m01:04:45   - git [[32mOK found[0m]

[0m01:04:45  Connection:
[0m01:04:45    method: oauth
[0m01:04:45    database: meu-projeto-dbt
[0m01:04:45    exec

# Estrutura de diretórios

In [None]:
# Directory layout of the project (paths relative to the project root)
folders = ["models/", "models/tests", "macros"]

In [None]:
# Switch the working directory to the project directory
os.chdir(PROJECT_PATH)

In [None]:
# ✅ Create every project folder (no error if it already exists)
for folder in folders:
    os.makedirs(os.path.join(PROJECT_PATH, folder), exist_ok=True)

# Dados brutos que vamos trabalhar

In [None]:
# BigQuery client bound to the project
client = bigquery.Client(project="meu-projeto-dbt")

# Fully-qualified ID of the raw table, and its metadata (including the schema)
table_id = "meu-projeto-dbt.sample_data.marketing_data"
table = client.get_table(table_id)

# Show the name and type of each column
for field in table.schema:
    print(f"{field.name} ({field.field_type})")

# Column names of the table, in schema order
all_columns = [field.name for field in table.schema]

date (STRING)
tv (FLOAT)
radio (FLOAT)
ooh (FLOAT)
meta (FLOAT)
google (FLOAT)
tiktok (FLOAT)
digital (FLOAT)
sales (FLOAT)
holiday (FLOAT)


In [None]:
# Preview the first rows of the raw table

query = """
SELECT * FROM `meu-projeto-dbt.sample_data.marketing_data`
LIMIT 5
"""
preview_job = client.query(query)
df = preview_job.to_dataframe()
df

Unnamed: 0,date,tv,radio,ooh,meta,google,tiktok,digital,sales,holiday
0,2023-04-27,0.0,,1084.2,3796.61,7159.81,4019.28,2106.97,7693449.97,0.0
1,2023-08-25,4055.11,,1012.66,4035.08,4607.99,2801.7,1784.53,7512609.44,0.0
2,2023-10-14,3640.98,,934.71,2275.59,4254.67,2727.94,1627.36,5274272.74,0.0
3,2023-11-17,6816.8,,752.11,3965.98,9265.49,4155.98,2098.45,49283224.1,0.0
4,2024-03-19,9435.95,,1296.58,5003.28,10658.48,8675.1,2588.44,24741935.76,0.0


In [None]:
# 📝 Helper that saves generated project files (SQL models, macros and YAML)
def write_sql(path, content):
    """Write ``content`` to ``path`` inside the dbt project.

    Args:
        path: Destination path, joined onto ``PROJECT_PATH``.
        content: Text to write; leading and trailing whitespace is stripped.

    Missing parent directories are created first, and the file is always written
    as UTF-8 instead of relying on the platform's default encoding.
    """
    target_path = os.path.join(PROJECT_PATH, path)
    parent_dir = os.path.dirname(target_path)
    # dirname is empty for a bare file name in the current directory; makedirs("") would raise
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(target_path, "w", encoding="utf-8") as f:
        f.write(content.strip())

In [None]:
# ✅ BigQuery project, dataset and table that hold the raw data
project_id = "meu-projeto-dbt"
dataset_id = "sample_data"
table_name = "marketing_data"

# ✅ Render the dbt sources file and save it under models/
sources_yml = f"""
version: 2

sources:
  - name: sample_data
    database: {project_id}
    schema: {dataset_id}
    tables:
      - name: {table_name}
"""
write_sql("models/src_sample_data.yml", sources_yml)

print("Arquivo src_sample_data.yml criado com sucesso.")

Arquivo src_sample_data.yml criado com sucesso.


# Definição das Macros

In [None]:
# Macro remover_linhas_duplicadas: selects the distinct rows of the given relation

remover_linhas_duplicadas_sql = """
    {% macro remover_linhas_duplicadas(table) %}
      SELECT DISTINCT * FROM {{ table }}
    {% endmacro %}
    """
write_sql("macros/remover_linhas_duplicadas.sql", remover_linhas_duplicadas_sql)

print("✅ Macro `remover_linhas_duplicadas` salva.")

✅ Macro `remover_linhas_duplicadas` salva.


In [None]:
# Macro remover_datas_duplicadas_prioriza_menos_nulos: emits a CTE named `ranked` that
# numbers the rows of `table` within each date, rows with the fewest NULL columns first.
# The column list is read from the model referenced by `ref_col`.
# NOTE(review): the ORDER BY has no tie-breaker, so rows with the same NULL count may be
# ranked in any order — confirm whether that matters.
remover_datas_duplicadas_sql = """
    {% macro remover_datas_duplicadas_prioriza_menos_nulos(table,ref_col) %}
    {%- set cols = dbt_utils.get_filtered_columns_in_relation(ref(ref_col)) -%}

    ranked AS (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY date
                ORDER BY
                    (
                        {%- for col in cols %}
                            (CASE WHEN {{ col }} IS NULL THEN 1 ELSE 0 END){% if not loop.last %} + {% endif %}
                        {%- endfor %}
                    ) ASC
            ) AS row_num
        FROM {{ table }}
        WHERE date IS NOT NULL
    )
    {% endmacro %}
    """
write_sql(
    "macros/remover_datas_duplicadas_prioriza_menos_nulos.sql",
    remover_datas_duplicadas_sql,
)

print("✅ Macro `remover_datas_duplicadas_prioriza_menos_nulos` salva.")


✅ Macro `remover_datas_duplicadas_prioriza_menos_nulos` salva.


In [None]:
# Macro interpolar_sales: adds `sales_interpolado`, which is `sales` when present and
# otherwise the average of the previous and next rows' sales (ordered by date)

interpolar_sales_sql = """
    {% macro interpolar_sales(table_ref) %}
    (
        SELECT
            *,
            IFNULL(
                sales,
                (
                    LAG(sales) OVER (ORDER BY date) + LEAD(sales) OVER (ORDER BY date)
                ) / 2
            ) AS sales_interpolado
        FROM {{ table_ref }}
    )
    {% endmacro %}
    """
write_sql("macros/interpolar_sales.sql", interpolar_sales_sql)

print("✅ Macro `interpolar_sales` salva.")

✅ Macro `interpolar_sales` salva.


In [None]:
# Macro coalesce_todas_colunas: renders `COALESCE(col, valor_padrao) AS col` for every
# column in `colunas`, comma-separated

coalesce_todas_colunas_sql = """
    {% macro coalesce_todas_colunas(colunas, valor_padrao) %}
        {% for col in colunas %}
            COALESCE({{ col }}, {{ valor_padrao }}) AS {{ col }}{{ "," if not loop.last else "" }}
        {% endfor %}
    {% endmacro %}
    """
write_sql("macros/coalesce_todas_colunas.sql", coalesce_todas_colunas_sql)

print("✅ Macro `coalesce_todas_colunas` salva.")

✅ Macro `coalesce_todas_colunas` salva.


# Criação das tabelas

In [None]:
# Columns that must not receive COALESCE; every other column is an investment column
non_investment_cols = {"date", "holiday", "sales"}
investment_cols = [col for col in all_columns if col not in non_investment_cols]

print("Colunas para aplicar coalesce:", investment_cols)

Colunas para aplicar coalesce: ['tv', 'radio', 'ooh', 'meta', 'google', 'tiktok', 'digital']


In [None]:
# Bronze Investment: deduplicated source rows, keeping `date` plus the investment columns

investment_select_list = ', '.join(investment_cols)
bronze_investment_sql = f"""
WITH bronze AS (
    SELECT * FROM {{{{ source('sample_data', 'marketing_data') }}}}
),

    sem_duplicadas AS (
        {{{{ remover_linhas_duplicadas('bronze') }}}}
    )

SELECT
    date,
    {investment_select_list}
FROM sem_duplicadas
"""
write_sql("models/bronze_investment.sql", bronze_investment_sql)

print("Model `bronze_investment` created.")

Model `bronze_investment` created.


In [None]:
# Silver Investment: keeps one row per date from bronze_investment (row_num = 1 of `ranked`)

silver_investment_sql = """
WITH silver AS (
    SELECT * FROM {{ ref('bronze_investment') }}
),

    {{ remover_datas_duplicadas_prioriza_menos_nulos('silver','bronze_investment') }},

    sem_datas_repetidas AS (
    SELECT * FROM ranked WHERE row_num = 1
    )

SELECT
    *
FROM sem_datas_repetidas
"""
write_sql("models/silver_investment.sql", silver_investment_sql)

print("Model `silver_investment` created.")

Model `silver_investment` created.


In [None]:
# Gold Investment

# Pieces of the final SELECT list; `digital` is exposed as `display_video`
investment_select = [
    ",\n    " + (f"{col} AS display_video" if col == "digital" else col)
    for col in investment_cols
]

# Render the gold model. `{investment_cols}` is interpolated as the Python list repr,
# which the template passes to the macro as a list of column names.
gold_investment_sql = f"""
WITH gold AS (
    SELECT * FROM {{{{ ref('silver_investment') }}}}
),

preenchido AS (
    SELECT
        date,
        {{{{ coalesce_todas_colunas({investment_cols}, 0) }}}}
    FROM gold
)

SELECT
    date {''.join(investment_select)}
FROM preenchido
"""
write_sql("models/gold_investment.sql", gold_investment_sql)

print("Model `gold_investment` created.")

Model `gold_investment` created.


In [None]:
# Bronze KPI: deduplicated source rows, keeping only `date` and `sales`

bronze_kpi_sql = """
WITH bronze AS (
    SELECT * FROM {{ source('sample_data', 'marketing_data') }}
),

    sem_duplicadas AS (
        {{ remover_linhas_duplicadas('bronze') }}
    )

SELECT
    date,
    sales
FROM sem_duplicadas
"""
write_sql("models/bronze_kpi.sql", bronze_kpi_sql)

print("Model `bronze_kpi` created.")

Model `bronze_kpi` created.


In [None]:
# Silver KPI: keeps one row per date from bronze_kpi. The macro call emits the `ranked`
# CTE; the row with row_num = 1 of each date is the one kept.

write_sql("models/silver_kpi.sql", """
WITH silver AS (
    SELECT * FROM {{ ref('bronze_kpi') }}
),

    {{ remover_datas_duplicadas_prioriza_menos_nulos('silver','bronze_kpi') }},

    sem_datas_repetidas AS (
    SELECT * FROM ranked WHERE row_num = 1
)

SELECT
    *
FROM sem_datas_repetidas
""")

# The confirmation message names the model actually written (it was garbled before)
print("Model `silver_kpi` created.")

Model `silver_insilver_kpivestment` created.


In [None]:
# Gold KPI: `sales` from silver_kpi, falling back to the interpolated value when NULL

gold_kpi_sql = """
with interpolado AS (
 {{ interpolar_sales(ref('silver_kpi')) }}
)

SELECT
    date,
    case
      when sales is null then sales_interpolado
      else sales
    end as sales
FROM interpolado
"""
write_sql("models/gold_kpi.sql", gold_kpi_sql)

print("Model `gold_kpi` created.")

Model `gold_kpi` created.


In [None]:
# Bronze Auxiliar: deduplicated source rows, keeping only `date` and `holiday`

bronze_auxiliar_sql = """
WITH bronze AS (
    SELECT * FROM {{ source('sample_data', 'marketing_data') }}
),

    sem_duplicadas AS (
        {{ remover_linhas_duplicadas('bronze') }}
    )

SELECT
    date,
    holiday
FROM sem_duplicadas
"""
write_sql("models/bronze_auxiliar.sql", bronze_auxiliar_sql)

print("Model `bronze_auxiliar` created.")

Model `bronze_auxiliar` created.


In [None]:
# Silver Auxiliar: keeps one row per date from bronze_auxiliar (row_num = 1 of `ranked`)

silver_auxiliar_sql = """
WITH silver AS (
    SELECT * FROM {{ ref('bronze_auxiliar') }}
),

{{ remover_datas_duplicadas_prioriza_menos_nulos('silver','bronze_auxiliar') }},

    sem_datas_repetidas AS (
    SELECT * FROM ranked WHERE row_num = 1
    )

SELECT
    *
FROM sem_datas_repetidas
"""
write_sql("models/silver_auxiliar.sql", silver_auxiliar_sql)

print("Model `silver_auxiliar` created.")

Model `silver_auxiliar` created.


In [None]:
# Gold Auxiliar: `holiday` from silver_auxiliar with NULLs replaced by 0

gold_auxiliar_sql = """
WITH gold AS (
    SELECT * FROM {{ ref('silver_auxiliar') }}
),

preenchido AS (
    SELECT
        date,
        {{ coalesce_todas_colunas(['holiday'], 0) }}
    FROM gold
)

SELECT
    *
FROM preenchido
"""
write_sql("models/gold_auxiliar.sql", gold_auxiliar_sql)

print("Model `gold_auxiliar` created.")

Model `gold_auxiliar` created.


# Definindo testes para os dados

In [None]:
# Directory that receives the YAML test definitions for the gold models
tests_path = f"{PROJECT_PATH}/models/tests"

In [None]:
# Tests for gold_investment

# Column names as exposed by gold_investment: `digital` is published as `display_video`.
# A new list is derived instead of editing `investment_cols` in place, so the cells that
# build the investment models from `investment_cols` still work when they are re-run
# after this one (the upstream tables have no `display_video` column).
gold_investment_cols = [
    "display_video" if col == "digital" else col
    for col in investment_cols
]

test_investment = """version: 2

models:
  - name: gold_investment
    tests:
"""

# Model-level tests: every investment column must be non-negative
for col in gold_investment_cols:
    test_investment += f"""      - dbt_utils.expression_is_true:
          expression: "{col} >= 0"
"""

# Column-level tests: no investment column may be NULL
test_investment += "    columns:\n"
for col in gold_investment_cols:
    test_investment += f"""      - name: {col}
        tests:
          - not_null
"""

with open(f"{tests_path}/test_gold_investment.yml", "w") as f:
    f.write(test_investment)

In [None]:
# Tests for gold_kpi: `sales` must be non-negative and never NULL
test_kpi = """
version: 2

models:
  - name: gold_kpi
    tests:
      - dbt_utils.expression_is_true:
          expression: "sales >= 0"
    columns:
      - name: sales
        tests:
          - not_null
"""

kpi_tests_file = f"{tests_path}/test_gold_kpi.yml"
with open(kpi_tests_file, "w") as yml_file:
    yml_file.write(test_kpi)

In [None]:
# Tests for gold_auxiliar: `holiday` must be 0 or 1 and never NULL
test_aux = """
version: 2

models:
  - name: gold_auxiliar
    tests:
      - dbt_utils.expression_is_true:
          expression: "holiday in (0, 1)"
    columns:
      - name: holiday
        tests:
          - not_null
"""

aux_tests_file = f"{tests_path}/test_gold_auxiliar.yml"
with open(aux_tests_file, "w") as yml_file:
    yml_file.write(test_aux)

# Rodar o DBT

In [None]:
# Build all models; relies on the working directory being the project directory
# (set earlier with os.chdir(PROJECT_PATH))
!dbt run

[0m01:02:14  Running with dbt=1.9.4
[0m01:02:17  Registered adapter: bigquery=1.9.1
[0m01:02:19  Found 9 models, 18 data tests, 1 source, 692 macros
[0m01:02:19  
[0m01:02:19  Concurrency: 4 threads (target='dev')
[0m01:02:19  
[0m01:02:19  1 of 9 START sql view model meu_dataset.bronze_auxiliar ........................ [RUN]
[0m01:02:19  2 of 9 START sql view model meu_dataset.bronze_investment ...................... [RUN]
[0m01:02:19  3 of 9 START sql view model meu_dataset.bronze_kpi ............................. [RUN]
[0m01:02:20  3 of 9 OK created sql view model meu_dataset.bronze_kpi ........................ [[32mCREATE VIEW (0 processed)[0m in 0.77s]
[0m01:02:20  1 of 9 OK created sql view model meu_dataset.bronze_auxiliar ................... [[32mCREATE VIEW (0 processed)[0m in 0.77s]
[0m01:02:20  4 of 9 START sql view model meu_dataset.silver_kpi ............................. [RUN]
[0m01:02:20  5 of 9 START sql view model meu_dataset.silver_auxiliar ..........

In [None]:
# Run the data tests defined in the YAML files under models/tests
!dbt test

[0m01:15:02  Running with dbt=1.9.4
[0m01:15:04  Registered adapter: bigquery=1.9.1
[0m01:15:07  Found 9 models, 18 data tests, 1 source, 692 macros
[0m01:15:07  
[0m01:15:07  Concurrency: 4 threads (target='dev')
[0m01:15:07  
[0m01:15:07  1 of 18 START test dbt_utils_expression_is_true_gold_auxiliar_holiday_in_0_1_ .. [RUN]
[0m01:15:07  4 of 18 START test dbt_utils_expression_is_true_gold_investment_meta_0 ......... [RUN]
[0m01:15:07  3 of 18 START test dbt_utils_expression_is_true_gold_investment_google_0 ....... [RUN]
[0m01:15:07  2 of 18 START test dbt_utils_expression_is_true_gold_investment_display_video_0  [RUN]
[0m01:15:08  4 of 18 PASS dbt_utils_expression_is_true_gold_investment_meta_0 ............... [[32mPASS[0m in 1.08s]
[0m01:15:08  5 of 18 START test dbt_utils_expression_is_true_gold_investment_ooh_0 .......... [RUN]
[0m01:15:08  1 of 18 PASS dbt_utils_expression_is_true_gold_auxiliar_holiday_in_0_1_ ........ [[32mPASS[0m in 1.09s]
[0m01:15:08  6 of 18