# Crypto Data Pipeline (CoinGecko API v3 → BigQuery)

Este notebook demonstra um pipeline simples e **reprodutível** para:

1. Coletar **snapshot** de mercado via CoinGecko (`/coins/markets`) e persistir em uma tabela **RAW** (payload JSON) no BigQuery  
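2. Derivar o **Top N** a partir do último snapshot (no código, ordenado pela variação percentual de preço em 24h; o critério pode ser trocado, p. ex., por Market Cap)  
3. Coletar **histórico diário de preço**, market cap e volume (`/coins/{id}/market_chart/range`) para o Top N e persistir em uma tabela FACT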


## 0) Setup (dependências + variáveis de ambiente)


In [2]:
# Se estiver no Colab, descomente:
# !pip install -q google-cloud-bigquery requests python-dotenv

import os
import time
import json
import logging
from datetime import datetime, timezone, timedelta

import requests
import pandas as pd
from google.cloud import bigquery


# Runtime configuration is read from environment variables (shell or Colab):
#   export COINGECKO_API_KEY="..."          # optional (Pro)
#   export COINGECKO_IS_PRO="0"             # "1" for the Pro API, "0" for the Public API
#   export GCP_PROJECT_ID="..."
#   export BQ_DATASET_ID="crypto_pipeline"
#
COINGECKO_API_KEY = os.environ.get("COINGECKO_API_KEY", "")
# Only the exact string "1" switches the pipeline to the Pro API.
COINGECKO_IS_PRO = os.environ.get("COINGECKO_IS_PRO", "0") == "1"

# BigQuery destination; both values fall back to the defaults below when unset.
PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "crypto-data-pipeline-488018")
DATASET_ID = os.environ.get("BQ_DATASET_ID", "crypto_pipeline")

# The Pro and Public APIs are reached through different base URLs.
if COINGECKO_IS_PRO:
    BASE_URL = "https://pro-api.coingecko.com/api/v3"
else:
    BASE_URL = "https://api.coingecko.com/api/v3"

logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("crypto-pipeline")

# --- BigQuery authentication ---
# Inside Colab this opens the interactive login prompt. Outside Colab the
# google.colab package cannot be imported, so the client below relies on
# Application Default Credentials (gcloud) or GOOGLE_APPLICATION_CREDENTIALS.
try:
    from google.colab import auth
except ImportError:
    print("Não é Colab. Garanta ADC (gcloud) ou GOOGLE_APPLICATION_CREDENTIALS.")
else:
    # The login stays best-effort, but a real authentication failure is now
    # reported as such instead of being mislabelled as "not Colab".
    try:
        auth.authenticate_user()
    except Exception:
        logger.exception(
            "Falha na autenticação do Colab; usando ADC/GOOGLE_APPLICATION_CREDENTIALS se disponíveis."
        )
    else:
        print("Colab auth OK")

client = bigquery.Client(project=PROJECT_ID)



Colab auth OK


## 1) Funções utilitárias (HTTP com retry e parse)


In [3]:
# Status codes that http_get_json retries with backoff, and the ones it
# reports immediately as an authentication/permission error.
RETRY_STATUS = {429, 500, 502, 503, 504}
FORBIDDEN_STATUS = {401, 403}

# Headers sent with every request; the Pro key header is attached only when
# Pro mode is enabled and a key was actually provided.
HEADERS = {"accept": "application/json"}
if COINGECKO_API_KEY and COINGECKO_IS_PRO:
    HEADERS.update({"x-cg-pro-api-key": COINGECKO_API_KEY})

def http_get_json(
    url: str,
    params: dict | None = None,
    timeout: int = 30,
    max_retries: int = 6,
    min_interval_s: float = 0.0,
):
    """GET a JSON document with client-side throttling and exponential backoff.

    Args:
        url: Absolute URL to request.
        params: Query-string parameters; ``None`` means no parameters.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        min_interval_s: Minimum spacing, in seconds, between the start of
            consecutive requests issued through this function. ``0`` (the
            default) disables throttling.

    Returns:
        The decoded JSON body of the first response whose status is neither
        in ``RETRY_STATUS`` nor in ``FORBIDDEN_STATUS`` and is not an error.

    Raises:
        RuntimeError: On a status in ``FORBIDDEN_STATUS``, or when every
            attempt returned a status in ``RETRY_STATUS``.
        requests.HTTPError: For any other error status (via
            ``raise_for_status``).
    """
    params = params or {}
    last_err = None

    for attempt in range(max_retries):
        # Client-side throttle: the start time of the previous request is kept
        # on the function object so the spacing also holds across calls.
        last_started = getattr(http_get_json, "_last_request_ts", None)
        if min_interval_s > 0 and last_started is not None:
            remaining_s = min_interval_s - (time.monotonic() - last_started)
            if remaining_s > 0:
                time.sleep(remaining_s)
        http_get_json._last_request_ts = time.monotonic()

        resp = requests.get(url, headers=HEADERS, params=params, timeout=timeout)

        if resp.status_code in RETRY_STATUS:
            last_err = resp.text[:500]
            if attempt + 1 >= max_retries:
                # No attempts left: skip the backoff sleep and fail below.
                break
            # Exponential backoff: 1.2s, 2.2s, 4.2s, ...
            sleep_s = (2 ** attempt) + 0.2
            logger.warning("HTTP %s em %s | retry em %.1fs | params=%s", resp.status_code, url, sleep_s, params)
            time.sleep(sleep_s)
            continue

        if resp.status_code in FORBIDDEN_STATUS:
            # Retrying cannot fix a credentials/plan problem, so fail fast.
            raise RuntimeError(
                f"Erro de autenticação/permissão ({resp.status_code}). "
                f"Verifique COINGECKO_API_KEY/Plano. Body={resp.text[:500]}"
            )

        resp.raise_for_status()

        return resp.json()

    raise RuntimeError(f"Falha após {max_retries} tentativas. Último erro: {last_err}")


## 2) Criar tabela RAW (snapshot JSON)


In [4]:
RAW_TABLE_ID = f"{PROJECT_ID}.{DATASET_ID}.raw_tb"

# RAW schema, declared as (column, BigQuery type, description); every column
# is REQUIRED.
schema_raw = [
    bigquery.SchemaField(col_name, col_type, mode="REQUIRED", description=col_doc)
    for col_name, col_type, col_doc in (
        ("ingestion_timestamp", "TIMESTAMP", "Timestamp em que o snapshot foi coletado da API."),
        ("source", "STRING", "Identifica a origem do dado."),
        ("payload", "JSON", "Payload bruto retornado pela API CoinGecko."),
    )
]

table = bigquery.Table(RAW_TABLE_ID, schema=schema_raw)
table.description = "Tabela RAW: snapshots JSON da CoinGecko. Mant√©m payload completo para rastreabilidade e reprocessamento."

# exists_ok=True lets the cell be re-run when the table is already there.
table = client.create_table(table, exists_ok=True)
print("RAW pronta:", RAW_TABLE_ID)


RAW pronta: crypto-data-pipeline-488018.crypto_pipeline.raw_tb


## 3) Coletar snapshot completo de assets com paginação e gravar em RAW


In [46]:
def fetch_markets_snapshot(
    vs_currency: str = "usd",
    per_page: int = 250,
    max_pages: int = 3,          # Public API: 2 or 3 is the recommended ceiling
    min_interval_s: float = 15.0, # Public API: 15s keeps it stable and avoids 429
) -> list[dict]:
    """Collect coins with market data from GET /coins/markets, page by page.

    Args:
        vs_currency: Quote currency sent to the API.
        per_page: Number of rows requested per page.
        max_pages: Upper bound on the number of pages requested.
        min_interval_s: Request spacing forwarded to ``http_get_json``.

    Returns:
        All rows from the requested pages, in the order they were received.

    Raises:
        ValueError: If a page response is not a list.
    """
    endpoint = f"{BASE_URL}/coins/markets"
    # Parameters that stay the same for every page.
    fixed_params = {
        "vs_currency": vs_currency,
        "order": "market_cap_desc",
        "per_page": per_page,
    }
    collected: list[dict] = []

    page = 1
    while page <= max_pages:
        query = {
            **fixed_params,
            "page": page,
            "sparkline": "false",
            "price_change_percentage": "24h",
        }
        page_rows = http_get_json(endpoint, params=query, min_interval_s=min_interval_s)

        if not isinstance(page_rows, list):
            raise ValueError(f"Esperava lista no /coins/markets, veio: {type(page_rows)}")

        collected += page_rows
        logger.info("P√°gina %s: %s registros (acumulado=%s)", page, len(page_rows), len(collected))

        # A page shorter than per_page means there is nothing left to fetch.
        if len(page_rows) < per_page:
            break
        page += 1

    return collected

assets = fetch_markets_snapshot(
    per_page=250,
    max_pages=3,
    min_interval_s=15.0,
)
# Sanity check: total row count plus the ids of the first and last rows.
print("Total assets (snapshot):", len(assets))
if assets:
    print("Primeiro id:", assets[0]["id"])
    print("√öltimo id:", assets[-1]["id"])
else:
    print("Primeiro id:", None)
    print("√öltimo id:", None)

INFO:crypto-pipeline:P√°gina 1: 250 registros (acumulado=250)
INFO:crypto-pipeline:P√°gina 2: 250 registros (acumulado=500)
INFO:crypto-pipeline:P√°gina 3: 250 registros (acumulado=750)


Total assets (snapshot): 750
Primeiro id: bitcoin
√öltimo id: trillions


In [48]:
# Load the snapshot into a DataFrame to inspect the structure and availability of the data
df_assets = pd.DataFrame(assets)

df_assets.head()

Unnamed: 0,id,symbol,name,image,current_price,market_cap,market_cap_rank,fully_diluted_valuation,total_volume,high_24h,...,max_supply,ath,ath_change_percentage,ath_date,atl,atl_change_percentage,atl_date,roi,last_updated,price_change_percentage_24h_in_currency
0,bitcoin,btc,Bitcoin,https://coin-images.coingecko.com/coins/images...,68441.0,1367818392728,1,1367818000000.0,35800170000.0,68524.0,...,21000000.0,126080.0,-45.71588,2025-10-06T18:57:42.558Z,67.81,100832.6,2013-07-06T00:00:00.000Z,,2026-02-21T15:09:00.460Z,1.611259
1,ethereum,eth,Ethereum,https://coin-images.coingecko.com/coins/images...,1987.6,239767042192,2,239767000000.0,15649620000.0,1990.04,...,,4946.05,-59.81434,2025-08-24T19:21:03.333Z,0.432979,458952.9,2015-10-20T00:00:00.000Z,"{'times': 37.83102559568848, 'currency': 'btc'...",2026-02-21T15:09:00.473Z,1.412652
2,tether,usdt,Tether,https://coin-images.coingecko.com/coins/images...,0.99973,183685277164,3,189150600000.0,57268620000.0,0.999891,...,,1.32,-24.43978,2018-07-24T00:00:00.000Z,0.572521,74.61956,2015-03-02T00:00:00.000Z,,2026-02-21T15:08:55.953Z,0.007244
3,ripple,xrp,XRP,https://coin-images.coingecko.com/coins/images...,1.45,88724890127,4,145385600000.0,2097297000.0,1.46,...,100000000000.0,3.65,-60.13895,2025-07-18T03:40:53.808Z,0.002686,54009.19,2014-05-22T00:00:00.000Z,,2026-02-21T15:08:50.469Z,2.277485
4,binancecoin,bnb,BNB,https://coin-images.coingecko.com/coins/images...,632.61,86209463297,5,86209460000.0,892040100.0,633.75,...,200000000.0,1369.99,-53.8238,2025-10-13T08:41:24.131Z,0.039818,1588666.0,2017-10-19T00:00:00.000Z,,2026-02-21T15:09:00.369Z,2.952442


In [21]:
# Wrap the collected assets in a snapshot envelope and load it into the RAW
# layer as a single row.
now_utc = datetime.now(timezone.utc)

snapshot_payload = dict(
    meta=dict(
        source="coingecko_api_v3",
        collected_at=now_utc.isoformat(),
        count=len(assets),
    ),
    data=assets,
)

rows_to_insert = [
    dict(
        ingestion_timestamp=now_utc.isoformat(),
        source="coingecko_api_v3",
        payload=snapshot_payload,
    )
]

job = client.load_table_from_json(rows_to_insert, RAW_TABLE_ID)
job.result()  # wait for the load job to finish before reporting success
print("Snapshot inserido na RAW:", RAW_TABLE_ID)


Snapshot inserido na RAW: crypto-data-pipeline-488018.crypto_pipeline.raw_tb


## 4) Derivar Top N por **price_change_percentage_24h_in_currency** a partir do último snapshot (SQL)
A camada RAW já está ordenada por market cap (USD); ainda assim, o Top N é calculado via SQL, para fins de visualização, e a variável de ordenação pode ser alterada.


In [53]:
TOP_N = 5

# The query keeps only the most recent RAW row for this source
# (QUALIFY ROW_NUMBER), unnests its `data` array into one row per asset, and
# returns the TOP_N assets with the highest 24h price change (NULLs last).
q_topn = f"""
WITH last_snapshot AS (
  SELECT payload
  FROM `{RAW_TABLE_ID}`
  WHERE 1=1
    AND source = "coingecko_api_v3"
  QUALIFY ROW_NUMBER() OVER (ORDER BY ingestion_timestamp DESC) = 1
),
assets AS (
  SELECT
    JSON_VALUE(a, '$.id') AS crypto_id,
    SAFE_CAST(JSON_VALUE(a, '$.market_cap') AS BIGNUMERIC) AS market_cap_usd,
    SAFE_CAST(JSON_VALUE(a, '$.price_change_percentage_24h_in_currency') AS FLOAT64) AS price_change_percentage_24h_in_currency,
  FROM last_snapshot,
  UNNEST(JSON_QUERY_ARRAY(payload, '$.data')) AS a
)
SELECT
  crypto_id,
  price_change_percentage_24h_in_currency
FROM assets
WHERE crypto_id IS NOT NULL
ORDER BY price_change_percentage_24h_in_currency DESC NULLS LAST
LIMIT {TOP_N}
"""

df_top = client.query(q_topn).to_dataframe()
display(df_top)

# Non-null ids as plain strings; the history collection iterates over them.
top_ids = list(df_top["crypto_id"].dropna().astype(str))
print("Top IDs:", top_ids)


Unnamed: 0,crypto_id,price_change_percentage_24h_in_currency
0,gamer-tag,50.879901
1,greyhunt,49.975791
2,radix,36.429087
3,singularry,30.076238
4,ribbita-by-virtuals,27.701592


Top IDs: ['gamer-tag', 'greyhunt', 'radix', 'singularry', 'ribbita-by-virtuals']


## 5) Coletar histórico diário de preço | Market Cap | Volume (`/coins/{id}/market_chart/range`) para Top N


In [56]:
def get_history_range_df(
    coin_id: str,
    vs_currency: str,
    start_unix: int,
    end_unix: int,
    min_interval_s: float = 20.0,
) -> pd.DataFrame:
    """Fetch GET /coins/{id}/market_chart/range and flatten it into one DataFrame.

    Args:
        coin_id: Coin identifier placed in the URL path.
        vs_currency: Quote currency sent to the API.
        start_unix: Window start, sent as the ``from`` parameter.
        end_unix: Window end, sent as the ``to`` parameter.
        min_interval_s: Request spacing forwarded to ``http_get_json``.

    Returns:
        A DataFrame with the columns
        ``timestamp_ms | price_usd | market_cap_usd | volume_usd | datetime_utc``.
        It has no rows when the response carries no prices.

    Note (429):
        The Public API rate limit is low and ``min_interval_s`` sets the pace.
        On 429 errors, raise ``min_interval_s`` (e.g. 25-30s) and/or request
        fewer coins.
    """
    response = http_get_json(
        f"{BASE_URL}/coins/{coin_id}/market_chart/range",
        params={"vs_currency": vs_currency, "from": start_unix, "to": end_unix},
        min_interval_s=min_interval_s,
    )

    # Each series arrives as [[timestamp_ms, value], ...].
    prices = pd.DataFrame(response.get("prices", []), columns=["timestamp_ms", "price_usd"])
    if prices.empty:
        return pd.DataFrame(columns=["timestamp_ms", "price_usd", "market_cap_usd", "volume_usd", "datetime_utc"])

    market_caps = pd.DataFrame(response.get("market_caps", []), columns=["timestamp_ms", "market_cap_usd"])
    volumes = pd.DataFrame(response.get("total_volumes", []), columns=["timestamp_ms", "volume_usd"])

    # Prices drive the result: the other two series are left-joined on the timestamp.
    merged = prices.merge(market_caps, on="timestamp_ms", how="left")
    merged = merged.merge(volumes, on="timestamp_ms", how="left")
    merged["datetime_utc"] = pd.to_datetime(merged["timestamp_ms"], unit="ms", utc=True)
    return merged


# -------------------------
# Collection window: from 20 days before now up to now (UTC)
# -------------------------
end_dt = datetime.now(timezone.utc)
start_dt = end_dt - timedelta(days=20)
start_unix = int(start_dt.timestamp())
end_unix = int(end_dt.timestamp())

rows = []

for i, coin_id in enumerate(top_ids, start=1):
    try:
        df_coin = get_history_range_df(
            coin_id=coin_id,
            vs_currency="usd",
            start_unix=start_unix,
            end_unix=end_unix,
            min_interval_s=30.0,
        )

        if df_coin.empty:
            logger.warning("Sem dados para %s", coin_id)
            continue

        # One row per day: after sorting by timestamp, keep the last point of each date.
        df_coin["date"] = df_coin["datetime_utc"].dt.date
        ordered = df_coin.sort_values("timestamp_ms")
        last_per_day = ordered.groupby("date", as_index=False).tail(1)
        df_daily = last_per_day[["date", "price_usd", "market_cap_usd", "volume_usd"]]

        # Turn the daily rows into records tagged with the coin they belong to.
        rows.extend({**rec, "crypto_id": coin_id} for rec in df_daily.to_dict("records"))

        logger.info("[%s/%s] %s: %s dias", i, len(top_ids), coin_id, len(df_daily))

    except Exception as e:
        # Best-effort: a failure for one coin is logged and the loop moves on.
        logger.exception("Falha ao coletar hist√≥rico de %s: %s", coin_id, e)

df_hist = pd.DataFrame(rows)


if not df_hist.empty:
    # Normalise column types before the data is inspected and loaded.
    df_hist["date"] = pd.to_datetime(df_hist["date"]).dt.date
    for col in ("price_usd", "market_cap_usd", "volume_usd"):
        df_hist[col] = pd.to_numeric(df_hist[col], errors="coerce")

display(df_hist.head())
print("Linhas hist√≥rico:", len(df_hist))

INFO:crypto-pipeline:[1/5] gamer-tag: 9 dias
INFO:crypto-pipeline:[2/5] greyhunt: 18 dias


KeyboardInterrupt: 

## 6) Criar tabela FACT e carregar dados (BigQuery)


In [42]:
FACT_TABLE_ID = f"{PROJECT_ID}.{DATASET_ID}.fact_coin_history_daily"

# FACT schema, declared as (column, BigQuery type, mode, description).
schema_fact = [
    bigquery.SchemaField(col_name, col_type, mode=col_mode, description=col_doc)
    for col_name, col_type, col_mode, col_doc in (
        (
            "crypto_id", "STRING", "REQUIRED",
            "Identificador √∫nico da criptomoeda na CoinGecko (ex: bitcoin, ethereum).",
        ),
        (
            "date", "DATE", "REQUIRED",
            "Data de refer√™ncia (UTC) do √∫ltimo pre√ßo dispon√≠vel no dia.",
        ),
        (
            "price_usd", "BIGNUMERIC", "NULLABLE",
            "Pre√ßo em USD no fechamento do dia (√∫ltimo timestamp dispon√≠vel).",
        ),
        (
            "market_cap_usd", "BIGNUMERIC", "NULLABLE",
            "Market cap em USD no fechamento do dia.",
        ),
        (
            "volume_usd", "BIGNUMERIC", "NULLABLE",
            "Volume negociado em USD no fechamento do dia.",
        ),
    )
]

fact_table = bigquery.Table(FACT_TABLE_ID, schema=schema_fact)
fact_table.description = (
    "Tabela FACT: hist√≥rico di√°rio de criptomoedas derivado da API CoinGecko. "
    "Cont√©m pre√ßo, market cap e volume di√°rio em USD por crypto_id."
)

# Daily partitions keyed on the `date` column.
fact_table.time_partitioning = bigquery.TimePartitioning(
    field="date",
    type_=bigquery.TimePartitioningType.DAY,
)

# Cluster by crypto_id (helps filtering/grouping by coin).
fact_table.clustering_fields = ["crypto_id"]

fact_table = client.create_table(fact_table, exists_ok=True)
print("FACT pronta (particionada e clusterizada):", FACT_TABLE_ID)

FACT pronta (particionada e clusterizada): crypto-data-pipeline-488018.crypto_pipeline.fact_coin_history_daily


## 7) Carga Manual na tabela FACT


In [55]:
# In the Sandbox the simplest option is to replace everything on each run during development.
# In production (with billing), evolve this into stage + MERGE.
WRITE_MODE = "TRUNCATE"

if not df_hist.empty:
    load_columns = ["crypto_id", "date", "price_usd", "market_cap_usd", "volume_usd"]
    df_load = df_hist[load_columns].copy()

    # Make sure the columns carry the expected types before loading.
    df_load["date"] = pd.to_datetime(df_load["date"]).dt.date
    for metric in ("price_usd", "market_cap_usd", "volume_usd"):
        df_load[metric] = pd.to_numeric(df_load[metric], errors="coerce")

    # Any WRITE_MODE other than "TRUNCATE" appends to the table.
    if WRITE_MODE == "TRUNCATE":
        disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
    else:
        disposition = bigquery.WriteDisposition.WRITE_APPEND

    job_config = bigquery.LoadJobConfig(
        create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
        write_disposition=disposition,
    )

    job = client.load_table_from_dataframe(df_load, FACT_TABLE_ID, job_config=job_config)
    job.result()  # wait for the load job to finish before reporting success

    print("Carga conclu√≠da:", FACT_TABLE_ID, "| linhas:", len(df_load))
else:
    print("df_hist vazio ‚Äî nada para carregar.")

Carga conclu√≠da: crypto-data-pipeline-488018.crypto_pipeline.fact_coin_history_daily | linhas: 90


## 8) Query simples (registro diário das moedas no dia 2026-02-21 na FACT)


In [59]:
# Reference date (YYYY-MM-DD) for the example query; change it to inspect another day.
EXAMPLE_DATE = "2026-02-21"

# The date is bound as a typed query parameter rather than pasted into the SQL,
# so the statement stays the same whatever day is inspected.
q_latest = f"""
SELECT
  date,
  crypto_id,
  price_usd,
  market_cap_usd,
  volume_usd
FROM `{FACT_TABLE_ID}`
WHERE date = @target_date
"""

example_job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter(
            "target_date",
            "DATE",
            # strptime also validates the format before the query is sent.
            datetime.strptime(EXAMPLE_DATE, "%Y-%m-%d").date(),
        ),
    ]
)

df_example = client.query(q_latest, job_config=example_job_config).to_dataframe()
display(df_example)

Unnamed: 0,date,crypto_id,price_usd,market_cap_usd,volume_usd
0,2026-02-21,singularry,0.039691,39686500.0,847993.4
1,2026-02-21,radix,0.002474,33011940.0,583232.7
2,2026-02-21,ribbita-by-virtuals,0.177921,178208800.0,12483560.0
3,2026-02-21,greyhunt,5.997427,116948900.0,89401.49
