# Crypto Data Pipeline (CoinGecko API v3 → BigQuery)

Este notebook demonstra um pipeline simples e **reprodutível** para:

1. Coletar **snapshot** de mercado via CoinGecko (`/coins/markets`) e persistir em uma tabela **RAW** (payload JSON) no BigQuery  
2. Derivar o **Top N por variação percentual de preço em 24h** (`price_change_percentage_24h_in_currency`) a partir do último snapshot  
3. Coletar **histórico diário de preço** (`/coins/{id}/market_chart/range`) para o Top N e persistir em uma tabela FACT


## 0) Setup (dependências + variáveis de ambiente)


In [22]:
# Se estiver no Colab, descomente:
# !pip install -q google-cloud-bigquery requests python-dotenv

import os
import time
import json
import logging
from datetime import datetime, timezone, timedelta

import requests
import pandas as pd
from google.cloud import bigquery


# Set these in your environment / Colab before running:
#   export COINGECKO_API_KEY="..."          # optional (Pro plan)
#   export COINGECKO_IS_PRO="0"             # "1" for the Pro API, "0" for the Public API
#   export GCP_PROJECT_ID="..."
#   export BQ_DATASET_ID="crypto_pipeline"
#
# Logging is configured up front so every later cell logs in the same format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger("crypto-pipeline")

# CoinGecko credentials: the key defaults to an empty string; the Pro flag is
# true only when the variable is exactly "1".
COINGECKO_API_KEY = os.environ.get("COINGECKO_API_KEY", "")
COINGECKO_IS_PRO = os.environ.get("COINGECKO_IS_PRO", "0") == "1"

# BigQuery destination (project and dataset), with the notebook's defaults.
PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "crypto-data-pipeline-488018")
DATASET_ID = os.environ.get("BQ_DATASET_ID", "crypto_pipeline")

# The Pro and Public plans are served from different hosts.
if COINGECKO_IS_PRO:
    BASE_URL = "https://pro-api.coingecko.com/api/v3"
else:
    BASE_URL = "https://api.coingecko.com/api/v3"

# --- BigQuery authentication ---
# In Colab, authenticate_user() opens the interactive login prompt.
# If google.colab cannot be imported we assume we are not running in Colab and
# rely on Application Default Credentials (gcloud) or
# GOOGLE_APPLICATION_CREDENTIALS instead.
try:
    from google.colab import auth
except ImportError:
    # Only a failed import means "not Colab". A genuine authentication error
    # must not be reported as "not Colab", so it is no longer caught here.
    print("N√£o √© Colab. Garanta ADC (gcloud) ou GOOGLE_APPLICATION_CREDENTIALS.")
else:
    # Runs only when the import succeeded; failures propagate to the user.
    auth.authenticate_user()
    print("Colab auth OK")

# Shared BigQuery client used by every later cell.
client = bigquery.Client(project=PROJECT_ID)



Colab auth OK


## 1) Funções utilitárias (HTTP com retry e parse)


In [24]:
# Status codes that http_get_json retries, and the ones it reports as
# authentication/permission failures.
RETRY_STATUS = {429, 500, 502, 503, 504}
FORBIDDEN_STATUS = {401, 403}

# Headers sent with every CoinGecko request. The Pro key header is attached
# only when the Pro API is selected and a key was actually provided.
HEADERS = {"accept": "application/json"}
if COINGECKO_API_KEY and COINGECKO_IS_PRO:
    HEADERS.update({"x-cg-pro-api-key": COINGECKO_API_KEY})

def http_get_json(url: str, params: dict | None = None, timeout: int = 30, max_retries: int = 6):
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Responses whose status is in ``RETRY_STATUS`` (e.g. 429 rate limit) are
    retried with exponential backoff: 1.2s, 2.2s, 4.2s, ...

    Args:
        url: Full endpoint URL.
        params: Query-string parameters; ``None`` is treated as no parameters.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        max_retries: Maximum number of requests issued before giving up.

    Returns:
        The value of ``resp.json()`` for the first non-retryable, successful
        response.

    Raises:
        RuntimeError: On a status in ``FORBIDDEN_STATUS``, or when every
            attempt returned a retryable status. In the latter case the
            message includes the last HTTP status code, so callers can
            recognise a rate limit (429).
        requests.HTTPError: For any other error status, via
            ``raise_for_status``.
    """
    params = params or {}
    last_status = None
    last_err = None

    for attempt in range(max_retries):
        resp = requests.get(url, headers=HEADERS, params=params, timeout=timeout)

        if resp.status_code in RETRY_STATUS:
            last_status = resp.status_code
            last_err = resp.text[:500]

            # Back off only when another attempt will follow; sleeping after
            # the final attempt just delays the error.
            if attempt + 1 < max_retries:
                sleep_s = (2 ** attempt) + 0.2
                logger.warning("HTTP %s em %s | retry em %.1fs | params=%s", resp.status_code, url, sleep_s, params)
                time.sleep(sleep_s)
            continue

        if resp.status_code in FORBIDDEN_STATUS:
            raise RuntimeError(
                f"Erro de autentica√ß√£o/permiss√£o ({resp.status_code}). "
                f"Verifique COINGECKO_API_KEY/Plano. Body={resp.text[:500]}"
            )

        resp.raise_for_status()

        return resp.json()

    # The status code is part of the message because the response body alone
    # does not always identify the failure (e.g. an empty 429 body).
    raise RuntimeError(
        f"Falha ap√≥s {max_retries} tentativas. √öltimo erro: HTTP {last_status}: {last_err}"
    )


## 2) Criar tabela RAW (snapshot JSON)


In [None]:
RAW_TABLE_ID = f"{PROJECT_ID}.{DATASET_ID}.raw_tb"

# RAW layer schema: one row per collected snapshot, every column REQUIRED.
# Each tuple is (column name, BigQuery type, column description).
schema_raw = [
    bigquery.SchemaField(col_name, col_type, mode="REQUIRED", description=col_doc)
    for col_name, col_type, col_doc in (
        ("ingestion_timestamp", "TIMESTAMP", "Timestamp em que o snapshot foi coletado da API."),
        ("source", "STRING", "Identifica a origem do dado."),
        ("payload", "JSON", "Payload bruto retornado pela API CoinGecko."),
    )
]

table = bigquery.Table(RAW_TABLE_ID, schema=schema_raw)
table.description = "Tabela RAW: snapshots JSON da CoinGecko. Mant√©m payload completo para rastreabilidade e reprocessamento."

# exists_ok=True lets the cell be re-run when the table already exists.
table = client.create_table(table, exists_ok=True)
print("RAW pronta:", RAW_TABLE_ID)


RAW pronta: crypto-data-pipeline-488018.crypto_pipeline.raw_tb


## 3) Coletar snapshot completo de assets com paginação e gravar em RAW


In [33]:
import time
import pandas as pd

def fetch_markets_snapshot(
    vs_currency: str = "usd",
    per_page: int = 250,
    max_pages: int = 3,            # Public API: 2 or 3 pages is the suggested setting
    min_interval_s: float = 15.0,  # Public API: 15s between calls, chosen to avoid HTTP 429
) -> list[dict]:
    """Collect market data for coins from GET /coins/markets, page by page.

    Pages are requested in ``market_cap_desc`` order, starting at page 1.
    Collection stops after ``max_pages`` pages or as soon as a page comes back
    with fewer than ``per_page`` records.

    Args:
        vs_currency: Quote currency sent to the API.
        per_page: Number of records requested per page.
        max_pages: Maximum number of pages to request.
        min_interval_s: Minimum number of seconds between consecutive requests.

    Returns:
        The records of all fetched pages, concatenated in fetch order.

    Raises:
        ValueError: If a page's payload is not a list.
    """
    endpoint = f"{BASE_URL}/coins/markets"
    collected: list[dict] = []
    previous_call: float | None = None

    page = 1
    while page <= max_pages:
        # Simple throttle: wait out the remainder of the minimum interval
        # since the previous request (never applies to the first page).
        if previous_call is not None:
            waited = time.time() - previous_call
            if waited < min_interval_s:
                time.sleep(min_interval_s - waited)

        query = dict(
            vs_currency=vs_currency,
            order="market_cap_desc",
            per_page=per_page,
            page=page,
            sparkline="false",
            price_change_percentage="24h",
        )

        # Spacing between calls is enforced above, not inside http_get_json.
        batch = http_get_json(endpoint, params=query)
        previous_call = time.time()

        if not isinstance(batch, list):
            raise ValueError(f"Esperava lista no /coins/markets, veio: {type(batch)}")

        collected += batch
        logger.info("P√°gina %s: %s registros (acumulado=%s)", page, len(batch), len(collected))

        # A short page means there is nothing left to fetch.
        if len(batch) < per_page:
            return collected

        page += 1

    return collected


# ---- usage: collect the snapshot (3 pages of 250, 15s between calls) ----
assets = fetch_markets_snapshot(min_interval_s=15.0, max_pages=3, per_page=250)

print("Total assets (snapshot):", len(assets))
print("Primeiro id:", None if not assets else assets[0]["id"])
print("√öltimo id:", None if not assets else assets[-1]["id"])

# ---- flattened DataFrame view (the function's return value is left as is) ----
# sep="_" is the separator used when nested fields are flattened into columns.
assets_df = pd.json_normalize(assets, sep="_")
print("DF shape:", assets_df.shape)

Total assets (snapshot): 750
Primeiro id: bitcoin
√öltimo id: spacecoin-2
DF shape: (750, 30)


In [34]:
# Inspect the snapshot as a DataFrame to understand how the data is structured
# and which fields are available.
df_assets = pd.DataFrame(assets)

# Bare expression: as the last statement of a notebook cell, its value is
# rendered as the cell output (the first rows of the frame).
df_assets.head()

Unnamed: 0,id,symbol,name,image,current_price,market_cap,market_cap_rank,fully_diluted_valuation,total_volume,high_24h,...,max_supply,ath,ath_change_percentage,ath_date,atl,atl_change_percentage,atl_date,roi,last_updated,price_change_percentage_24h_in_currency
0,bitcoin,btc,Bitcoin,https://coin-images.coingecko.com/coins/images...,68304.0,1365618055175,1,1365618000000.0,32325750000.0,68637.0,...,21000000.0,126080.0,-45.82475,2025-10-06T18:57:42.558Z,67.81,100630.1,2013-07-06T00:00:00.000Z,,2026-02-21T16:59:13.249Z,0.769824
1,ethereum,eth,Ethereum,https://coin-images.coingecko.com/coins/images...,1986.1,239758376939,2,239758400000.0,16884400000.0,1990.33,...,,4946.05,-59.84473,2025-08-24T19:21:03.333Z,0.432979,458605.8,2015-10-20T00:00:00.000Z,"{'times': 37.87438514984134, 'currency': 'btc'...",2026-02-21T16:59:13.485Z,0.905491
2,tether,usdt,Tether,https://coin-images.coingecko.com/coins/images...,0.999705,183687860155,3,189153300000.0,51605520000.0,0.999851,...,,1.32,-24.43948,2018-07-24T00:00:00.000Z,0.572521,74.62026,2015-03-02T00:00:00.000Z,,2026-02-21T16:59:07.721Z,-0.01143
3,ripple,xrp,XRP,https://coin-images.coingecko.com/coins/images...,1.44,88011425903,4,144216500000.0,1809381000.0,1.46,...,100000000000.0,3.65,-60.44399,2025-07-18T03:40:53.808Z,0.002686,53595.11,2014-05-22T00:00:00.000Z,,2026-02-21T16:59:07.280Z,0.990266
4,binancecoin,bnb,BNB,https://coin-images.coingecko.com/coins/images...,625.86,85343485965,5,85343490000.0,817456100.0,633.75,...,200000000.0,1369.99,-54.31669,2025-10-13T08:41:24.131Z,0.039818,1571708.0,2017-10-19T00:00:00.000Z,,2026-02-21T16:59:13.150Z,0.965513


In [27]:
# Wrap the collected assets in a snapshot envelope and load it into the RAW
# layer as a single row.
now_utc = datetime.now(timezone.utc)

snapshot_payload = dict(
    meta=dict(
        source="coingecko_api_v3",
        collected_at=now_utc.isoformat(),
        count=len(assets),
    ),
    data=assets,
)

rows_to_insert = [
    dict(
        ingestion_timestamp=now_utc.isoformat(),
        source="coingecko_api_v3",
        payload=snapshot_payload,
    )
]

job = client.load_table_from_json(rows_to_insert, RAW_TABLE_ID)
job.result()  # wait for the load job to finish before reporting success
print("Snapshot inserido na RAW:", RAW_TABLE_ID)


Snapshot inserido na RAW: crypto-data-pipeline-488018.crypto_pipeline.raw_tb


## 4) Derivar Top N por **price_change_percentage_24h_in_currency** a partir do último snapshot (SQL)
A camada RAW já está ordenada por market_cap_usd; porém, o Top N é calculado aqui via SQL, para fins de visualização, e a variável usada na ordenação pode ser alterada.


In [29]:
# Number of coins to keep from the ranking below.
TOP_N = 3

# Query over the RAW table:
#   1. last_snapshot: the single most recent row (by ingestion_timestamp) whose
#      source is "coingecko_api_v3".
#   2. assets: one row per element of the payload's `$.data` array, extracting
#      the coin id, market cap and 24h price change.
#   3. final SELECT: the ids of the TOP_N coins ordered by 24h price change,
#      descending, with NULLs last.
# RAW_TABLE_ID and TOP_N are interpolated into the SQL text by the f-string.
q_topn = f'''
WITH last_snapshot AS (
  SELECT payload
  FROM `{RAW_TABLE_ID}`
  WHERE 1=1
    AND source = "coingecko_api_v3"
  QUALIFY ROW_NUMBER() OVER (ORDER BY ingestion_timestamp DESC) = 1
),
assets AS (
  SELECT
    JSON_VALUE(a, '$.id') AS crypto_id,
    SAFE_CAST(JSON_VALUE(a, '$.market_cap') AS BIGNUMERIC) AS market_cap_usd,
    SAFE_CAST(JSON_VALUE(a, '$.price_change_percentage_24h_in_currency') AS FLOAT64) AS price_change_percentage_24h_in_currency,
  FROM last_snapshot,
  UNNEST(JSON_QUERY_ARRAY(payload, '$.data')) AS a
)
SELECT
  crypto_id,
  -- price_change_percentage_24h_in_currency
FROM assets
WHERE crypto_id IS NOT NULL
ORDER BY price_change_percentage_24h_in_currency DESC NULLS LAST
LIMIT {TOP_N}
'''

# Run the query and show the result.
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter built-in - confirm when running outside a notebook.
df_top = client.query(q_topn).to_dataframe()
display(df_top)

# Plain list of coin ids (NULLs dropped, cast to str) used by the history step.
top_ids = df_top["crypto_id"].dropna().astype(str).tolist()
print("Top IDs:", top_ids)


Unnamed: 0,crypto_id
0,trillions
1,gamer-tag
2,radix


Top IDs: ['trillions', 'gamer-tag', 'radix']


## 5) Coletar histórico diário de preço | Market cap | Volume (`/coins/{id}/market_chart/range`) para Top N


In [35]:
import time
from datetime import datetime, timezone, timedelta
import pandas as pd


def _throttle(last_call_ts: float | None, min_interval_s: float) -> float:
    """Garante intervalo m√≠nimo entre chamadas; retorna novo timestamp p√≥s-chamada."""
    if last_call_ts is not None:
        elapsed = time.time() - last_call_ts
        if elapsed < min_interval_s:
            sleep_s = min_interval_s - elapsed
            print(f"  throttle: dormindo {sleep_s:.1f}s (min_interval_s={min_interval_s})")
            time.sleep(sleep_s)
    return time.time()


def _http_get_json_with_retry(
    url: str,
    params: dict,
    *,
    min_interval_s: float,
    last_call_ts: float | None,
    max_retries: int = 4,
    base_backoff_s: float = 10.0,
):
    """Call ``http_get_json`` with throttling and rate-limit retries.

    Before every attempt ``_throttle`` enforces ``min_interval_s`` since the
    previous call. When ``http_get_json`` raises and the lower-cased error
    text contains "429", "too many requests" or "rate limit", the call is
    retried after a linear backoff (``base_backoff_s * attempt``), up to
    ``max_retries`` retries. Any other error, or a rate-limit error once the
    retries are used up, is re-raised unchanged.

    Returns:
        A tuple ``(payload, last_call_ts)`` where ``last_call_ts`` is the
        ``time.time()`` read right after the successful call.
    """
    rate_limit_markers = ("429", "too many requests", "rate limit")

    attempt = 1
    while True:
        # Keep the minimum spacing before each attempt.
        last_call_ts = _throttle(last_call_ts, min_interval_s)

        print(f"GET {url} params={params} (tentativa {attempt}/{max_retries + 1})")
        try:
            # The interval is handled by _throttle above, not by http_get_json.
            result = http_get_json(url, params=params)
        except Exception as err:
            text = str(err).lower()
            rate_limited = any(marker in text for marker in rate_limit_markers)

            # Not a rate limit, or no retries left: propagate as is.
            if not (rate_limited and attempt <= max_retries):
                raise

            backoff = base_backoff_s * attempt  # 10, 20, 30, 40... with the defaults
            print(f"  429/rate limit detectado. Backoff {backoff:.1f}s e retry...")
            time.sleep(backoff)
            attempt += 1
        else:
            return result, time.time()


def get_history_range_df(
    coin_id: str,
    vs_currency: str,
    start_unix: int,
    end_unix: int,
    *,
    min_interval_s: float,
    last_call_ts: float | None,
) -> tuple[pd.DataFrame, float | None]:
    """Fetch GET /coins/{id}/market_chart/range and return it as a DataFrame.

    Args:
        coin_id: Coin id placed in the URL path.
        vs_currency: Quote currency sent to the API.
        start_unix: Sent as the ``from`` query parameter.
        end_unix: Sent as the ``to`` query parameter.
        min_interval_s: Minimum spacing between API calls, in seconds.
        last_call_ts: Timestamp of the previous API call, or ``None``.

    Returns:
        ``(df, last_call_ts)`` where ``last_call_ts`` is the updated value
        returned by the HTTP helper and ``df`` has the columns
        ``timestamp_ms | price_usd | market_cap_usd | volume_usd | datetime_utc``.
        ``df`` is empty (same columns) when the payload has no prices.
    """
    payload, last_call_ts = _http_get_json_with_retry(
        f"{BASE_URL}/coins/{coin_id}/market_chart/range",
        {"vs_currency": vs_currency, "from": start_unix, "to": end_unix},
        min_interval_s=min_interval_s,
        last_call_ts=last_call_ts,
        max_retries=4,
        base_backoff_s=10.0,
    )

    # Each series is read as a list of [timestamp_ms, value] pairs.
    prices = pd.DataFrame(payload.get("prices", []), columns=["timestamp_ms", "price_usd"])
    if prices.empty:
        no_rows = pd.DataFrame(
            columns=["timestamp_ms", "price_usd", "market_cap_usd", "volume_usd", "datetime_utc"]
        )
        return no_rows, last_call_ts

    market_caps = pd.DataFrame(
        payload.get("market_caps", []), columns=["timestamp_ms", "market_cap_usd"]
    )
    volumes = pd.DataFrame(
        payload.get("total_volumes", []), columns=["timestamp_ms", "volume_usd"]
    )

    # Prices drive the result; the other series are left-joined on timestamp.
    history = prices.merge(market_caps, on="timestamp_ms", how="left")
    history = history.merge(volumes, on="timestamp_ms", how="left")
    history["datetime_utc"] = pd.to_datetime(history["timestamp_ms"], unit="ms", utc=True)

    return history, last_call_ts


# -------------------------
# Collection window: from 20 days before now until now (UTC)
# -------------------------
end_dt = datetime.now(timezone.utc)
start_dt = end_dt - timedelta(days=20)
start_unix = int(start_dt.timestamp())
end_unix = int(end_dt.timestamp())

# Daily records accumulated across all coins.
rows: list[dict] = []

# Throttle state shared across coins, so the spacing also applies between them.
last_call_ts: float | None = None

for i, coin_id in enumerate(top_ids, start=1):
    # Best effort per coin: a failure is printed and the loop moves on.
    try:
        print(f"\n[{i}/{len(top_ids)}] Coletando {coin_id}...")

        df_coin, last_call_ts = get_history_range_df(
            coin_id=coin_id,
            vs_currency="usd",
            start_unix=start_unix,
            end_unix=end_unix,
            min_interval_s=30.0,        # tune the spacing between calls here
            last_call_ts=last_call_ts,  # shared throttle state
        )

        if df_coin.empty:
            print(f"  sem dados para {coin_id}")
            continue

        # One row per day: keep the point with the latest timestamp of each date.
        df_coin["date"] = df_coin["datetime_utc"].dt.date
        last_per_day = df_coin.sort_values("timestamp_ms").groupby("date", as_index=False).tail(1)
        df_daily = last_per_day[["date", "price_usd", "market_cap_usd", "volume_usd"]]

        daily_records = df_daily.to_dict("records")
        for rec in daily_records:
            rec["crypto_id"] = coin_id
        rows += daily_records

        print(f"  OK {coin_id}: {len(df_daily)} dias | total acumulado rows={len(rows)}")

    except Exception as e:
        print(f"  ERRO em {coin_id}: {e}")


df_hist = pd.DataFrame(rows)

# Normalise column types, but only when there is something to normalise.
if not df_hist.empty:
    df_hist["date"] = pd.to_datetime(df_hist["date"]).dt.date
    for c in ("price_usd", "market_cap_usd", "volume_usd"):
        df_hist[c] = pd.to_numeric(df_hist[c], errors="coerce")

print("\nPreview df_hist:")
print(df_hist.head(10))
print("Linhas hist√≥rico:", len(df_hist))


[1/3] Coletando trillions...
GET https://api.coingecko.com/api/v3/coins/trillions/market_chart/range params={'vs_currency': 'usd', 'from': 1769965201, 'to': 1771693201} (tentativa 1/5)
  OK trillions: 21 dias | total acumulado rows=21

[2/3] Coletando gamer-tag...
  throttle: dormindo 30.0s (min_interval_s=30.0)
GET https://api.coingecko.com/api/v3/coins/gamer-tag/market_chart/range params={'vs_currency': 'usd', 'from': 1769965201, 'to': 1771693201} (tentativa 1/5)
  OK gamer-tag: 9 dias | total acumulado rows=30

[3/3] Coletando radix...
  throttle: dormindo 30.0s (min_interval_s=30.0)
GET https://api.coingecko.com/api/v3/coins/radix/market_chart/range params={'vs_currency': 'usd', 'from': 1769965201, 'to': 1771693201} (tentativa 1/5)
  OK radix: 21 dias | total acumulado rows=51

Preview df_hist:
         date  price_usd  market_cap_usd    volume_usd  crypto_id
0  2026-02-01   0.000348   347812.245683  58102.034491  trillions
1  2026-02-02   0.000407   407476.490585  56511.230565  t

## 6) Criar tabela FACT e carregar dados (BigQuery)


In [36]:
FACT_TABLE_ID = f"{PROJECT_ID}.{DATASET_ID}.fact_coin_history_daily"

# FACT schema: one row per (crypto_id, date).
# Each tuple is (column name, BigQuery type, mode, column description).
schema_fact = [
    bigquery.SchemaField(col_name, col_type, mode=col_mode, description=col_doc)
    for col_name, col_type, col_mode, col_doc in (
        (
            "crypto_id", "STRING", "REQUIRED",
            "Identificador √∫nico da criptomoeda na CoinGecko (ex: bitcoin, ethereum).",
        ),
        (
            "date", "DATE", "REQUIRED",
            "Data de refer√™ncia (UTC) do √∫ltimo pre√ßo dispon√≠vel no dia.",
        ),
        (
            "price_usd", "BIGNUMERIC", "NULLABLE",
            "Pre√ßo em USD no fechamento do dia (√∫ltimo timestamp dispon√≠vel).",
        ),
        (
            "market_cap_usd", "BIGNUMERIC", "NULLABLE",
            "Market cap em USD no fechamento do dia.",
        ),
        (
            "volume_usd", "BIGNUMERIC", "NULLABLE",
            "Volume negociado em USD no fechamento do dia.",
        ),
    )
]

fact_table = bigquery.Table(FACT_TABLE_ID, schema=schema_fact)
fact_table.description = (
    "Tabela FACT: hist√≥rico di√°rio de criptomoedas derivado da API CoinGecko. "
    "Cont√©m pre√ßo, market cap e volume di√°rio em USD por crypto_id."
)

# Cluster by crypto_id and use daily partitions keyed on the `date` column.
fact_table.clustering_fields = ["crypto_id"]
fact_table.time_partitioning = bigquery.TimePartitioning(
    field="date",
    type_=bigquery.TimePartitioningType.DAY,
)

# exists_ok=True lets the cell be re-run when the table already exists.
fact_table = client.create_table(fact_table, exists_ok=True)
print("FACT pronta (particionada e clusterizada):", FACT_TABLE_ID)

FACT pronta (particionada e clusterizada): crypto-data-pipeline-488018.crypto_pipeline.fact_coin_history_daily


## 7) Carga Manual na tabela FACT


In [37]:
# In the Sandbox the simplest approach is to replace the whole table on every
# run during development. In production (with billing), evolve this to a
# staging table + MERGE.
WRITE_MODE = "TRUNCATE"

if not df_hist.empty:
    df_load = df_hist[
        ["crypto_id", "date", "price_usd", "market_cap_usd", "volume_usd"]
    ].copy()

    # Coerce the columns to the expected types before loading.
    df_load["date"] = pd.to_datetime(df_load["date"]).dt.date
    df_load["price_usd"] = pd.to_numeric(df_load["price_usd"], errors="coerce")
    df_load["market_cap_usd"] = pd.to_numeric(df_load["market_cap_usd"], errors="coerce")
    df_load["volume_usd"] = pd.to_numeric(df_load["volume_usd"], errors="coerce")

    # Any WRITE_MODE other than "TRUNCATE" appends instead of replacing.
    job_config = bigquery.LoadJobConfig(
        create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
    )
    job_config.write_disposition = (
        bigquery.WriteDisposition.WRITE_TRUNCATE
        if WRITE_MODE == "TRUNCATE"
        else bigquery.WriteDisposition.WRITE_APPEND
    )

    job = client.load_table_from_dataframe(df_load, FACT_TABLE_ID, job_config=job_config)
    job.result()  # wait for the load job to finish before reporting success

    print("Carga conclu√≠da:", FACT_TABLE_ID, "| linhas:", len(df_load))
else:
    print("df_hist vazio ‚Äî nada para carregar.")

Carga conclu√≠da: crypto-data-pipeline-488018.crypto_pipeline.fact_coin_history_daily | linhas: 51


## 8) Query simples (registro diário das moedas no dia 2026-02-21 na FACT)


In [38]:
# Example read from the FACT table: every coin's daily record for one fixed
# date. The date is hard-coded in the SQL text; FACT_TABLE_ID is interpolated
# by the f-string.
q_latest = f"""
SELECT
  date,
  crypto_id,
  price_usd,
  market_cap_usd,
  volume_usd
FROM `{FACT_TABLE_ID}`
WHERE date = '2026-02-21'
"""

# Run the query and show the result.
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter built-in - confirm when running outside a notebook.
df_example = client.query(q_latest).to_dataframe()
display(df_example)

Unnamed: 0,date,crypto_id,price_usd,market_cap_usd,volume_usd
0,2026-02-21,radix,0.002549,34262270.0,637944.4
1,2026-02-21,trillions,0.022315,22315210.0,3340021.0
