Configuración

In [12]:
import os

# Symbols to download; each one is quoted against FIAT.
COINS = ["BTC", "ETH", "ADA"]
FIAT  = "USD"

# Date range as YYYY-MM-DD strings (fetch_histoday keeps both ends).
START_DATE = "2024-01-01"
END_DATE   = "2024-03-31"

OUTPUT_CSV = "data/crypto_limpio.csv"  # final cleaned CSV
FIG_DIR    = "data/figs"

# Create the output folders from the configured paths themselves, so that
# changing OUTPUT_CSV or FIG_DIR above never leaves a missing directory.
for _out_dir in (os.path.dirname(OUTPUT_CSV), FIG_DIR):
    # dirname is "" for a bare filename, and os.makedirs("") would raise.
    if _out_dir:
        os.makedirs(_out_dir, exist_ok=True)

COINS, FIAT, START_DATE, END_DATE, OUTPUT_CSV


(['BTC', 'ETH', 'ADA'],
 'USD',
 '2024-01-01',
 '2024-03-31',
 'data/crypto_limpio.csv')

Cargar la API key

In [13]:
import os
from dotenv import load_dotenv

# Load variables from .env so the key can be read from the environment.
load_dotenv()
API_KEY = os.environ.get("CRYPTOCOMPARE_KEY")

# Report only whether a key was found; the key itself is never printed.
if API_KEY:
    _key_status = "sí (oculta)"
else:
    _key_status = "no (modo público con más límites)"
print("API key detectada:", _key_status)


API key detectada: sí (oculta)


Descarga desde CryptoCompare

In [14]:
import time
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import tz

BASE_URL = "https://min-api.cryptocompare.com/data/v2/histoday"
MAX_LIMIT = 2000  # máximo de días por request

def fetch_histoday(fsym: str, tsym: str, start_date: str, end_date: str, api_key: str | None) -> pd.DataFrame:
    """
    Devuelve columnas originales de CryptoCompare:
    time (segundos unix), open, high, low, close, volumefrom, volumeto
    """
    start = datetime.strptime(start_date, "%Y-%m-%d").date()
    end   = datetime.strptime(end_date,   "%Y-%m-%d").date()
    total_days = (end - start).days + 1

    frames = []
    to_dt = datetime.combine(end, datetime.min.time())

    remaining = total_days
    while remaining > 0:
        chunk = min(remaining, MAX_LIMIT + 1)   
        limit = chunk - 1
        toTs  = int(to_dt.replace(tzinfo=tz.UTC).timestamp())

        params = {"fsym": fsym, "tsym": tsym, "limit": limit, "toTs": toTs}
        headers = {"authorization": f"Apikey {api_key}"} if api_key else {}

        r = requests.get(BASE_URL, params=params, headers=headers, timeout=60)
        r.raise_for_status()
        js = r.json()
        if js.get("Response") != "Success":
            raise RuntimeError(f"API error: {js.get('Message')}")

        df = pd.DataFrame(js["Data"]["Data"])
        df["time"] = pd.to_datetime(df["time"], unit="s").dt.date
        df = df[(df["time"] >= start) & (df["time"] <= end)]
        frames.append(df)

        if not df.empty:
            first_date = df["time"].min()
            to_dt = datetime.combine(first_date, datetime.min.time()) - timedelta(days=1)
            remaining = (to_dt.date() - start).days + 1
        else:
            break

        time.sleep(0.8)  # pausa anti rate-limit

    if not frames:
        return pd.DataFrame(columns=["time","open","high","low","close","volumefrom","volumeto"])

    out = pd.concat(frames, ignore_index=True).sort_values("time")
    return out


Se definen reglas de limpieza estricta

In [None]:
import numpy as np
import pandas as pd

def _numeric(df, cols):
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

def _drop_invalid_ohlc_rows(df):
    
    req_cols = ["time","open","high","low","close","volumeto"]
    df = df.dropna(subset=req_cols)

    df = df[(df["open"]  > 0) & (df["high"]  > 0) & (df["low"]   > 0) & (df["close"] > 0)]
    df = df[df["volumeto"] >= 0]
    df = df[df["high"] >= df["low"]]
    df = df[(df["open"] >= df["low"]) & (df["open"] <= df["high"])]
    df = df[(df["close"] >= df["low"]) & (df["close"] <= df["high"])]

    return df

def clean_frame_strict(df: pd.DataFrame, coin_symbol: str) -> pd.DataFrame:
    """Apply the strict cleaning rules to one coin's raw daily candles.

    Steps: drop rows that repeat a ``time`` (first one wins), coerce the
    price/volume columns to numeric, blank out non-positive prices and
    negative ``volumeto``, discard incomplete or inconsistent rows, and tag
    every row with the lower-cased ``coin_symbol``.

    Returns a frame with columns time, open, high, low, close, volumeto, coin
    and a fresh 0..n-1 index. An empty input yields an empty frame with those
    columns. The input frame itself is left untouched.

    Raises AssertionError if NaN values or duplicated coin+time pairs remain.
    """
    final_cols = ["time","open","high","low","close","volumeto","coin"]
    price_cols = ["open","high","low","close"]

    if df.empty:
        return pd.DataFrame(columns=final_cols)

    # Work on a private copy so the caller's frame is never mutated.
    work = df.drop_duplicates(subset=["time"]).copy()
    work = _numeric(work, price_cols + ["volumefrom","volumeto"])

    # Turn impossible values into NaN so the row filter removes them.
    for col in price_cols:
        not_positive = work[col] <= 0
        work.loc[not_positive, col] = np.nan
    negative_volume = work["volumeto"] < 0
    work.loc[negative_volume, "volumeto"] = np.nan

    work = _drop_invalid_ohlc_rows(work)

    # Final layout: tag the coin and keep only the published columns.
    work["coin"] = coin_symbol.lower()
    cleaned = work[final_cols].reset_index(drop=True)

    # Final verification: no NaN and no duplicated coin+time pairs.
    has_nan = cleaned.isna().any().any()
    assert not has_nan, "Persisten NaN tras limpieza estricta."
    has_dupes = cleaned.duplicated(subset=["coin","time"]).any()
    assert not has_dupes, "Duplicados coin+time tras limpieza."
    return cleaned


Descarga, limpieza, unión y guardado

In [16]:
# Download and clean each configured coin, collecting one cleaned frame per coin.
all_frames = []
for sym in COINS:
    print(f"> Descargando {sym}/{FIAT} ...")
    raw = fetch_histoday(sym, FIAT, START_DATE, END_DATE, API_KEY)
    print(f"  -> crudo: {len(raw)} filas")
    clean = clean_frame_strict(raw, sym)
    print(f"  -> limpio: {len(clean)} filas")
    all_frames.append(clean)

# Stack all coins and order by coin, then date. The index is not reset after
# sorting, so the displayed labels follow the concatenation order.
final_df = pd.concat(all_frames, ignore_index=True).sort_values(["coin","time"])

# Extra safety checks on the combined dataset
assert not final_df.isna().any().any(), "Quedan NaN en el dataset unido."
assert not final_df.duplicated(subset=["coin","time"]).any(), "Duplicados coin+time en dataset final."

# Persist without the index column, then show path, shape and a preview as the cell output.
final_df.to_csv(OUTPUT_CSV, index=False)
OUTPUT_CSV, final_df.shape, final_df.head(10)


> Descargando BTC/USD ...
  -> crudo: 91 filas
  -> limpio: 91 filas
> Descargando ETH/USD ...
  -> crudo: 91 filas
  -> limpio: 91 filas
> Descargando ADA/USD ...
  -> crudo: 91 filas
  -> limpio: 91 filas


('data/crypto_limpio.csv',
 (273, 7),
            time    open    high     low   close     volumeto coin
 182  2024-01-01  0.5938  0.6242  0.5905  0.6234  19115404.63  ada
 183  2024-01-02  0.6234  0.6372  0.6022  0.6052  29680628.54  ada
 184  2024-01-03  0.6052  0.6187  0.5346  0.5572  53874007.80  ada
 185  2024-01-04  0.5572  0.5790  0.5499  0.5704  27363954.90  ada
 186  2024-01-05  0.5704  0.5731  0.5226  0.5419  31754702.64  ada
 187  2024-01-06  0.5419  0.5433  0.5075  0.5231  20137921.97  ada
 188  2024-01-07  0.5231  0.5303  0.4883  0.4946  20255458.38  ada
 189  2024-01-08  0.4946  0.5448  0.4650  0.5416  42975948.45  ada
 190  2024-01-09  0.5416  0.5438  0.4966  0.5123  30498183.66  ada
 191  2024-01-10  0.5123  0.5897  0.4879  0.5662  48877074.88  ada)