In [None]:
import os
import requests
import pandas as pd

# =========================
# CONFIG
# =========================
# The CryptoCompare key is taken from the CRYPTOCOMPARE_API_KEY environment
# variable when it is set (and non-empty), so the credential does not have to
# live in the notebook. The literal below is kept only as a fallback so that
# existing runs keep working.
# NOTE(review): a key committed to source should be treated as exposed —
# consider rotating it and removing the fallback.
API_KEY = os.environ.get("CRYPTOCOMPARE_API_KEY") or "e3273ecc2ae89542944197d0bff87f80af57f713087ddabb21bbb89a4a4c1a1f"
URL = "https://min-api.cryptocompare.com/data/v2/histohour"
HEADERS = {"authorization": f"Apikey {API_KEY}"}

# Ticker symbol -> display name used in progress messages.
COINS = {
    "BTC": "Bitcoin",
    "ETH": "Ethereum",
    "DASH": "Dash",
    "LTC": "Litecoin",
    "MAID": "MaidSafeCoin",
    "XMR": "Monero",
    "XRP": "Ripple"
}

# Paper period (hourly), both bounds inclusive
PAPER_START = pd.Timestamp("2017-02-25 00:00:00")
PAPER_END   = pd.Timestamp("2017-08-17 14:00:00")

# Output folders
PAPER_DIR = "data/raw/paper"
LIVE_DIR  = "data/raw/live"

# Create the output folders up front; exist_ok makes re-running the cell safe.
os.makedirs(PAPER_DIR, exist_ok=True)
os.makedirs(LIVE_DIR, exist_ok=True)

# =========================
# HELPERS
# =========================
def _api_call_histohour(fsym: str, tsym: str = "USD", limit: int = 2000, to_ts: int | None = None) -> pd.DataFrame:
    """
    Query the CryptoCompare histohour endpoint and return the rows as a dataframe.

    Parameters:
        fsym:  symbol sent as the ``fsym`` query parameter.
        tsym:  symbol sent as the ``tsym`` query parameter.
        limit: value sent as the ``limit`` query parameter.
        to_ts: optional Unix timestamp (seconds) sent as ``toTs``; the
               parameter is omitted from the request when this is None.

    Returns:
        A dataframe built from ``Data.Data`` of the JSON response, with the
        ``time`` column converted to datetimes and rows sorted by ``time``.
        If the response contains no rows, the empty dataframe is returned as-is.

    Raises:
        requests.HTTPError: if the HTTP status indicates failure.
        RuntimeError: if the JSON ``Response`` field is not ``"Success"``.
    """
    query = {"fsym": fsym, "tsym": tsym, "limit": limit}
    if to_ts is not None:
        query["toTs"] = to_ts

    response = requests.get(URL, params=query, headers=HEADERS, timeout=30)
    response.raise_for_status()
    payload = response.json()

    # The API reports logical failures inside the JSON body, not only via HTTP status.
    if payload.get("Response") != "Success":
        detail = payload.get("Message", "Unknown error")
        raise RuntimeError(f"CryptoCompare API error for {fsym}: {detail}")

    frame = pd.DataFrame(payload["Data"]["Data"])
    if not frame.empty:
        # "time" arrives as epoch seconds; convert, then order chronologically.
        frame["time"] = pd.to_datetime(frame["time"], unit="s")
        frame = frame.sort_values("time").reset_index(drop=True)
    return frame


def download_paper_sample(symbol: str, start: pd.Timestamp, end: pd.Timestamp, *, fetch=None) -> pd.DataFrame:
    """
    Downloads hourly data between start and end (inclusive) by paging backwards with toTs.

    Parameters:
        symbol: symbol passed to the fetcher as ``fsym`` (quoted against USD).
        start:  first timestamp to keep (inclusive).
        end:    last timestamp to keep (inclusive); also the ``to_ts`` of the
                first request.
        fetch:  optional keyword-only callable with the same keyword interface
                as ``_api_call_histohour`` (``fsym``, ``tsym``, ``limit``,
                ``to_ts``) returning a dataframe with a datetime ``time``
                column. Defaults to ``_api_call_histohour``.

    Returns:
        A dataframe of de-duplicated rows sorted by ``time`` and restricted to
        ``[start, end]``. If no request returned any rows, an empty dataframe
        (without columns) is returned.

    Paging stops when a request returns no rows, when the earliest row
    reaches ``start``, or when the next request would not move further back in
    time than the current one (which would otherwise loop forever if the
    source keeps answering with the same window).
    """
    if fetch is None:
        fetch = _api_call_histohour

    all_chunks = []
    to_ts = int(end.timestamp())

    while True:
        chunk = fetch(fsym=symbol, tsym="USD", limit=2000, to_ts=to_ts)
        if chunk.empty:
            break

        all_chunks.append(chunk)

        # stop if we reached start
        earliest = chunk["time"].min()
        if earliest <= start:
            break

        # next call: go further back (1 hour before earliest)
        next_to_ts = int(earliest.timestamp()) - 3600
        if next_to_ts >= to_ts:
            # No progress: the source did not return anything older than what
            # was already requested, so further requests would repeat forever.
            break
        to_ts = next_to_ts

    if not all_chunks:
        return pd.DataFrame()

    df = (
        pd.concat(all_chunks, ignore_index=True)
        .drop_duplicates(subset="time")
        .sort_values("time")
        .reset_index(drop=True)
    )

    # exact period filter (inclusive)
    df = df[(df["time"] >= start) & (df["time"] <= end)].reset_index(drop=True)
    return df


def download_live_latest(symbol: str, hours: int = 2000) -> pd.DataFrame:
    """
    Fetch the latest hourly data for ``symbol`` against USD.

    ``hours`` is forwarded as the request's ``limit`` and no ``toTs`` is
    supplied, so the endpoint decides the end of the window. Returns whatever
    dataframe ``_api_call_histohour`` produces.
    """
    return _api_call_histohour(symbol, "USD", hours, None)


# =========================
# MAIN: PAPER DATASET
# =========================
# For every configured coin: download the fixed paper window, write it to
# PAPER_DIR as CSV, and print a short summary of what was saved.
print("=== Downloading PAPER sample (fixed 2017 window) ===")
for sym, name in COINS.items():
    print(f"\n{name} ({sym}) - paper window: {PAPER_START} -> {PAPER_END}")

    paper_df = download_paper_sample(sym, PAPER_START, PAPER_END)

    out_path = os.path.join(PAPER_DIR, f"{sym}_USD_hourly_2017_paper.csv")
    paper_df.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")

    # An empty result has no "time" column to summarise, so report None instead.
    if len(paper_df):
        time_min, time_max = paper_df["time"].min(), paper_df["time"].max()
    else:
        time_min = time_max = None
    print(f"Rows: {len(paper_df)}  |  Time min: {time_min}  |  Time max: {time_max}")

# =========================
# MAIN: LIVE DATASET
# =========================
# For every configured coin: download the latest hourly window, write it to
# LIVE_DIR as CSV, and print a short summary of what was saved.
print("\n=== Downloading LIVE sample (latest 2000 hours) ===")
for sym, name in COINS.items():
    print(f"\n{name} ({sym}) - latest sample")

    live_df = download_live_latest(sym, hours=2000)

    out_path = os.path.join(LIVE_DIR, f"{sym}_USD_hourly_latest.csv")
    live_df.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")

    # An empty result has no "time" column to summarise, so report None instead.
    if len(live_df):
        time_min, time_max = live_df["time"].min(), live_df["time"].max()
    else:
        time_min = time_max = None
    print(f"Rows: {len(live_df)}  |  Time min: {time_min}  |  Time max: {time_max}")

print("\nDone.")



Downloading Bitcoin (BTC)...
BTC: 4167 hourly observations
Downloading Ethereum (ETH)...
ETH: 4167 hourly observations
Downloading Dash (DASH)...
DASH: 4167 hourly observations
Downloading Litecoin (LTC)...
LTC: 4167 hourly observations
Downloading MaidSafeCoin (MAID)...
MAID: 4167 hourly observations
Downloading Monero (XMR)...
XMR: 4167 hourly observations
Downloading Ripple (XRP)...
XRP: 4167 hourly observations
