In [1]:
"""
download_histories.py

Uso:
- coloque a lista de tickers (formato ex: 'ABEV3.SA') em `tickers`.
- execute este script: python download_histories.py
- os arquivos serão salvos em ./data/historical/<TICKER>.parquet e ./data/historical/<TICKER>.csv
"""

import os
import time
import logging
from datetime import datetime
from typing import List, Dict, Optional
import pandas as pd
import yfinance as yf
from tqdm import tqdm
import traceback

In [2]:
# logging simples
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# diretório onde os históricos serão salvos (compatível com seu script anterior)
HIST_DIR = os.path.join("data", "historical")
os.makedirs(HIST_DIR, exist_ok=True)

# parâmetros de download
DEFAULT_START = "1998-01-01"
BATCH_SIZE = 15            # número de tickers por chamada yfinance.download (ajuste se achar necessário)
MAX_ATTEMPTS = 4           # tentativas por batch
SLEEP_BETWEEN_BATCHES = 1  # segundos entre batches para reduzir risco de throttling
SLEEP_BETWEEN_TICKERS = 0.2

In [3]:
def ticker_exists_local(ticker: str) -> bool:
    """Verifica se já existe parquet salvo para ticker (usa para pular downloads)"""
    path = os.path.join(HIST_DIR, f"{ticker}.parquet")
    return os.path.isfile(path)

In [4]:
def save_history_df(ticker: str, df: pd.DataFrame, save_csv: bool = True):
    """Salva DataFrame em parquet e opcionalmente em CSV. Garante coluna 'date' se índice for DatetimeIndex."""
    if df is None or df.empty:
        raise ValueError("DataFrame nulo ou vazio")
    df = df.copy()
    # garantir que a coluna de data exista como coluna
    if isinstance(df.index, pd.DatetimeIndex):
        df.index.name = "date"
        df = df.reset_index()
    # converter coluna date para string ISO ao salvar CSV (mantém compatibilidade)
    out_parquet = os.path.join(HIST_DIR, f"{ticker}.parquet")
    out_csv = os.path.join(HIST_DIR, f"{ticker}.csv")
    try:
        df.to_parquet(out_parquet, index=False)
        logging.info("Saved %s rows for %s -> %s", len(df), ticker, out_parquet)
    except Exception as e:
        logging.exception("Erro salvando parquet para %s: %s", ticker, e)
        raise
    if save_csv:
        try:
            # padronizar data para ISO antes de salvar CSV (se existir)
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
            df.to_csv(out_csv, index=False)
            logging.info("Saved CSV for %s -> %s", ticker, out_csv)
        except Exception as e:
            logging.exception("Erro salvando CSV para %s: %s", ticker, e)
            # não raise — parquet já salvo, apenas logamos o problema

In [5]:
def download_batch(batch: List[str], start: str = DEFAULT_START, threads: bool = True) -> Dict[str, Optional[pd.DataFrame]]:
    """
    Tenta baixar um batch de tickers via yfinance.download.
    Retorna dict ticker -> DataFrame or None (se falhou).
    """
    joined = " ".join(batch)
    attempt = 0
    last_exc = None
    while attempt < MAX_ATTEMPTS:
        try:
            logging.info("yfinance.download attempt %d for batch size %d", attempt+1, len(batch))
            data = yf.download(tickers=joined, start=start, progress=False, threads=threads, group_by='ticker', auto_adjust=False, actions=True)
            result = {}
            if isinstance(data, pd.DataFrame) and isinstance(data.columns, pd.MultiIndex):
                for ticker in batch:
                    if ticker in data.columns.get_level_values(0):
                        df_t = data[ticker].copy()
                        result[ticker] = df_t
                    else:
                        try:
                            single = yf.download(ticker, start=start, progress=False, actions=True)
                            result[ticker] = single if not single.empty else None
                        except Exception:
                            result[ticker] = None
            else:
                for ticker in batch:
                    try:
                        df_t = yf.download(ticker, start=start, progress=False, actions=True)
                        result[ticker] = df_t if not df_t.empty else None
                    except Exception:
                        result[ticker] = None
            return result
        except Exception as e:
            last_exc = e
            logging.warning("Erro no yfinance.download (attempt %d): %s", attempt+1, str(e))
            attempt += 1
            time.sleep(2 ** attempt)  # backoff exponencial
    logging.error("Todas tentativas falharam para batch (%s). Último erro: %s", joined, last_exc)
    # fallback: tentar baixar ticker a ticker
    result = {}
    for ticker in batch:
        try:
            df_t = yf.download(ticker, start=start, progress=False, actions=True)
            result[ticker] = df_t if not df_t.empty else None
        except Exception as e:
            logging.warning("Fallback individual falhou para %s: %s", ticker, e)
            result[ticker] = None
    return result


In [6]:
def download_all_histories(tickers: List[str], start: str = DEFAULT_START, force: bool = False, save_summary: bool = True, save_csv_per_ticker: bool = True):
    """
    Processo principal: recebe lista de tickers (strings), baixa históricos e salva parquet + csv por ticker.
    - force: se True, re-baixa mesmo que arquivo exista.
    - save_csv_per_ticker: se True salva um CSV para cada ticker (além do parquet).
    - retorna um DataFrame resumo com status por ticker.
    """
    os.makedirs(HIST_DIR, exist_ok=True)
    tickers = [t for t in tickers if isinstance(t, str) and t.strip()]
    tickers = list(dict.fromkeys(tickers))
    summary = []
    for i in range(0, len(tickers), BATCH_SIZE):
        batch = tickers[i:i+BATCH_SIZE]
        to_download = [t for t in batch if force or not ticker_exists_local(t)]
        if not to_download:
            logging.info("Batch %d: todos já existem localmente — pulando.", i//BATCH_SIZE+1)
            for t in batch:
                summary.append({
                    "ticker": t,
                    "status": "skipped_local",
                    "rows": None,
                    "saved_parquet": os.path.join(HIST_DIR, f"{t}.parquet") if ticker_exists_local(t) else None,
                    "saved_csv": os.path.join(HIST_DIR, f"{t}.csv") if os.path.exists(os.path.join(HIST_DIR, f"{t}.csv")) else None
                })
            continue

        logging.info("Processando batch %d/%d (download %d/%d)", i//BATCH_SIZE+1, (len(tickers)+BATCH_SIZE-1)//BATCH_SIZE, len(to_download), len(batch))
        results = download_batch(to_download, start=start)
        for t in batch:
            df_t = results.get(t) if t in results else None
            if df_t is None or (isinstance(df_t, pd.DataFrame) and df_t.empty):
                logging.warning("Nenhum dado para %s em batch; tentativa isolada...", t)
                try:
                    single = yf.download(t, start=start, progress=False, actions=True)
                    df_t = single if not single.empty else None
                except Exception:
                    df_t = None
            if df_t is None or df_t.empty:
                logging.error("Falha obtendo dados para %s", t)
                summary.append({"ticker": t, "status": "failed", "rows": 0, "saved_parquet": None, "saved_csv": None})
            else:
                try:
                    save_history_df(t, df_t, save_csv=save_csv_per_ticker)
                    summary.append({
                        "ticker": t,
                        "status": "ok",
                        "rows": len(df_t),
                        "saved_parquet": os.path.join(HIST_DIR, f"{t}.parquet"),
                        "saved_csv": os.path.join(HIST_DIR, f"{t}.csv") if save_csv_per_ticker else None
                    })
                except Exception as e:
                    logging.exception("Erro salvando para %s: %s", t, e)
                    summary.append({"ticker": t, "status": "save_error", "rows": len(df_t) if isinstance(df_t, pd.DataFrame) else None, "saved_parquet": None, "saved_csv": None})
            time.sleep(SLEEP_BETWEEN_TICKERS)
        time.sleep(SLEEP_BETWEEN_BATCHES)

    df_summary = pd.DataFrame(summary)
    if save_summary:
        ts = datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
        summary_path = os.path.join(HIST_DIR, f"download_summary_{ts}.csv")
        df_summary.to_csv(summary_path, index=False)
        logging.info("Resumo salvo em %s", summary_path)
    return df_summary

In [7]:
# utilitário para recombinar todos os parquets em um único arquivo (long format)
def combine_all_to_single_parquet(out_path: str = os.path.join(HIST_DIR, "all_histories.parquet"), out_csv: Optional[str] = os.path.join(HIST_DIR, "all_histories.csv"), tickers: Optional[List[str]] = None):
    """
    Lê todos os parquets em HIST_DIR (ou tickers list) e concatena em formato long:
    columns: ['ticker','date', 'Open','High','Low','Close','Adj Close','Volume', 'Dividends','Stock Splits']
    Salva em parquet e opcionalmente em csv.
    """
    files = []
    if tickers:
        files = [os.path.join(HIST_DIR, f"{t}.parquet") for t in tickers if os.path.exists(os.path.join(HIST_DIR, f"{t}.parquet"))]
    else:
        files = [os.path.join(HIST_DIR, f) for f in os.listdir(HIST_DIR) if f.endswith(".parquet")]
    dfs = []
    for f in files:
        try:
            df = pd.read_parquet(f)
            if 'date' in df.columns:
                df['date'] = pd.to_datetime(df['date'])
            fname = os.path.basename(f).replace(".parquet","")
            if 'ticker' not in df.columns:
                df.insert(0, 'ticker', fname)
            dfs.append(df)
        except Exception as e:
            logging.warning("Erro lendo %s: %s", f, e)
    if not dfs:
        raise RuntimeError("Nenhum parquet encontrado para combinar.")
    big = pd.concat(dfs, ignore_index=True, sort=False)
    big.to_parquet(out_path, index=False)
    logging.info("Combined saved to %s (rows=%d)", out_path, len(big))
    if out_csv:
        try:
            # converter date para formato iso ao salvar CSV
            if 'date' in big.columns:
                big['date'] = pd.to_datetime(big['date']).dt.strftime('%Y-%m-%d')
            big.to_csv(out_csv, index=False)
            logging.info("Combined CSV saved to %s", out_csv)
        except Exception as e:
            logging.exception("Erro salvando combined CSV: %s", e)
    return big

# ------------------------------
# Exemplo de uso
# ------------------------------
if __name__ == "__main__":
    # 1) carregue a lista de tickers a partir do arquivo que você já salvou
    tickers_file = os.path.join("tickers_ibrx100_full.csv")
    if os.path.exists(tickers_file):
        df = pd.read_csv(tickers_file)
        if 'Ticker' in df.columns:
            tickers = df['Ticker'].dropna().astype(str).tolist()
        else:
            tickers = df.iloc[:,0].dropna().astype(str).tolist()
    else:
        raise RuntimeError(f"Não encontrou {tickers_file}. Coloque seu CSV de tickers no mesmo diretório ou edite este script.")

    # 2) opção: validar/normalizar tickers (garantir sufixo .SA)
    def normalize(t):
        t = str(t).strip().upper()
        if not t.endswith(".SA"):
            t = t.replace(".SA","") + ".SA"
        return t
    tickers = [normalize(t) for t in tickers]
    print("Tickers a baixar:", len(tickers), tickers[:10])

    # 3) executar (force=True re-baixa mesmo se já existir)
    summary_df = download_all_histories(tickers, start=DEFAULT_START, force=False, save_summary=True, save_csv_per_ticker=True)
    print(summary_df.head(50))

    # 4) opcional: combinar tudo em um único parquet e CSV (pode ser grande)
    combined = combine_all_to_single_parquet()
    print("Combined rows:", len(combined))


2025-10-16 14:33:22,428 INFO Batch 1: todos já existem localmente — pulando.
2025-10-16 14:33:22,435 INFO Batch 2: todos já existem localmente — pulando.
2025-10-16 14:33:22,439 INFO Batch 3: todos já existem localmente — pulando.
2025-10-16 14:33:22,447 INFO Batch 4: todos já existem localmente — pulando.
2025-10-16 14:33:22,453 INFO Batch 5: todos já existem localmente — pulando.
2025-10-16 14:33:22,459 INFO Batch 6: todos já existem localmente — pulando.
2025-10-16 14:33:22,464 INFO Batch 7: todos já existem localmente — pulando.
2025-10-16 14:33:22,539 INFO Resumo salvo em data\historical\download_summary_20251016T173322Z.csv


Tickers a baixar: 97 ['ALOS3.SA', 'ABEV3.SA', 'ANIM3.SA', 'ASAI3.SA', 'AURE3.SA', 'AZZA3.SA', 'BBSE3.SA', 'BBDC3.SA', 'BBDC4.SA', 'BRAP4.SA']
       ticker         status  rows                      saved_parquet  \
0    ALOS3.SA  skipped_local  None   data\historical\ALOS3.SA.parquet   
1    ABEV3.SA  skipped_local  None   data\historical\ABEV3.SA.parquet   
2    ANIM3.SA  skipped_local  None   data\historical\ANIM3.SA.parquet   
3    ASAI3.SA  skipped_local  None   data\historical\ASAI3.SA.parquet   
4    AURE3.SA  skipped_local  None   data\historical\AURE3.SA.parquet   
5    AZZA3.SA  skipped_local  None   data\historical\AZZA3.SA.parquet   
6    BBSE3.SA  skipped_local  None   data\historical\BBSE3.SA.parquet   
7    BBDC3.SA  skipped_local  None   data\historical\BBDC3.SA.parquet   
8    BBDC4.SA  skipped_local  None   data\historical\BBDC4.SA.parquet   
9    BRAP4.SA  skipped_local  None   data\historical\BRAP4.SA.parquet   
10   BBAS3.SA  skipped_local  None   data\historical\BB

2025-10-16 14:33:25,833 INFO Combined saved to data\historical\all_histories.parquet (rows=1256344)
2025-10-16 14:33:37,326 INFO Combined CSV saved to data\historical\all_histories.csv


Combined rows: 1256344
