In [None]:
## Pipeline de Ingesta de Datos Financieros desde Yahoo Finance

In [1]:
!pip install yfinance sqlalchemy psycopg2-binary pandas

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.11-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl.metadata (4.9 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting curl_cffi>=0.7 (from yfinance)
  Downloading curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aa

In [2]:
import os
import time
from datetime import datetime, timezone
from urllib.parse import quote_plus

import pandas as pd
import yfinance as yf
from sqlalchemy import create_engine, text

In [6]:
# Warehouse (PostgreSQL) connection settings.
# Every value can be overridden with an environment variable of the same name;
# the fallbacks are the values this notebook previously hardcoded, so a run
# without any of these variables set connects exactly as before.
# NOTE(review): the PG_PASSWORD fallback is still a credential stored in the
# notebook. Inject PG_PASSWORD from the environment for anything beyond local
# development, and remove the fallback once every runner provides it.
PG_USER = os.getenv("PG_USER", "megranda")
PG_PASSWORD = os.getenv("PG_PASSWORD", "TradingDatabase")
PG_HOST = os.getenv("PG_HOST", "warehouse")
PG_PORT = os.getenv("PG_PORT", "5432")
PG_DB = os.getenv("PG_DB", "trading_db")

# Percent-encode user and password so characters such as "@", ":" or "/" in an
# injected secret cannot corrupt the connection URL. For the fallback values
# the resulting URL is byte-identical to the previous one.
engine = create_engine(
    f"postgresql://{quote_plus(PG_USER)}:{quote_plus(PG_PASSWORD)}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)

In [None]:
# Run parameters, read from the environment.
TICKERS = os.getenv('TICKERS')        # comma-separated symbols
START_DATE = os.getenv('START_DATE')  # forwarded unchanged to yf.download(start=...)
END_DATE = os.getenv('END_DATE')      # forwarded unchanged to yf.download(end=...)

# Fail fast with a clear message instead of an AttributeError on None.split,
# and tolerate spaces or stray commas around the symbols ("AAPL, MSFT,").
ticker_list = [symbol.strip() for symbol in (TICKERS or '').split(',') if symbol.strip()]
if not ticker_list:
    raise ValueError(
        "La variable de entorno TICKERS no está definida o no contiene ningún ticker."
    )

max_retries = 3
retry_backoff_seconds = 5  # wait grows linearly: 5s, 10s, ... between attempts

# Per-ticker frames are collected here; `df_all` is only bound once, to the
# final concatenated DataFrame, so the name never refers to a list.
frames = []

for TICKER in ticker_list:
    # Retry loop for the download of a single ticker
    for attempt in range(max_retries):
        try:
            print(f"Descargando datos para {TICKER}...")
            df_raw = yf.download(
                tickers=TICKER,
                start=START_DATE,
                end=END_DATE,
                interval='1d',
                progress=False,
                auto_adjust=False
            )

            # yf.download can signal a failed request by returning an empty
            # frame rather than raising; treat that as a retryable failure so
            # the retry loop actually covers it.
            if df_raw.empty:
                raise ValueError(f"yfinance no devolvió filas para {TICKER}")

            # Make sure the columns are not a MultiIndex: keep only level 0
            if isinstance(df_raw.columns, pd.MultiIndex):
                df_raw.columns = df_raw.columns.get_level_values(0)

            # Move the index into a regular column (new frame, no in-place mutation)
            df_raw = df_raw.reset_index()
            # Tag every row with its ticker
            df_raw['ticker'] = TICKER

            frames.append(df_raw)
            print(f"Filas descargadas para {TICKER}: {len(df_raw)}")
            break  # download succeeded, leave the retry loop
        except Exception as e:
            print(f"Error al descargar {TICKER} (intento {attempt + 1}): {e}")
            # Only wait when another attempt will follow; sleeping after the
            # last failure just delays the run.
            if attempt < max_retries - 1:
                time.sleep(retry_backoff_seconds * (attempt + 1))
    else:
        # Reached only when no attempt hit `break`, i.e. every attempt failed
        print(f"No se pudo descargar {TICKER} después de {max_retries} intentos.")

# Combine all per-ticker frames; pd.concat raises an unhelpful error on an
# empty list, so report the real cause explicitly.
if not frames:
    raise ValueError("No se pudo descargar ningún ticker; no hay datos para unir.")

df_all = pd.concat(frames, ignore_index=True)
print(f"Total filas descargadas: {len(df_all)}")

In [None]:
# Ingestion metadata
# Read the clock once so the timestamp column and the run id describe the same
# instant. datetime.utcnow() is deprecated; datetime.now(timezone.utc) with the
# tzinfo stripped yields the same naive-UTC value, so the column stays
# timezone-naive as before.
run_started_utc = datetime.now(timezone.utc).replace(tzinfo=None)

df_all['ingested_at_utc'] = run_started_utc
# The run id is now derived from the same UTC instant (it previously used the
# machine's local time, which disagrees with ingested_at_utc off-UTC hosts).
df_all['run_id'] = run_started_utc.strftime('run_%Y%m%d_%H%M')
df_all['source_name'] = 'yahoo_finance'