In [2]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-1.0-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting curl_cffi<0.14,>=0.7 (from yfinance)
  Downloading curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting websockets>=13.0 (from yfinance)
  Downloading websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manyl

In [1]:
import os
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import yfinance as yf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# 1. Ticker universe, keyed by group name. Each group is downloaded as one batch.
GROUPS = {
    "USA": ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'TSLA', 'BRK-B', 'UNH', 'V', 'JNJ', 'WMT', 'JPM', 'MA', 'PG', 'AVGO', 'HD', 'CVX', 'ORCL', 'ABBV', 'KO', 'PEP', 'COST', 'BAC', 'ADBE'],
    "INDEX": ['^GSPC', '^IXIC', '^DJI', '^RUT', '^VIX', '^FTSE', '^GDAXI', '^FCHI', '^N225', '^HSI', '^STOXX50E', 'QQQ', 'SPY', 'IWM', 'EEM', 'VWO', 'VEA', 'VNQ', 'GLD', 'SLV', 'DBC', 'TLT', 'HYG', 'VTI', 'VXUS'],
    "GPW": ['CDR.WA', 'PKO.WA', 'PKN.WA', 'KGH.WA', 'PZU.WA', 'PEO.WA', 'ALE.WA', 'LPP.WA', 'DNP.WA', 'PGE.WA', 'OPL.WA', 'SPL.WA', 'JSW.WA', 'ACP.WA', 'KRU.WA', 'MBK.WA', 'CPS.WA', 'TPE.WA', 'KTY.WA', 'ATT.WA', 'ASB.WA', 'BDX.WA', 'GPW.WA', '11B.WA', 'CCC.WA'],
    "CRYPTO": ['BTC-USD', 'ETH-USD', 'SOL-USD', 'BNB-USD', 'XRP-USD', 'ADA-USD', 'DOGE-USD', 'AVAX-USD', 'DOT-USD', 'TRX-USD', 'LINK-USD', 'SHIB-USD', 'LTC-USD', 'DAI-USD', 'BCH-USD', 'ATOM-USD', 'UNI7083-USD', 'XLM-USD', 'XMR-USD', 'ETC-USD', 'FIL-USD', 'HBAR-USD', 'NEAR-USD'],
    "COMMODITIES": ['GC=F', 'SI=F', 'CL=F', 'NG=F', 'HG=F', 'ZC=F', 'ZW=F', 'KC=F', 'CC=F', 'CT=F', 'OJ=F', 'PL=F', 'PA=F', 'SB=F', 'SOYB', 'WEAT', 'CORN', 'WOOD', 'LIT', 'REMX', 'TAN', 'FAN', 'PICK', 'URA']
}

# Flat list of every ticker across all groups.
ALL_TICKERS = [ticker for group in GROUPS.values() for ticker in group]

# 2. Paths and Spark configuration
# Bronze layer: one CSV per ticker per run is written under this directory.
BRONZE_STOCKS_PATH = Path("/home/jovyan/work/data/bronze/incremental_stocks")
BRONZE_STOCKS_PATH.mkdir(parents=True, exist_ok=True)

# The PostgreSQL JDBC driver is pulled in through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Stocks_Incremental_125_Tickers")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .getOrCreate()
)

# JDBC connection options shared by every read and write.
# Connection details come from the environment so that real credentials never
# have to live in the notebook. The fallbacks are the previous hardcoded values
# (kept so existing setups keep working) and must be overridden via
# DW_JDBC_URL / DW_USER / DW_PASSWORD in any shared environment.
DB_CONF = {
    "url": os.environ.get("DW_JDBC_URL", "jdbc:postgresql://postgres_dw:5432/currency_db"),
    "user": os.environ.get("DW_USER", "admin"),
    "password": os.environ.get("DW_PASSWORD", "password123"),
    "driver": "org.postgresql.Driver"
}

# 3. SENSOR: Sprawdzamy najnowszą datę w Postgresie
def get_last_trade_date(default=date(2025, 1, 1)):
    """Return the newest ``trade_date`` stored in ``f_stock_prices``.

    Acts as the sensor for the incremental load: the caller starts fetching
    from the day after the returned date.

    Args:
        default: Date returned when no usable value can be read, i.e. when the
            JDBC read fails for any reason or the table holds no rows.

    Returns:
        The maximum ``trade_date`` found in the table, or ``default``.
    """
    try:
        df = spark.read.format("jdbc").options(**DB_CONF).option("dbtable", "f_stock_prices").load()
        last_date = df.select(F.max("trade_date")).collect()[0][0]
    except Exception as exc:
        # Deliberately best-effort: any read failure falls back to the default
        # start date, but the reason is reported instead of being swallowed.
        print(f" Nie udało się odczytać ostatniej daty z bazy ({type(exc).__name__}: {exc}); używam daty domyślnej {default}.")
        return default

    # MAX() over an empty table yields NULL/None; returning None would make the
    # caller's `last_db_date + timedelta(days=1)` raise a TypeError.
    return default if last_date is None else last_date

# Fetch window: from the day after the newest stored row up to, but not
# including, today. Today's bar may still be forming, so it is never loaded.
last_db_date = get_last_trade_date()
start_fetch = last_db_date + timedelta(days=1)
today = date.today()

if start_fetch >= today:
    print(f" Dane giełdowe są aktualne. Ostatnia data: {last_db_date}")
else:
    # Report the real size of the ticker universe rather than a hardcoded count.
    print(f" Wykryto brakujące dni ({start_fetch} do {today}). Pobieram {len(ALL_TICKERS)} tickerów...")

    # 4. DOWNLOAD (Bronze - physical CSV files)
    all_new_dfs = []

    # One request per group keeps each batch small.
    for group_name, tickers in GROUPS.items():
        print(f"--- Grupa: {group_name} ---")
        # `end` is exclusive in yfinance, so passing today excludes today's
        # partial session. Without it the unfinished bar would be appended and
        # never corrected, because the next run starts after that date.
        data = yf.download(
            tickers,
            start=start_fetch,
            end=today,
            progress=False,
            group_by='ticker',
        )

        for ticker in tickers:
            try:
                # A multi-ticker download is indexed by ticker first; a
                # single-ticker download is used as-is.
                ticker_frame = data[ticker] if len(tickers) > 1 else data
                t_data = ticker_frame.dropna().reset_index()

                # Keep only rows inside the requested window, so a stray row
                # outside it can never duplicate an already stored
                # (ticker, trade_date) pair on append.
                trade_days = pd.to_datetime(t_data["Date"]).dt.date
                in_window = (trade_days >= start_fetch) & (trade_days < today)
                t_data = t_data[in_window]

                if t_data.empty:
                    continue

                t_data = t_data.assign(ticker=ticker, group_name=group_name)

                # Bronze: persist the raw per-ticker frame as CSV.
                file_name = BRONZE_STOCKS_PATH / f"{ticker}_{today}.csv"
                t_data.to_csv(file_name, index=False)

                all_new_dfs.append(t_data)
            except Exception as e:
                # Best-effort per ticker: skip it, but show why it was skipped.
                print(f"   Pominąłem {ticker}: brak nowych danych lub błąd. ({type(e).__name__}: {e})")

    # 5. LOAD INTO POSTGRES (Silver - append)
    if all_new_dfs:
        combined_pandas_df = pd.concat(all_new_dfs, ignore_index=True)
        spark_df = spark.createDataFrame(combined_pandas_df)

        # Project and cast the columns to the names and types written to
        # f_stock_prices.
        final_stocks = spark_df.select(
            F.col("ticker"),
            F.col("Date").cast("date").alias("trade_date"),
            F.col("Open").cast("decimal(14,4)").alias("open_price"),
            F.col("High").cast("decimal(14,4)").alias("high_price"),
            F.col("Low").cast("decimal(14,4)").alias("low_price"),
            F.col("Close").cast("decimal(14,4)").alias("close_price"),
            F.col("Volume").cast("bigint").alias("volume")
        )

        # The projection does not filter rows, so the pandas length equals the
        # Spark row count and avoids running an extra Spark job just to log it.
        print(f" Zapisuję {len(combined_pandas_df)} nowych wierszy do bazy...")

        final_stocks.write \
            .format("jdbc") \
            .options(**DB_CONF) \
            .option("dbtable", "f_stock_prices") \
            .mode("append") \
            .save()

        print(f" GOTOWE! Twoja baza została zasilona nowymi danymi z {len(all_new_dfs)} tickerów.")
    else:
        print(" Brak nowych zamkniętych sesji giełdowych do pobrania.")

✅ Dane giełdowe są aktualne. Ostatnia data: 2026-01-05
