In [2]:
!pip install yfinance

Collecting yfinance
  Downloading yfinance-1.0-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.7-py3-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting curl_cffi<0.14,>=0.7 (from yfinance)
  Downloading curl_cffi-0.13.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting websockets>=13.0 (from yfinance)
  Downloading websockets-15.0.1-cp311-cp311-manylinux_2_5_x86_64.manyl

In [1]:
import datetime
import os
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import yfinance as yf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# 1. Full ticker universe, keyed by asset group.
# Group sizes: USA 25, INDEX 25, GPW 25, CRYPTO 22, COMMODITIES 24 (121 symbols in total).
GROUPS = {
    "USA": [
        "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA",
        "META", "TSLA", "BRK-B", "UNH", "V",
        "JNJ", "WMT", "JPM", "MA", "PG",
        "AVGO", "HD", "CVX", "ORCL", "ABBV",
        "KO", "PEP", "COST", "BAC", "ADBE",
    ],
    "INDEX": [
        "^GSPC", "^IXIC", "^DJI", "^RUT", "^VIX",
        "^FTSE", "^GDAXI", "^FCHI", "^N225", "^HSI",
        "^STOXX50E", "QQQ", "SPY", "IWM", "EEM",
        "VWO", "VEA", "VNQ", "GLD", "SLV",
        "DBC", "TLT", "HYG", "VTI", "VXUS",
    ],
    "GPW": [
        "CDR.WA", "PKO.WA", "PKN.WA", "KGH.WA", "PZU.WA",
        "PEO.WA", "ALE.WA", "LPP.WA", "DNP.WA", "PGE.WA",
        "OPL.WA", "SPL.WA", "JSW.WA", "ACP.WA", "KRU.WA",
        "MBK.WA", "CPS.WA", "TPE.WA", "KTY.WA", "ATT.WA",
        "ASB.WA", "BDX.WA", "GPW.WA", "11B.WA", "CCC.WA",
    ],
    "CRYPTO": [
        "BTC-USD", "ETH-USD", "SOL-USD", "BNB-USD", "XRP-USD",
        "ADA-USD", "DOGE-USD", "AVAX-USD", "DOT-USD", "TRX-USD",
        "LINK-USD", "SHIB-USD", "LTC-USD", "DAI-USD", "BCH-USD",
        "ATOM-USD", "XLM-USD", "XMR-USD", "ETC-USD", "FIL-USD",
        "HBAR-USD", "NEAR-USD",
    ],
    "COMMODITIES": [
        "GC=F", "SI=F", "CL=F", "NG=F", "HG=F",
        "ZC=F", "ZW=F", "KC=F", "CC=F", "CT=F",
        "OJ=F", "PL=F", "PA=F", "SB=F", "SOYB",
        "WEAT", "CORN", "WOOD", "LIT", "REMX",
        "TAN", "FAN", "PICK", "URA",
    ],
}

# 2. Path and Spark configuration
# Bronze-layer target directory for the per-ticker CSV files.
BRONZE_STOCKS_PATH = Path("/home/jovyan/work/data/bronze/incremental_stocks")
# Create the directory together with any missing parents; an existing one is not an error.
BRONZE_STOCKS_PATH.mkdir(parents=True, exist_ok=True)

# Spark session; the PostgreSQL JDBC driver is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Stocks_Incremental_125_Tickers")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.6.0")
    .getOrCreate()
)

# JDBC connection options used by the Spark reads and writes in this notebook.
# URL, user and password can be supplied through environment variables so that
# credentials do not have to live in the notebook; the literals below are only
# fallbacks that keep the existing local setup working unchanged.
# NOTE(review): remove the fallback password once STOCKS_DB_PASSWORD is provisioned.
DB_CONF = {
    "url": os.environ.get("STOCKS_DB_URL", "jdbc:postgresql://postgres_dw:5432/currency_db"),
    "user": os.environ.get("STOCKS_DB_USER", "admin"),
    "password": os.environ.get("STOCKS_DB_PASSWORD", "password123"),
    "driver": "org.postgresql.Driver",
}

# 3. Sensor: check how far the US session (AAPL) has been loaded
def get_last_trade_date_aapl(fallback_date=date(2025, 1, 1)):
    """Return the latest ``trade_date`` stored for AAPL in ``f_stock_prices``.

    AAPL is the notebook's reference ticker: its newest stored date is treated
    as the point up to which the warehouse has been loaded.

    Args:
        fallback_date: Value returned when no date can be determined, i.e. the
            query yields NULL or the JDBC read raises. Defaults to 2025-01-01.

    Returns:
        The ``MAX(trade_date)`` value read from the database, or
        ``fallback_date``.

    The read stays best-effort (it never raises), but a failure is now printed
    instead of being swallowed: silently falling back makes the caller start
    downloading from ``fallback_date`` again, which should not go unnoticed.
    """
    # Aggregate inside PostgreSQL so that only one row comes back over JDBC.
    query = "(SELECT MAX(trade_date) as max_date FROM f_stock_prices WHERE ticker = 'AAPL') as sub"
    try:
        rows = spark.read.format("jdbc").options(**DB_CONF).option("dbtable", query).load().collect()
    except Exception as exc:
        print(
            f"UWAGA: nie udało się odczytać ostatniej daty AAPL z bazy ({exc!r}). "
            f"Przyjmuję datę startową {fallback_date}."
        )
        return fallback_date

    max_date = rows[0][0] if rows else None
    # MAX() over zero matching rows is NULL, which arrives here as None.
    return max_date if max_date is not None else fallback_date

last_db_date = get_last_trade_date_aapl()
start_fetch = last_db_date + timedelta(days=1)
today = date.today()
now_hour = datetime.datetime.now().hour

# Run gate: work out whether this execution should download anything.
#  - first missing day lies in the future  -> the data is already current;
#  - first missing day is today and the local clock is before 22:00 -> wait,
#    so that no incomplete rows are fetched before the US close.
if start_fetch > today:
    _skip_message = f"--- STATUS: Dane są aktualne. (Ostatnie Apple: {last_db_date}) ---"
elif start_fetch == today and now_hour < 22:
    _skip_message = f"--- STATUS: Mamy {today}, ale czekam do 22:00 na zamknięcie USA, by uniknąć pustych danych. ---"
else:
    _skip_message = None

should_run = _skip_message is None
if not should_run:
    print(_skip_message)

if should_run:
    print(f"--- START: Pobieram dane od {start_fetch} do {today} ---")
    all_new_dfs = []
    # Before 22:00 (local clock) today's bar may still be forming (no Close yet),
    # so it is left for a later run.
    skip_today = now_hour < 22

    # 4. Download group by group and stage every ticker as a bronze CSV
    for group_name, tickers in GROUPS.items():
        print(f"Pobieram grupę: {group_name}...")
        # One batched request per group is faster than one request per ticker.
        data = yf.download(tickers, start=start_fetch, progress=False, group_by='ticker')

        for ticker in tickers:
            try:
                # Pull this ticker's sub-frame out of the yfinance MultiIndex result.
                if len(tickers) > 1:
                    t_data = data[ticker].dropna(how='all').reset_index()
                else:
                    t_data = data.dropna(how='all').reset_index()

                if t_data.empty:
                    continue
                if skip_today:
                    t_data = t_data[t_data['Date'].dt.date < today]
                    if t_data.empty:
                        continue

                # assign() builds a new frame, so the filtered slice is never mutated in place.
                t_data = t_data.assign(ticker=ticker, group_name=group_name)

                # Bronze layer (CSV): one file per ticker per run date.
                file_name = BRONZE_STOCKS_PATH / f"{ticker}_{today}.csv"
                t_data.to_csv(file_name, index=False)
                all_new_dfs.append(t_data)
            except Exception as exc:
                # Best effort: one bad ticker must not abort the batch, but the
                # failure has to be visible instead of being dropped silently.
                # NOTE(review): a ticker skipped here is not retried once AAPL's
                # date advances, because the sensor only tracks AAPL.
                print(f"UWAGA: pominięto {ticker} ({group_name}): {exc!r}")

    # 5. Load into PostgreSQL (silver)
    if all_new_dfs:
        combined_pandas_df = pd.concat(all_new_dfs)
        spark_df = spark.createDataFrame(combined_pandas_df)

        final_stocks = spark_df.select(
            F.col("ticker"),
            F.col("Date").cast("date").alias("trade_date"),
            F.col("Open").cast("decimal(14,4)").alias("open_price"),
            F.col("High").cast("decimal(14,4)").alias("high_price"),
            F.col("Low").cast("decimal(14,4)").alias("low_price"),
            F.col("Close").cast("decimal(14,4)").alias("close_price"),
            F.col("Volume").cast("bigint").alias("volume")
        ).distinct()  # drop duplicates produced within this download

        # The sensor follows AAPL only. Markets that trade while the US is closed
        # (e.g. crypto at weekends) are therefore downloaded again on every run
        # until AAPL advances, and a plain append would duplicate them. Keep only
        # (ticker, trade_date) pairs that the table does not contain yet.
        try:
            existing_keys_query = (
                "(SELECT ticker, trade_date FROM f_stock_prices "
                f"WHERE trade_date >= DATE '{start_fetch:%Y-%m-%d}') as existing"
            )
            existing_keys = (
                spark.read.format("jdbc")
                .options(**DB_CONF)
                .option("dbtable", existing_keys_query)
                .load()
            )
            final_stocks = final_stocks.join(
                existing_keys, on=["ticker", "trade_date"], how="left_anti"
            )
        except Exception as exc:
            # E.g. the very first run, when the table does not exist yet and
            # append mode is what creates it.
            print(
                f"UWAGA: nie udało się odczytać istniejących wierszy ({exc!r}); "
                "zapisuję bez deduplikacji względem bazy."
            )

        # Row count and newest date are taken in one pass, BEFORE the write:
        # after the write the anti-join above would filter these rows out.
        new_row_count, latest_loaded = final_stocks.agg(
            F.count(F.lit(1)), F.max("trade_date")
        ).collect()[0]

        if new_row_count:
            print(f"Zapisuję {new_row_count} nowych wierszy do bazy...")

            final_stocks.write \
                .format("jdbc") \
                .options(**DB_CONF) \
                .option("dbtable", "f_stock_prices") \
                .mode("append") \
                .save()

            # Report the newest date actually written; "today - 1" was wrong
            # whenever today's rows were included (runs at or after 22:00).
            print(f"SUKCES! Baza zaktualizowana do daty: {latest_loaded}")
        else:
            print("Brak nowych, zamkniętych danych do zapisu.")
    else:
        print("Brak nowych, zamkniętych danych do zapisu.")

--- START: Pobieram dane od 2026-01-09 do 2026-01-10 ---
Pobieram grupę: USA...



1 Failed download:
['JNJ']: OperationalError('database is locked')


Pobieram grupę: INDEX...
Pobieram grupę: GPW...
Pobieram grupę: CRYPTO...
Pobieram grupę: COMMODITIES...
Zapisuję 120 nowych wierszy do bazy...
SUKCES! Baza zaktualizowana do daty: 2026-01-09


Test działania kodu na daną datę/ticker


In [3]:
# Spot check: show the rows stored in f_stock_prices for a single trade date.
(
    spark.read.format("jdbc")
    .options(**DB_CONF)
    .option("dbtable", "f_stock_prices")
    .load()
    .filter("trade_date = '2026-01-05'")
    .show()
)

+--------+----------+----------+----------+---------+-----------+---------+
|  ticker|trade_date|open_price|high_price|low_price|close_price|   volume|
+--------+----------+----------+----------+---------+-----------+---------+
| LTC-USD|2026-01-05|     82.16|     83.44|    81.66|      81.81|386006752|
| DAI-USD|2026-01-05|      1.00|      1.00|     1.00|       1.00| 74932208|
|NEAR-USD|2026-01-05|      1.73|      1.79|     1.72|       1.72|191494656|
|AVAX-USD|2026-01-05|     14.23|     14.49|    14.13|      14.22|373752544|
|    ^VIX|2026-01-05|     15.14|     15.26|    15.08|      15.16|        0|
|   ^FTSE|2026-01-05|   9951.45|  10022.05|  9951.45|    9961.46|        0|
| DOT-USD|2026-01-05|      2.14|      2.17|     2.11|       2.12|170998416|
| TRX-USD|2026-01-05|      0.29|      0.30|     0.29|       0.29|562142848|
|  ALE.WA|2026-01-05|     31.45|     31.47|    31.11|      31.11|   520203|
|  LPP.WA|2026-01-05|  21210.00|  21390.00| 21040.00|   21240.00|      747|
|  DNP.WA|20

Testing

In [1]:
import yfinance as yf
from datetime import date, timedelta
import pandas as pd

# Test configuration
ticker_to_test = "TSLA"
target_date = "2026-01-15"
# yfinance needs `end` to be the day AFTER the date we want returned.
# Derive it from target_date so the two values cannot drift apart when edited.
next_day = (date.fromisoformat(target_date) + timedelta(days=1)).isoformat()

print(f"--- START TESTU DLA {ticker_to_test} ---")

# Download the data for exactly that one day.
df = yf.download(ticker_to_test, start=target_date, end=next_day, progress=False)

if df.empty:
    print(f"!!! BŁĄD: yfinance zwrócił pusty zestaw danych dla {target_date}.")
    print("Możliwe powody: Dane nie są jeszcze dostępne w API lub giełda była zamknięta.")
else:
    print(f"SUKCES! Pobrano dane dla {ticker_to_test}:")
    # Reset the index so that the date shows up as a regular column.
    df_show = df.reset_index()
    print(df_show.to_string())

print("--- KONIEC TESTU ---")

--- START TESTU DLA TSLA ---
SUKCES! Pobrano dane dla TSLA:
Price        Date       Close        High         Low        Open    Volume
Ticker                   TSLA        TSLA        TSLA        TSLA      TSLA
0      2026-01-15  438.570007  445.359985  437.649994  441.130005  49367500
--- KONIEC TESTU ---
