In [156]:
import pandas as pd
import yfinance as yf
import duckdb
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [112]:
class YahooClient:
    def __init__(self, ticker: str):
        self.ticker = ticker
        self._yf = yf.Ticker(ticker)
    
    def identification(self):
        isin = self._yf.get_isin()
        ticker = self.ticker
        return {"ticker": ticker, "isin": isin}
    
    def mutualfund_holders(self):
        return self._yf.get_mutualfund_holders()

    def institutional_holders(self):
        return self._yf.get_institutional_holders()
    
    def major_holders(self):
        return self._yf.get_major_holders()

    def dividends(self):
        return self._yf.get_dividends()
    
    def shares_outstanding(self):
        return self._yf.get_shares_outstanding()
    
    def income_statement(self):
        return self._yf.get_income_stmt()
    
    def balance_sheet(self):
        return self._yf.get_balance_sheet()

    def cashflow(self):
        return self._yf.get_cashflow()
    def get_all_yf_data(self):
        # Wir definieren, welche Methoden wir abrufen wollen
        methods = {
            "balance_sheet": self.balance_sheet,
            "income_statement": self.income_statement,
            "cashflow": self.cashflow,
            "dividends": self.dividends,
            "mutualfund_holders": self.mutualfund_holders,
            "institutional_holders": self.institutional_holders,
            "major_holders": self.major_holders,
            
        }
        data_package = {}
        
        for name, method in methods.items():
            try:
                # Wir führen die Methode aus
                data = method()
                if data is not None and not data.empty:
                    data_package[name] = data
            except Exception as e:
                print(f"Fehler beim Abrufen von {name} für {self.ticker}: {e}")
                
        return data_package
    

In [166]:
# YahooClient("AAPL").mutualfund_holders()
# #YahooClient("AAPL").institutional_holders()
YahooClient("AAPL").major_holders().reset_index().columns
# #YahooClient("AAPL").balance_sheet()

Index(['index', 'Value'], dtype='object', name='Breakdown')

In [167]:
class FinanceTransformer:
    @staticmethod
    def transform_financial_statement(df_raw, ticker, affiliation):
        """Transformiert Income Statement, Balance Sheet oder Cashflow."""
        if df_raw is None or df_raw.empty:
            return pd.DataFrame()
        
        # Das typische yfinance Format ist: Zeilen = Items, Spalten = Daten
        # Wir müssen es in das 'Long-Format' schmelzen (Melt)
        df = df_raw.melt(ignore_index=False, var_name='date', value_name='value')
        df = df.reset_index().rename(columns={'index': 'item_description'})
        
        df['ticker'] = ticker
        df['affiliation'] = affiliation
        
        # Datentyp-Härtung
        df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None).dt.date
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        
        return df[['ticker', 'date', 'affiliation', 'item_description', 'value']]

    @staticmethod
    def transform_series(series, ticker, affiliation, description):
        """Transformiert einfache Zeitreihen (z.B. Dividenden oder Shares)."""
        if series is None or series.empty:
            return pd.DataFrame()
            
        df = series.reset_index()
        df.columns = ['date', 'value']
        
        df['ticker'] = ticker
        df['affiliation'] = affiliation
        df['item_description'] = description
        
        df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None).dt.date
        df['value'] = pd.to_numeric(df['value'], errors='coerce')
        
        return df[['ticker', 'date', 'affiliation', 'item_description', 'value']]
    
    @staticmethod
    def transform_holder_group(df_raw, ticker, affiliation):
        """Transformiert die Holder-DataFrames (Mutual, Institutional, Major)."""
        df_raw = df_raw.reset_index()
        if df_raw is None or df_raw.empty:
            return pd.DataFrame()
        print(df_raw)
        # Wir nehmen an, dass die Holder-DataFrames eine Spalte 'Holder' und 'pctHeld' haben
        if 'Value' not in df_raw.columns:
            print(f"Unerwartetes Format für Holder-Daten von {ticker}")
            return pd.DataFrame()
        
        df = df_raw.rename(columns={'index': 'item_description', 'Value': 'value'})
        df['ticker'] = ticker
        df['affiliation'] = affiliation
        df['date'] = pd.to_datetime('today').date()  # Wir setzen das Datum auf heute, da es keine Zeitreihe ist
        
        return df[['ticker', 'date', 'affiliation', 'item_description', 'value']]
    
    @staticmethod
    def transform_holders(df_raw, ticker, affiliation):
        """Transformiert die Holder-DataFrames (Mutual, Institutional, Major)."""
        
        if df_raw is None or df_raw.empty:
            return pd.DataFrame()
        
        # Wir nehmen an, dass die Holder-DataFrames eine Spalte 'Holder' und 'pctHeld' haben
        if 'Holder' not in df_raw.columns or 'pctHeld' not in df_raw.columns:
            print(f"Unerwartetes Format für Holder-Daten von {ticker}")
            return pd.DataFrame()
        
        df = df_raw.rename(columns={'Holder': 'item_description', 'pctHeld': 'value', 'Date Reported': 'date'})
        df = df.drop(columns=["Shares", "Value", "pctChange"], errors='ignore')
        df['ticker'] = ticker
        if affiliation == "mutualfund_holders":
            affiliation = "Mutual Fund Holder"
        else:
            affiliation = "Institutional Holder"
        df['affiliation'] = affiliation        
        return df[['ticker', 'date', 'affiliation', 'item_description', 'value']]

    @staticmethod
    def smart_transform(raw_data, ticker, affiliation, description=None):
        if raw_data is None or (hasattr(raw_data, 'empty') and raw_data.empty):
            return pd.DataFrame()

        # --- FALL 1: DATAFRAMES ---
        if isinstance(raw_data, pd.DataFrame):
            # Wir prüfen sowohl Spalten als auch den Index-Namen
            cols = [str(c) for c in raw_data.columns]
            index_name = raw_data.index.name
            
            # A) Check für Major Holders (oft ist 'Breakdown' der Index)
            if len(raw_data) == 4:
                return FinanceTransformer.transform_holder_group(raw_data, ticker, affiliation)
            
            # B) Check für Institutional/Mutualfund Holders
            elif 'Holder' in cols or index_name == 'Holder':
                return FinanceTransformer.transform_holders(raw_data, ticker, affiliation)
            
            # C) Standard: Finanzberichte
            else:
                return FinanceTransformer.transform_financial_statement(raw_data, ticker, affiliation)
        
        # --- FALL 2: SERIES ---
        elif isinstance(raw_data, pd.Series):
            desc = description or raw_data.name or affiliation
            return FinanceTransformer.transform_series(raw_data, ticker, affiliation, desc)
        
        return pd.DataFrame()

In [127]:
from pathlib import Path

class DataHub:
    def __init__(self, db_name="yahoo_finance.db"):
        # Pfad-Management (funktioniert in Scripts & Notebooks)
        base_path = Path.cwd()
        db_path = base_path.parent.parent / "data" / "01_raw" / "yahoo" / db_name
        db_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Verbindung herstellen
        self.con = duckdb.connect(str(db_path))
        self._initialize_tables()
        print(f"DuckDB verbunden: {db_path}")

    def _initialize_tables(self):
        """Erstellt die Tabellenstruktur, falls sie noch nicht existiert."""
        self.con.execute("""
            CREATE TABLE IF NOT EXISTS bronze_financials (
                ticker VARCHAR,
                date DATE,
                affiliation VARCHAR,
                item_description VARCHAR,
                value DOUBLE,
                ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        """)
        
        # Index für schnellere Abfragen später (wichtig bei 30.000 Tickern)
        self.con.execute("CREATE INDEX IF NOT EXISTS idx_ticker_date ON bronze_financials (ticker, date);")

    def insert_financials(self, df: pd.DataFrame):
        """Speichert ein einheitliches DataFrame in die Datenbank."""
        if df is None or df.empty:
            return

        # Wir nutzen den 'Appender' Mechanismus von DuckDB für maximale Speed
        # DuckDB erkennt das DataFrame 'df' im lokalen Python-Scope automatisch
        try:
            self.con.execute("INSERT INTO bronze_financials (ticker, date, affiliation, item_description, value) SELECT ticker, date, affiliation, item_description, value FROM df")
        except Exception as e:
            print(f"Fehler beim Insert: {e}")
    
    def preview_data(self, limit=10):
        """Holt die neuesten Einträge als DataFrame zur Kontrolle."""
        query = f"SELECT * FROM bronze_financials"
        return self.con.execute(query).df()

    def get_summary_stats(self):
        """Gibt eine kleine Statistik über den Füllstand der DB aus."""
        return self.con.execute("""
            SELECT 
                COUNT(DISTINCT ticker) as count_tickers, 
                COUNT(*) as total_rows 
            FROM bronze_financials
        """).df()

    def close(self):
        """Schließt die Verbindung sauber."""
        self.con.close()

    def clear_database(self):
        """
        Löscht alle Daten und Tabellen aus der DuckDB-Datenbank.
        """
        try:
            # Wir holen uns alle Tabellennamen im aktuellen Schema (default: main)
            tables = self.con.execute("SHOW TABLES").fetchall()
            
            if not tables:
                print("Datenbank ist bereits leer.")
                return

            print(f"Lösche {len(tables)} Tabellen...")
            
            # Jede Tabelle einzeln löschen
            for (table_name,) in tables:
                self.con.execute(f"DROP TABLE IF EXISTS {table_name}")
                
            print("Datenbank wurde erfolgreich geleert.")
            
        except Exception as e:
            print(f"Fehler beim Leeren der Datenbank: {e}")

In [121]:
class YahooIngestor:
    def __init__(self, tickers: list[str]):
        self.tickers = tickers
    def ingest(self):
        hub = DataHub()
        for ticker in self.tickers:
            client = YahooClient(ticker)
            data = client.get_all_yf_data()
            for model_name, raw_data in data.items():
                if raw_data is None or (hasattr(raw_data, 'empty') and raw_data.empty):
                    print(f"Keine Daten für {model_name} bei {client.ticker}")
                    continue
                clean_df = FinanceTransformer.smart_transform(raw_data, ticker=client.ticker, affiliation=model_name)
                hub.insert_financials(clean_df)
        hub.close()

In [168]:
DataHub().clear_database()

DuckDB verbunden: c:\Diversification\data\01_raw\yahoo\yahoo_finance.db
Lösche 1 Tabellen...
Datenbank wurde erfolgreich geleert.


In [169]:
YahooIngestor(tickers=["AAPL"]).ingest()
DataHub().preview_data()

DuckDB verbunden: c:\Diversification\data\01_raw\yahoo\yahoo_finance.db
Breakdown                         index       Value
0                   insidersPercentHeld     0.01702
1               institutionsPercentHeld     0.65046
2          institutionsFloatPercentHeld     0.66173
3                     institutionsCount  7190.00000
DuckDB verbunden: c:\Diversification\data\01_raw\yahoo\yahoo_finance.db


Unnamed: 0,ticker,date,affiliation,item_description,value,ingested_at
0,AAPL,2025-09-30,balance_sheet,TreasurySharesNumber,,2026-02-08 21:04:06.725363
1,AAPL,2025-09-30,balance_sheet,OrdinarySharesNumber,14773260000.0,2026-02-08 21:04:06.725363
2,AAPL,2025-09-30,balance_sheet,ShareIssued,14773260000.0,2026-02-08 21:04:06.725363
3,AAPL,2025-09-30,balance_sheet,NetDebt,62723000000.0,2026-02-08 21:04:06.725363
4,AAPL,2025-09-30,balance_sheet,TotalDebt,98657000000.0,2026-02-08 21:04:06.725363
5,AAPL,2025-09-30,balance_sheet,TangibleBookValue,73733000000.0,2026-02-08 21:04:06.725363
6,AAPL,2025-09-30,balance_sheet,InvestedCapital,172390000000.0,2026-02-08 21:04:06.725363
7,AAPL,2025-09-30,balance_sheet,WorkingCapital,-17674000000.0,2026-02-08 21:04:06.725363
8,AAPL,2025-09-30,balance_sheet,NetTangibleAssets,73733000000.0,2026-02-08 21:04:06.725363
9,AAPL,2025-09-30,balance_sheet,CapitalLeaseObligations,,2026-02-08 21:04:06.725363
