In [4]:
import pandas as pd
import yfinance as yf
import duckdb
import re
# Disable truncation when DataFrames are rendered: both the column limit and
# the row limit are set to None (unlimited).
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)

In [5]:
import logging

# Logger configuration: records of level INFO and above are written to
# 'errors.log', each prefixed with timestamp and level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='errors.log',
)

In [6]:
class YahooClient:
    """Thin wrapper around a ``yfinance.Ticker`` with one method per dataset."""

    def __init__(self, ticker: str):
        """Store the ticker symbol and create the underlying yfinance handle."""
        self.ticker = ticker
        self._yf = yf.Ticker(ticker)

    def identification(self):
        """Return the value of ``get_isin()`` wrapped in a one-cell DataFrame."""
        return pd.DataFrame([self._yf.get_isin()])

    def mutualfund_holders(self):
        """Return the result of ``get_mutualfund_holders()`` unchanged."""
        return self._yf.get_mutualfund_holders()

    def institutional_holders(self):
        """Return the result of ``get_institutional_holders()`` unchanged."""
        return self._yf.get_institutional_holders()

    def major_holders(self):
        """Return the result of ``get_major_holders()`` unchanged."""
        return self._yf.get_major_holders()

    def dividends(self):
        """Return the result of ``get_dividends()`` unchanged."""
        return self._yf.get_dividends()

    def shares_outstanding(self):
        """Return the result of ``get_shares_full()`` unchanged."""
        return self._yf.get_shares_full()

    def income_statement(self):
        """Return the result of ``get_income_stmt()`` unchanged."""
        return self._yf.get_income_stmt()

    def balance_sheet(self):
        """Return the result of ``get_balance_sheet()`` unchanged."""
        return self._yf.get_balance_sheet()

    def cashflow(self):
        """Return the result of ``get_cashflow()`` unchanged."""
        return self._yf.get_cashflow()

    def get_all_yf_data(self):
        """Fetch every configured dataset and return the non-empty ones.

        Returns:
            dict mapping dataset name to the object returned by the matching
            method. Datasets that are ``None``, empty, or whose retrieval
            raised are left out.

        A failure in one dataset never aborts the others: the error is printed
        and also sent to the ``logging`` module so it reaches the configured
        log file, then the loop moves on.
        """
        # Datasets to retrieve, in retrieval order.
        methods = {
            # "identification": self.identification,  (disabled)
            "balance_sheet": self.balance_sheet,
            "income_statement": self.income_statement,
            "cashflow": self.cashflow,
            "dividends": self.dividends,
            "shares_outstanding": self.shares_outstanding,
            "mutualfund_holders": self.mutualfund_holders,
            "institutional_holders": self.institutional_holders,
            "major_holders": self.major_holders,
        }
        data_package = {}

        for name, method in methods.items():
            try:
                data = method()
                # The emptiness check stays inside the try so that an object
                # without ``.empty`` is reported like any other fetch error.
                if data is not None and not data.empty:
                    data_package[name] = data
            except Exception as e:
                msg = f"Fehler beim Abrufen von {name} für {self.ticker}: {e}"
                print(msg)          # console feedback while the notebook runs
                logging.error(msg)  # persistent record in the log file

        return data_package
    

In [7]:
class FinanceTransformer:
    """Converts raw yfinance objects into one shared long-format layout.

    Every transformer returns a DataFrame with the columns
    ``ticker, date, affiliation, item_description, value``. The transformers
    that validate their input return an empty DataFrame when the input does
    not have the expected shape.
    """

    # Column order shared by all transformers.
    _OUTPUT_COLUMNS = ['ticker', 'date', 'affiliation', 'item_description', 'value']

    @staticmethod
    def transform_identification(df_raw, ticker, affiliation):
        """Transform the identification data (ISIN).

        Args:
            df_raw: DataFrame whose first cell holds the ISIN.
            ticker: Ticker symbol written to the ``ticker`` column.
            affiliation: Dataset name written to the ``affiliation`` column.

        Returns:
            DataFrame in the shared layout, dated with today's date.
        """
        # Work on a copy so the caller's frame is not altered.
        df = df_raw.copy()
        isin = df.iloc[0, 0]  # ISIN value from the first cell
        df['ticker'] = ticker
        df['affiliation'] = affiliation
        df['item_description'] = 'isin'
        df['date'] = pd.to_datetime('today').date()
        df['value'] = isin

        return df[FinanceTransformer._OUTPUT_COLUMNS]

    @staticmethod
    def transform_financial_statement(df_raw, ticker, affiliation, date_pattern = r'^\d{4}-\d{2}-\d{2}$'):
        """Transform an income statement, balance sheet or cash-flow frame.

        The input is expected to have one row per line item and one column per
        reporting date; it is melted into the long format.

        Args:
            df_raw: Wide statement frame (rows = items, columns = dates).
            ticker: Ticker symbol written to the ``ticker`` column.
            affiliation: Dataset name written to the ``affiliation`` column.
            date_pattern: Not used by the implementation; kept so existing
                callers that pass it keep working.

        Returns:
            Long-format DataFrame, or an empty DataFrame when a column label
            cannot be parsed as a date.
        """
        if pd.to_datetime(df_raw.columns, errors='coerce').isna().any():
            print(f"Format-Fehler: Spaltennamen von {affiliation} für {ticker} entsprechen nicht dem Datumsmuster.")
            print(df_raw.head())
            return pd.DataFrame()

        # Naming the index before reset_index guarantees the item column is
        # called 'item_description' whether or not the raw index had a name.
        df = (
            df_raw
            .melt(ignore_index=False, var_name='date', value_name='value')
            .rename_axis('item_description')
            .reset_index()
        )

        df['ticker'] = ticker
        df['affiliation'] = affiliation

        # Type hardening: plain dates without timezone, numeric values.
        df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None).dt.date
        df['value'] = pd.to_numeric(df['value'], errors='coerce')

        return df[FinanceTransformer._OUTPUT_COLUMNS]

    @staticmethod
    def transform_series(series, ticker, affiliation):
        """Transform a date-indexed Series into the shared layout.

        The index becomes ``date``, the values become ``value`` and the
        affiliation doubles as ``item_description``.
        """
        df = series.reset_index()
        df.columns = ['date', 'value']

        df['ticker'] = ticker
        df['affiliation'] = affiliation
        df['item_description'] = affiliation

        df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None).dt.date
        df['value'] = pd.to_numeric(df['value'], errors='coerce')

        return df[FinanceTransformer._OUTPUT_COLUMNS]

    @staticmethod
    def transform_holder_group(df_raw, ticker, affiliation):
        """Transform the holder-group breakdown (major holders).

        After ``reset_index`` the frame must contain the columns ``index``
        (used as item description) and ``Value``; otherwise a message is
        printed and an empty DataFrame is returned. Rows are dated with
        today's date.
        """
        flat = df_raw.reset_index()
        if 'Value' not in flat.columns or 'index' not in flat.columns:
            print(f"Unerwartetes Format der Eigentümer Grupen (Major Holders) von {ticker}")
            print(flat.head())
            return pd.DataFrame()

        df = flat.rename(columns={'index': 'item_description', 'Value': 'value'})
        df['ticker'] = ticker
        df['affiliation'] = affiliation
        df['date'] = pd.to_datetime('today').date()

        return df[FinanceTransformer._OUTPUT_COLUMNS]

    @staticmethod
    def transform_holders(df_raw, ticker, affiliation):
        """Transform a holder list (mutual fund or institutional holders).

        Requires the columns ``Holder``, ``pctHeld`` and ``Date Reported``;
        when one is missing a message is printed and an empty DataFrame is
        returned instead of failing later with a ``KeyError``.
        """
        required = ('Holder', 'pctHeld', 'Date Reported')
        if any(column not in df_raw.columns for column in required):
            print(f"Unerwartetes Format für Eigentümer (Mutual Funds & Institutional) von {ticker}")
            print(df_raw.head())
            return pd.DataFrame()

        df = df_raw.rename(columns={'Holder': 'item_description', 'pctHeld': 'value', 'Date Reported': 'date'})
        df = df.drop(columns=["Shares", "Value", "pctChange"], errors='ignore')
        df['ticker'] = ticker
        df['affiliation'] = affiliation

        # Same type hardening as the other transformers.
        df['date'] = pd.to_datetime(df['date']).dt.tz_localize(None).dt.date
        df['value'] = pd.to_numeric(df['value'], errors='coerce')

        return df[FinanceTransformer._OUTPUT_COLUMNS]

    @staticmethod
    def smart_transform(raw_data, ticker, affiliation, description=None):
        """Pick the transformer that matches the object type and affiliation.

        Args:
            raw_data: DataFrame or Series to transform.
            ticker: Ticker symbol forwarded to the transformer.
            affiliation: Dataset name; selects the transformer for DataFrames.
            description: Not used by the implementation; kept for
                compatibility with existing callers.

        Returns:
            The transformed DataFrame, or an empty DataFrame when no
            transformer applies.
        """
        # --- CASE 1: DATAFRAMES ---
        if isinstance(raw_data, pd.DataFrame):
            # A) Major holders
            if affiliation == 'major_holders':
                return FinanceTransformer.transform_holder_group(raw_data, ticker, affiliation)

            # B) Institutional / mutual fund holders
            elif affiliation in ('institutional_holders', 'mutualfund_holders'):
                return FinanceTransformer.transform_holders(raw_data, ticker, affiliation)

            # C) Financial statements
            elif affiliation in ('balance_sheet', 'income_statement', 'cashflow'):
                return FinanceTransformer.transform_financial_statement(raw_data, ticker, affiliation)

            # D) Identification (ISIN)
            elif affiliation == 'identification':
                return FinanceTransformer.transform_identification(raw_data, ticker, affiliation)

        # --- CASE 2: SERIES ---
        elif isinstance(raw_data, pd.Series):
            return FinanceTransformer.transform_series(raw_data, ticker, affiliation)

        return pd.DataFrame()

In [8]:
from pathlib import Path

class DataHub:
    """Owns the DuckDB connection and the ``bronze_financials`` table."""

    def __init__(self, db_name="yahoo_finance.db"):
        """Open (creating if necessary) the database file and its table.

        The file is placed under ``data/01_raw/yahoo`` two directory levels
        above the current working directory; missing folders are created.
        """
        base_path = Path.cwd()
        db_path = base_path.parent.parent / "data" / "01_raw" / "yahoo" / db_name
        db_path.parent.mkdir(parents=True, exist_ok=True)

        # Open the connection
        self.con = duckdb.connect(str(db_path))
        self._initialize_tables()
        print(f"DuckDB verbunden: {db_path}")

    def _initialize_tables(self):
        """Create the table structure and index if they do not exist yet."""
        self.con.execute("""
            CREATE TABLE IF NOT EXISTS bronze_financials (
                ticker VARCHAR,
                date DATE,
                affiliation VARCHAR,
                item_description VARCHAR,
                value DOUBLE,
                ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
        """)

        # Index on (ticker, date) for faster look-ups later on.
        self.con.execute("CREATE INDEX IF NOT EXISTS idx_ticker_date ON bronze_financials (ticker, date);")

    def insert_financials(self, df: pd.DataFrame):
        """Append a DataFrame in the shared layout to ``bronze_financials``.

        ``None`` and empty frames are ignored. A failing insert does not
        raise: the error is printed and also sent to the ``logging`` module.
        """
        if df is None or df.empty:
            return

        # The SQL refers to the local variable ``df`` by name; DuckDB resolves
        # it from the Python scope, so the variable must keep this name.
        try:
            self.con.execute("INSERT INTO bronze_financials (ticker, date, affiliation, item_description, value) SELECT ticker, date, affiliation, item_description, value FROM df")
        except Exception as e:
            msg = f"Fehler beim Insert: {e}"
            print(msg)
            logging.error(msg)

    def preview_data(self, limit=10):
        """Return the content of ``bronze_financials`` as a DataFrame.

        NOTE: ``limit`` is accepted but not applied - the whole table is
        returned, in no particular order. Callers in this notebook compute
        statistics over the full result, so the behaviour is left as is.
        """
        query = "SELECT * FROM bronze_financials"
        return self.con.execute(query).df()

    def get_summary_stats(self):
        """Return a one-row DataFrame with distinct ticker count and row count."""
        return self.con.execute("""
            SELECT 
                COUNT(DISTINCT ticker) as count_tickers, 
                COUNT(*) as total_rows 
            FROM bronze_financials
        """).df()

    def close(self):
        """Close the database connection."""
        self.con.close()

    def clear_database(self):
        """Drop every table listed by ``SHOW TABLES``.

        This also removes ``bronze_financials`` itself. Errors are printed
        and logged instead of being raised.
        """
        try:
            # All table names in the current schema.
            tables = self.con.execute("SHOW TABLES").fetchall()

            if not tables:
                print("Datenbank ist bereits leer.")
                return

            print(f"Lösche {len(tables)} Tabellen...")

            # Drop each table individually. The name is quoted as an SQL
            # identifier so reserved words or special characters cannot
            # break (or alter) the statement.
            for (table_name,) in tables:
                quoted_name = '"' + str(table_name).replace('"', '""') + '"'
                self.con.execute(f"DROP TABLE IF EXISTS {quoted_name}")

            print("Datenbank wurde erfolgreich geleert.")

        except Exception as e:
            msg = f"Fehler beim Leeren der Datenbank: {e}"
            print(msg)
            logging.error(msg)

In [9]:
class YahooIngestor:
    """Fetches, transforms and stores the Yahoo datasets for a list of tickers."""

    def __init__(self, tickers: list[str]):
        """Remember the ticker symbols to ingest."""
        self.tickers = tickers

    def ingest(self):
        """Run fetch -> transform -> insert for every ticker.

        Datasets that are ``None``, not a pandas object, or empty are reported
        (console and log) and skipped. The database connection is closed even
        when an exception interrupts the run.
        """
        hub = DataHub()
        try:
            for ticker in self.tickers:
                client = YahooClient(ticker)
                data = client.get_all_yf_data()
                for model_name, raw_data in data.items():
                    if raw_data is None or not isinstance(raw_data, (pd.DataFrame, pd.Series)) or raw_data.empty:
                        msg = f"Keine validen Daten für {model_name} bei {client.ticker} (None oder kein Pandas Objekt oder Leeres Objekt)"
                        print(msg)            # for the console
                        logging.warning(msg)
                        continue
                    clean_df = FinanceTransformer.smart_transform(raw_data, ticker=client.ticker, affiliation=model_name)
                    hub.insert_financials(clean_df)
        finally:
            # Release the DuckDB file handle no matter how the loop ended.
            hub.close()

In [11]:
# Read the ticker symbols from the column labelled "0" of the CSV file and
# ingest all of them.
tickers = pd.read_csv("C:\\Diversification\\data\\tickers.csv")["0"].to_list()
YahooIngestor(tickers=tickers).ingest()

DuckDB verbunden: c:\Diversification\data\01_raw\yahoo\yahoo_finance.db
Fehler beim Abrufen von dividends für PRTH: 'NoneType' object is not subscriptable


  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():
  if (arr.astype(int) == arr).all():


In [23]:
# Load the stored rows for inspection and release the database connection
# afterwards, so no open handle on the DuckDB file is left behind.
hub = DataHub()
try:
    df = hub.preview_data()
finally:
    hub.close()


DuckDB verbunden: c:\Diversification\data\01_raw\yahoo\yahoo_finance.db


In [24]:
# Average number of stored rows per ticker.
df['ticker'].value_counts().mean()

909.6916436464088