In [33]:
# 1. Installationen
# ----------------------------------------------------------------------
!pip install numpy pandas torch
!pip install gymnasium stable-baselines3 wandb



In [34]:
# 2. Standard- & ML-Bibliotheken (Alle Imports)
# ----------------------------------------------------------------------
import os
import uuid # Für eindeutige Run-Namen
from functools import partial # Für parallele Umgebungen
from typing import List, Dict, Optional, Callable

# Daten & ML
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.distributions import Dirichlet

# Gymnasium (OpenAI Gym Ersatz)
import gymnasium as gym
from gymnasium import spaces

# Stable Baselines 3 (SB3)
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.distributions import Distribution
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList

# Weights & Biases (W&B)
import wandb
from wandb.integration.sb3 import WandbCallback

# Google Colab (falls Umgebung 'colab' ist)
try:
    from google.colab import drive, userdata
except ImportError:
    print("Nicht in Colab-Umgebung. Überspringe Colab-Imports.")

In [35]:
import uuid # Für eindeutige Run-Namen

# 3. KONFIGURATION, PORTABILITÄT & SETUP
# ----------------------------------------------------------------------

# --- 1. Umgebungs-Erkennung & Secrets ---
# Optionen: "colab", "kaggle" oder "htw"
ENV = "colab"
WANDB_API_KEY = ""
BASE_DIR = ""

if ENV == "colab":
    try:
        drive.mount('/content/drive')
        WANDB_API_KEY = userdata.get('WANDB_API_KEY')
        BASE_DIR = '/content/drive/MyDrive/data/PPO_portfolio_optimization'
        print("Umgebung: Google Colab. Drive gemountet.")
    except Exception as e:
        print(f"Colab-Fehler: {e}")

elif ENV == "kaggle":
    # (Muss angepasst werden, falls Kaggle-Secrets verwendet werden)
    # from kaggle_secrets import UserSecretsClient
    # WANDB_API_KEY = UserSecretsClient().get_secret("WANDB_API_KEY")
    BASE_DIR = '/kaggle/input/ppo-data' # (Beispielpfad)
    print("Umgebung: Kaggle.")

elif ENV == "htw":
    WANDB_API_KEY = os.environ.get('WANDB_API_KEY')
    BASE_DIR = '/home/christoph/data/ppo_portfolio' # (Beispielpfad)
    print("Umgebung: HTW Server.")

# W&B Login
if WANDB_API_KEY:
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY
    wandb.login(key=WANDB_API_KEY)
else:
    print("WARNUNG: WANDB_API_KEY nicht gefunden. W&B wird evtl. interaktiv nach Login fragen.")

# --- 2. Dynamischer Notebook-Name für W&B ---
NOTEBOOK_NAME = "PPO_Training_V3.ipynb" # Fallback
os.environ['WANDB_NOTEBOOK_NAME'] = NOTEBOOK_NAME
print(f"Notebook-Name für W&B gesetzt: {NOTEBOOK_NAME}")

# --- 3. Zentrale Konfiguration (Config-Objekt) ---
config = {
    "project_name": "PPO_Portfolio_SP500",
    "run_name": f"PPO_LSTM_{uuid.uuid4().hex[:8]}",
    "use_wandb": True,
    "save_model": True,
    "env_id": ENV,

    "feature_tensor_path": '/content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/processed_data/features_cleaned.csv',
    "toy_data_csv_path": None, # os.path.join(BASE_DIR, 'DS4_Risk_Choice.csv')
    "model_save_dir": os.path.join(BASE_DIR, 'models'),

    "train_start_date": '2005-01-01', # Start nach 20 Tagen Rolling Window
    "train_end_date": '2019-12-31',
    "eval_start_date": '2020-01-01',
    "eval_end_date": '2024-12-31',

    "initial_balance": 10000.0,
    "window_size": 30,
    "transaction_cost_pct": 0.001,

    "total_timesteps": 6_000_000,
    "num_cpu_cores": 1,

    "n_steps": 4096,
    "batch_size": 1024,
    "n_epochs": 8,
    "learning_rate": 0.0001,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.1,
    "vf_coef": 0.5,
    "ent_coef": 0.0,

    "extractor_type": "LSTM",
    "extractor_hidden_size": 128,
    "policy_pi_arch": [64],
    "policy_vf_arch": [64]
}

print(f"--- Konfiguration geladen für: {ENV} ---")
print(f"Datenquelle: {config['feature_tensor_path']}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Umgebung: Google Colab. Drive gemountet.
Notebook-Name für W&B gesetzt: PPO_Training_V3.ipynb
--- Konfiguration geladen für: colab ---
Datenquelle: /content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/processed_data/features_cleaned.csv


In [36]:
class PortfolioEnv(gym.Env):
    """
    [KORRIGIERTE VERSION v3]
    - Behebt den Absturz in _get_obs (unstack vs. stack).
    - Stellt sicher, dass 'market_data' als Key verwendet wird.
    - Stellt sicher, dass der ActionSpace (0,1) zu Zelle 6 passt.
    """
    metadata = {'render_modes': ['human'], 'render_fps': 30}

    def __init__(self, data_df: pd.DataFrame, **kwargs):
        super(PortfolioEnv, self).__init__()

        self.data = data_df
        # Preis-Ratios extrahieren (exp(LogReturns))
        try:
            self.price_ratios = np.exp(data_df.loc[:, 'LogReturns'].values)
        except KeyError:
            print("FATALER FEHLER: Spalte 'LogReturns' nicht im MultiIndex gefunden.")
            raise

        # Konfiguration aus den kwargs (aus Zelle 7) extrahieren
        self.initial_capital = kwargs.get("initial_capital", 100000.0)
        self.max_drawdown = kwargs.get("max_drawdown", 0.5)
        self.transaction_cost_pct = kwargs.get("transaction_cost_pct", 0.001)
        # Name 'window_size' muss zur config in Zelle 3 und 7 passen
        self.lookback_window = kwargs.get("window_size", 10)

        # Metriken & State
        self.current_portfolio_value = self.initial_capital
        self.current_step = self.lookback_window
        self.max_steps = len(data_df) - 1

        # Dimensionen
        self.num_assets = self.price_ratios.shape[1]
        self.num_features = self.data.columns.get_level_values(0).nunique()
        self.N_ASSETS_PLUS_CASH = self.num_assets + 1

        # Action Space: Box(0, 1)
        # Die CustomDirichletDistribution (Zelle 6) gibt fertige Gewichte aus (sum=1).
        self.action_space = spaces.Box(low=0.0, high=1.0,
                                       shape=(self.N_ASSETS_PLUS_CASH,),
                                       dtype=np.float32)

        # Observation Space: Dict
        # Die Keys ('market_data', 'portfolio_weights') müssen exakt zu
        # Zelle 5 (CustomCombinedExtractor) passen.
        self.observation_space = spaces.Dict({
            'market_data': spaces.Box(low=-np.inf, high=np.inf,
                                   shape=(self.lookback_window, self.num_assets, self.num_features),
                                   dtype=np.float32),
            'portfolio_weights': spaces.Box(low=0.0, high=1.0,
                                            shape=(self.N_ASSETS_PLUS_CASH,),
                                            dtype=np.float32)
        })

        # Initialgewichtung (Cash-Position [0] + Assets [1:])
        self.current_weights = np.array([1.0 / self.N_ASSETS_PLUS_CASH] * self.N_ASSETS_PLUS_CASH, dtype=np.float32)

    def _get_obs(self):
        start = self.current_step - self.lookback_window
        end = self.current_step

        # Shape (L, F*A)
        feature_history_df = self.data.iloc[start:end]

        # [KORREKTUR]
        # Um (L, F*A) -> (L, A, F) zu bekommen, nutzen wir stack(level=1).
        # unstack(level=0) war die Ursache für den Absturz der Worker.
        try:
            # 1. stack(level=1): Ticker (Level 1) nach innen -> (L * A, F)
            # 2. .values: NumPy Array holen
            # 3. .reshape: In die Zielform bringen (L, A, F)
            feature_history = feature_history_df.stack(level=1).values.reshape(
                self.lookback_window, self.num_assets, self.num_features
            )
        except ValueError as e:
            print(f"FATALER FEHLER im Reshape von _get_obs (Worker stürzt ab):")
            print(f"  DataFrame Shape: {feature_history_df.shape}")
            print(f"  Erwartetes Array Shape: ({self.lookback_window}, {self.num_assets}, {self.num_features})")
            print(f"  Fehlermeldung: {e}")
            raise e

        # Keys müssen zu Zelle 5 (Extractor) passen
        return {
            'market_data': feature_history.astype(np.float32),
            'portfolio_weights': self.current_weights.astype(np.float32)
        }

    def step(self, action: np.ndarray):

        # 'action' sind die gesampelten Gewichte [w0, ..., wN]
        # von der CustomDirichletDistribution (Zelle 6).
        # Es ist keine Softplus-Logik mehr in der Env nötig.
        new_weights = action

        # 2. Transaktionskosten
        delta_weights = np.abs(new_weights - self.current_weights)
        turnover = np.sum(delta_weights[1:]) # Nur Assets (Index 1 bis N), nicht Cash (Index 0)
        cost = turnover * self.transaction_cost_pct

        # 3. Portfolio-Wertentwicklung
        asset_ratios = self.price_ratios[self.current_step]
        # Cash-Ratio (Index 0) ist 1.0, Asset-Ratios (Index 1..N)
        price_ratios_with_cash = np.insert(asset_ratios, 0, 1.0)

        portfolio_return = np.sum(new_weights * price_ratios_with_cash) - cost
        self.current_portfolio_value *= portfolio_return

        # 4. Reward (Log-Return) - Abgesichert gegen log(0)
        safe_portfolio_return = max(portfolio_return, 1e-9)
        reward = np.log(safe_portfolio_return)

        # 5. Aktualisieren und Termination
        self.current_weights = new_weights
        self.current_step += 1

        terminated = (self.current_step >= self.max_steps) or \
                     (self.current_portfolio_value < self.initial_capital * (1 - self.max_drawdown))

        truncated = False

        info = {
            'portfolio_value': self.current_portfolio_value,
            'portfolio_return': portfolio_return,
            'reward': reward,
            'weights': self.current_weights
        }

        # Strafe bei Drawdown-Verletzung
        if terminated and self.current_portfolio_value < self.initial_capital * (1 - self.max_drawdown):
            reward = -10.0

        return self._get_obs(), reward, terminated, truncated, info

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_step = self.lookback_window
        self.current_portfolio_value = self.initial_capital
        self.current_weights = np.array([1.0 / self.N_ASSETS_PLUS_CASH] * self.N_ASSETS_PLUS_CASH, dtype=np.float32)
        return self._get_obs(), {}

    def render(self): pass
    def close(self): pass

In [37]:
# 5. Angepasster Feature Extractor (Dynamische Architektur)
# ----------------------------------------------------------------------
class CustomCombinedExtractor(BaseFeaturesExtractor):
    def __init__(
        self,
        observation_space: spaces.Dict,

        # Diese Argumente kommen direkt aus policy_kwargs
        extractor_type: str = "LSTM",
        hidden_size: int = 64
    ):

        # --- Dimensionen bestimmen ---
        market_space = observation_space["market_data"]
        weights_space = observation_space["portfolio_weights"]
        window_size, num_assets, num_features = market_space.shape

        self.lstm_input_size = num_assets * num_features
        self.lstm_hidden_size = hidden_size

        features_dim = self.lstm_hidden_size + weights_space.shape[0]

        # super().__init__ MUSS hier aufgerufen werden
        super().__init__(observation_space, features_dim)

        # --- Netzwerk definieren (basierend auf den direkten Argumenten) ---

        # --- KORREKTUR v4 ---
        # .toUpperCase() (JavaScript) zu .upper() (Python) geändert

        if extractor_type.upper() == "LSTM":
            self.rnn = nn.LSTM(
                input_size=self.lstm_input_size,
                hidden_size=self.lstm_hidden_size,
                batch_first=True
            )
        elif extractor_type.upper() == "GRU":
            self.rnn = nn.GRU(
                input_size=self.lstm_input_size,
                hidden_size=self.lstm_hidden_size,
                batch_first=True
            )
        # --- ENDE KORREKTUR v4 ---
        else:
            raise ValueError(f"Unbekannter extractor_type: {extractor_type}")

        print(f"CustomCombinedExtractor (Typ: {extractor_type}, Hidden: {hidden_size}) initialisiert.")


    def forward(self, observations: dict) -> torch.Tensor:
        market_data = observations["market_data"]
        portfolio_weights = observations["portfolio_weights"]
        batch_size, window_size = market_data.shape[0], market_data.shape[1]

        flat_market_data = market_data.reshape(batch_size, window_size, -1)

        rnn_out, hidden = self.rnn(flat_market_data)

        if isinstance(hidden, tuple): # LSTM
            last_hidden_state = hidden[0][-1]
        else: # GRU
            last_hidden_state = hidden[-1]

        combined_features = torch.cat([last_hidden_state, portfolio_weights], dim=1)
        return combined_features

print("--- CustomCombinedExtractor Klasse (KORRIGIERT v4) definiert ---")

--- CustomCombinedExtractor Klasse (KORRIGIERT v4) definiert ---


In [38]:
# 6. Policy-Klassen & Callbacks (Aus V2 übernommen)
# ----------------------------------------------------------------------

class CustomDirichletDistribution(Distribution):
    # (Kompletter Code aus deiner V2-Datei)
    def __init__(self, action_dim: int):
        super().__init__()
        self.action_dim = action_dim
    def proba_distribution_net(self, latent_dim: int) -> nn.Module:
        action_net = nn.Linear(latent_dim, self.action_dim)
        return action_net
    def proba_distribution(self, action_net_output: torch.Tensor) -> "CustomDirichletDistribution":
        alphas = torch.nn.functional.softplus(action_net_output) + 1.0
        self.distribution = Dirichlet(alphas)
        return self
    def log_prob(self, actions: torch.Tensor) -> torch.Tensor:
        actions_clipped = torch.clamp(actions, 1e-6, 1.0 - 1e-6)
        actions_normalized = actions_clipped / torch.sum(actions_clipped, dim=-1, keepdim=True)
        return self.distribution.log_prob(actions_normalized)
    def entropy(self) -> torch.Tensor:
        return self.distribution.entropy()
    def sample(self) -> torch.Tensor:
        return self.distribution.rsample()
    def mode(self) -> torch.Tensor:
        return self.distribution.mean
    def actions_from_params(self, action_net_output: torch.Tensor, deterministic: bool = False) -> torch.Tensor:
        self.proba_distribution(action_net_output)
        if deterministic: return self.mode()
        return self.sample()
    def log_prob_from_params(self, action_net_output: torch.Tensor) -> (torch.Tensor, torch.Tensor):
        self.proba_distribution(action_net_output)
        actions = self.sample()
        log_prob = self.log_prob(actions)
        return actions, log_prob

class CustomActorCriticPolicy(ActorCriticPolicy):
    # (Kompletter Code aus deiner V2-Datei)
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    def _get_action_dist_from_space(self, action_space: spaces.Box) -> CustomDirichletDistribution:
        if not isinstance(action_space, spaces.Box):
            raise ValueError("DirichletPolicy unterstützt nur Box Action Space.")
        action_dim = action_space.shape[0]
        return CustomDirichletDistribution(action_dim)

class AvgStepRewardLogger(BaseCallback):
    # (Kompletter Code aus deiner V2-Datei)
    def __init__(self, verbose=0):
        super(AvgStepRewardLogger, self).__init__(verbose)
        self.step_rewards = []
    def _on_step(self) -> bool:
        self.step_rewards.extend(self.locals['rewards'])
        return True
    def _on_rollout_end(self) -> None:
        if self.step_rewards:
            avg_step_reward = np.mean(self.step_rewards)
            self.logger.record("reward/step_reward_avg", avg_step_reward)
            self.step_rewards = []

print("--- Alle Klassen (Env, Extractor, Policy, Callbacks) definiert ---")

--- Alle Klassen (Env, Extractor, Policy, Callbacks) definiert ---


In [41]:
# %% [code]
#
# 7. HAUPT-TRAININGSFUNKTION & START
# (ANGEPASST AN NEUE CONFIG-KEYS)
# ----------------------------------------------------------------------

# --- Helfer-Funktion für SubprocVecEnv (Parallelisierung) ---
# Diese Funktion MUSS auf der obersten Ebene (global) sein.
def create_env(data_config: dict, env_config: dict, start_date: str, end_date: str) -> gym.Env:
    """Erstellt eine Instanz der Umgebung und wickelt sie in Monitor ein."""
    env = PortfolioEnv(
        data_config=data_config,
        env_config=env_config,
        start_date=start_date,
        end_date=end_date
    )
    env = Monitor(env) # Monitor ist wichtig für SB3, um Belohnungen zu loggen
    return env

# --- Haupt-Trainingsfunktion (für W&B Sweep) ---
def train(config):
    """
    Haupt-Trainingsfunktion.
    """

    # --- 1. W&B Run initialisieren ---
    run = wandb.init(
        project=config["project_name"],
        config=config,
        name=config["run_name"],
        sync_tensorboard=True,
        monitor_gym=True,
        save_code=True,
        reinit=True
    )

    # (Aktualisiere die Config mit der von W&B, falls im Sweep)
    config = wandb.config

    # --- 2. Umgebungskonfigurationen vorbereiten ---

    # [NOTWENDIGE KORREKTUR]
    # Wir müssen den Key-Namen verwenden, den wir in Zelle 3 definiert haben
    data_cfg = {
        "feature_csv_path": config.feature_csv_path, # (War vorher 'feature_tensor_path')
        "toy_data_csv_path": config.toy_data_csv_path
    }
    # [ENDE DER KORREKTUR]

    env_cfg = {
        "initial_balance": config.initial_balance,
        "window_size": config.window_size,
        "transaction_cost_pct": config.transaction_cost_pct
    }

    # --- 3. Erstelle TRAIN-Umgebung (Parallel) ---
    print(f"Erstelle {config.num_cpu_cores} parallele TRAIN-Umgebungen...")
    train_env_partial = partial(
        create_env,
        data_config=data_cfg,
        env_config=env_cfg,
        start_date=config.train_start_date,
        end_date=config.train_end_date
    )

    train_env = SubprocVecEnv([train_env_partial for _ in range(config.num_cpu_cores)])

    # --- 4. Erstelle EVAL-Umgebung (Out-of-Sample) ---
    print("Erstelle EVAL-Umgebung (Out-of-Sample)...")
    eval_env_partial = partial(
        create_env,
        data_config=data_cfg,
        env_config=env_cfg,
        start_date=config.eval_start_date,
        end_date=config.eval_end_date
    )
    eval_env = DummyVecEnv([eval_env_partial])

    # --- 5. Policy-Architektur definieren ---
    policy_kwargs = dict(
        features_extractor_class=CustomCombinedExtractor,
        features_extractor_kwargs=dict(
             extractor_type=config.extractor_type,
            hidden_size=config.extractor_hidden_size
        ),
        net_arch=dict(
            pi=config.policy_pi_arch,
            vf=config.policy_vf_arch
        )
    )

    # --- 6. Callbacks definieren ---
    callback_list = []
    if config.use_wandb:
        wandb_callback = WandbCallback(model_save_path=None, verbose=0)
        callback_list.append(wandb_callback)

    callback_list.append(AvgStepRewardLogger())

    if config.save_model:
        save_path = os.path.join(config.model_save_dir, run.id)
        os.makedirs(save_path, exist_ok=True)
        print(f"Modell-Speicherung: {save_path}")
        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path=save_path,
            log_path=save_path,
            eval_freq=max(config.n_steps // config.num_cpu_cores, 500),
            n_eval_episodes=1,
            deterministic=True,
            render=False,
            verbose=1
        )
        callback_list.append(eval_callback)

    # --- 7. Modell initialisieren (PPO) ---
    model = PPO(
        policy=CustomActorCriticPolicy, # (Aus Block 6)
        env=train_env,
        n_steps=config.n_steps,
        batch_size=config.batch_size,
        n_epochs=config.n_epochs,
        learning_rate=config.learning_rate,
        gamma=config.gamma,
        gae_lambda=config.gae_lambda,
        clip_range=config.clip_range,
        vf_coef=config.vf_coef,
        ent_coef=config.ent_coef,
        policy_kwargs=policy_kwargs,
        tensorboard_log=f"runs/{run.id}",
        device="cuda" if torch.cuda.is_available() else "cpu",
        verbose=0
    )

    print(f"--- PPO-Modell initialisiert (Gerät: {model.device}) ---")

    # --- 8. Training starten ---
    print(f"--- Starte Training für {config.total_timesteps} Timesteps ---")
    try:
        model.learn(
            total_timesteps=config.total_timesteps,
            callback=CallbackList(callback_list),
            progress_bar=True
        )
        print("--- Training abgeschlossen ---")
    except Exception as e:
        print(f"FEHLER während des Trainings: {e}")
    finally:
        # --- 9. Aufräumen ---
        train_env.close()
        eval_env.close()
        run.finish()
        print("--- Run beendet und Umgebungen geschlossen ---")

# ----------------------------------------------------------------------
# STARTPUNKT (WICHTIG für SubprocVecEnv)
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Hole die 'config', die wir in Block 3 global definiert haben
    global_config = globals().get('config')
    if global_config:
        train(global_config)
    else:
        print("FEHLER: Globale 'config' wurde nicht gefunden.")

AttributeError: <class 'wandb.sdk.wandb_config.Config'> object has no attribute 'feature_csv_path'