In [None]:
# 1. Installationen
# ----------------------------------------------------------------------
!pip install numpy pandas torch
!pip install gymnasium stable-baselines3 wandb

In [None]:
# 2. Standard- & ML-Bibliotheken (Alle Imports)
# ----------------------------------------------------------------------
import os
import uuid # Für eindeutige Run-Namen
from functools import partial # Für parallele Umgebungen
from typing import List, Dict, Optional, Callable

# Daten & ML
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.distributions import Dirichlet

# Gymnasium (OpenAI Gym Ersatz)
import gymnasium as gym
from gymnasium import spaces

# Stable Baselines 3 (SB3)
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.distributions import Distribution
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList

# Weights & Biases (W&B)
import wandb
from wandb.integration.sb3 import WandbCallback

# Google Colab (falls Umgebung 'colab' ist)
try:
    from google.colab import drive, userdata
except ImportError:
    print("Nicht in Colab-Umgebung. Überspringe Colab-Imports.")

In [36]:
# %% [code]
#
# 3. KONFIGURATION, PORTABILITÄT & SETUP
# (ANGEPASST MIT WANDB-LOGGING-PFAD)
# ----------------------------------------------------------------------

# --- 1. Environment selection & secrets ---
# NOTE: ENV is a hard-coded switch, not an automatic detection; change it by
# hand when running outside Colab.
ENV = "colab"
WANDB_API_KEY = ""
BASE_DIR = ""

if ENV == "colab":
    # Any failure in here (including `drive`/`userdata` being undefined because
    # the optional Colab import did not succeed) is only printed; BASE_DIR and
    # WANDB_API_KEY then keep their empty defaults and all paths below become
    # relative to the current working directory.
    try:
        drive.mount('/content/drive')
        WANDB_API_KEY = userdata.get('WANDB_API_KEY')
        BASE_DIR = '/content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization'
        print("Umgebung: Google Colab. Drive gemountet.")
    except Exception as e:
        print(f"Colab-Fehler: {e}")
else:
    print(f"Umgebung: {ENV} (Lokale Pfade evtl. anpassen)")

# W&B login (skipped with a warning when no key is available)
if WANDB_API_KEY:
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY
    wandb.login(key=WANDB_API_KEY)
else:
    print("WARNUNG: WANDB_API_KEY nicht gefunden.")

# --- 2. Notebook name reported to W&B ---
# The name is a fixed string exported through the WANDB_NOTEBOOK_NAME
# environment variable.
NOTEBOOK_NAME = "PPO_Training_V3_CSV_Input.ipynb"
os.environ['WANDB_NOTEBOOK_NAME'] = NOTEBOOK_NAME
print(f"Notebook-Name für W&B gesetzt: {NOTEBOOK_NAME}")

# --- 3. Central configuration (single dict passed to train()) ---
# Groups, in order: run identity / switches, file locations, train and eval
# date ranges, environment parameters, training budget, PPO hyperparameters,
# network architecture.
# NOTE(review): "use_wandb" is not read by the training code visible in this
# notebook — confirm whether it is still needed.
config = {
    "project_name": "PPO_Portfolio_SP500",
    "run_name": f"PPO_LSTM_CSV_{uuid.uuid4().hex[:8]}",
    "use_wandb": True,
    "save_model": True,
    "env_id": ENV,

    "feature_csv_path": os.path.join(BASE_DIR, 'processed_data', 'features_cleaned.csv'),
    "toy_data_csv_path": None,
    "model_save_dir": os.path.join(BASE_DIR, 'models'),

    # [NEW] W&B log directory below BASE_DIR (on Google Drive when in Colab)
    "wandb_log_dir": os.path.join(BASE_DIR, 'data', 'wandb_logs'),

    "train_start_date": '2005-01-01',
    "train_end_date": '2019-12-31',
    "eval_start_date": '2020-01-01',
    "eval_end_date": '2024-12-31',

    "initial_balance": 10000.0,
    "window_size": 30,
    "transaction_cost_pct": 0.001,

    "total_timesteps": 2_000_000,
    "num_cpu_cores": 2,

    "n_steps": 4096,
    "batch_size": 1024,
    "n_epochs": 8,
    "learning_rate": 0.0001,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.1,
    "vf_coef": 0.5,
    "ent_coef": 0.0,

    "extractor_type": "LSTM",
    "extractor_hidden_size": 128,
    "policy_pi_arch": [64],
    "policy_vf_arch": [64]
}

# Create the output directories up front so later saves cannot fail on a
# missing folder.
os.makedirs(config['model_save_dir'], exist_ok=True)
os.makedirs(config['wandb_log_dir'], exist_ok=True) # [NEW] create the W&B log directory

print(f"--- Konfiguration geladen für: {ENV} ---")
print(f"Datenquelle (CSV): {config['feature_csv_path']}")
print(f"W&B Logs werden gespeichert in: {config['wandb_log_dir']}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Umgebung: Google Colab. Drive gemountet.
Notebook-Name für W&B gesetzt: PPO_Training_V3_CSV_Input.ipynb
--- Konfiguration geladen für: colab ---
Datenquelle (CSV): /content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/processed_data/features_cleaned.csv
W&B Logs werden gespeichert in: /content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/data/wandb_logs


In [37]:
# %% [code]
#
# 4. Angepasste Environment Class (FIXED: V5 Logic in V6 Architecture)
# ----------------------------------------------------------------------
class PortfolioEnv(gym.Env):
    """Portfolio-allocation environment driven by a pre-computed feature CSV.

    Observation (Dict space):
        "market_data"       -- float32 array (window_size, num_assets, num_features)
                               holding the rows strictly before the current step.
        "portfolio_weights" -- float32 array (num_assets + 1,), index 0 is cash.

    Action:
        Box(-20, 20) vector of length num_assets + 1. It is interpreted as
        unnormalised scores: softplus makes them positive, then they are
        normalised to sum to 1 and used as target weights.

    Reward:
        Log return of the portfolio value over the step (after costs).

    Episode end:
        ``terminated`` is set when the portfolio value collapses to the
        bankruptcy floor; ``truncated`` is set when the configured date range
        is exhausted.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(
        self,
        data_config: dict,
        env_config: dict,
        start_date: str,
        end_date: str
    ):
        """Load the data, resolve the date range and build the spaces.

        Args:
            data_config: must contain 'feature_csv_path'.
            env_config: must contain 'window_size', 'initial_balance' and
                'transaction_cost_pct'.
            start_date: first date of the episode range (parsed by pandas).
            end_date: last date of the episode range (parsed by pandas).

        Raises:
            FileNotFoundError: if the feature CSV does not exist.
            ValueError: if the date range leaves no step after the warm-up window.
        """
        super().__init__()
        self.data_config = data_config
        self.env_config = env_config
        self.start_date_str = start_date
        self.end_date_str = end_date

        self.window_size = self.env_config['window_size']
        self.initial_balance = self.env_config['initial_balance']
        self.transaction_cost_pct = self.env_config['transaction_cost_pct']

        self.done = False
        self.current_step = 0

        # Loading sets num_assets / num_features, which the spaces below need.
        self._load_data()
        self._set_time_slices()

        self.num_portfolio_components = self.num_assets + 1
        self.portfolio_value = 0.0
        self.prev_portfolio_value = 0.0

        # Initial allocation: 100% cash (index 0).
        self.portfolio_weights = np.zeros(self.num_portfolio_components, dtype=np.float32)
        self.portfolio_weights[0] = 1.0

        # Wide bounds on purpose: actions are normalised inside step().
        self.action_space = spaces.Box(
            low=-20.0, high=20.0, shape=(self.num_portfolio_components,), dtype=np.float32
        )

        self.observation_space = spaces.Dict(
            {
                "market_data": spaces.Box(
                    low=-np.inf,
                    high=np.inf,
                    shape=(
                        self.window_size,
                        self.num_assets,
                        self.num_features
                    ),
                    dtype=np.float32,
                ),
                "portfolio_weights": spaces.Box(
                    low=0.0,
                    high=1.0,
                    shape=(self.num_portfolio_components,),
                    dtype=np.float32,
                ),
            }
        )

    def _load_data(self):
        """Read the feature CSV and reshape it to (time, asset, feature).

        The CSV is read with a two-level column header; level 0 is counted as
        features and level 1 as assets.

        Raises:
            FileNotFoundError: if 'feature_csv_path' is missing or not on disk.
        """
        csv_path = self.data_config.get('feature_csv_path')
        if not csv_path or not os.path.exists(csv_path):
            raise FileNotFoundError(f"CSV nicht gefunden: {csv_path}")

        print(f"(Worker) Lade Daten: {csv_path}")
        df = pd.read_csv(csv_path, header=[0, 1], index_col=0, parse_dates=True)
        if not df.index.is_monotonic_increasing:
            df.sort_index(inplace=True)

        self.full_data_index = df.index
        self.num_features = df.columns.get_level_values(0).nunique()
        self.num_assets = df.columns.get_level_values(1).nunique()

        total_steps = len(df)
        # Moving the asset level into the rows yields one row per (date, asset);
        # the reshape then regroups those rows by date.
        self._market_data_numpy = df.stack(level=1, future_stack=True).values.reshape(
            total_steps, self.num_assets, self.num_features
        ).astype(np.float32)

        print(f"(Worker) Daten geladen: (T={total_steps}, A={self.num_assets}, F={self.num_features})")

    def _set_time_slices(self):
        """Translate the start/end dates into row indices of the loaded data.

        ``start_tick`` is pushed forward by ``window_size`` so that the first
        observation window lies inside the requested range.

        Raises:
            ValueError: if no row is left between start_tick and end_tick.
        """
        start_ts = pd.to_datetime(self.start_date_str)
        end_ts = pd.to_datetime(self.end_date_str)
        start_idx_loc = self.full_data_index.searchsorted(start_ts, side='left')
        end_idx_loc = self.full_data_index.searchsorted(end_ts, side='right') - 1
        self.start_tick = start_idx_loc + self.window_size
        self.end_tick = end_idx_loc

        # Fail early with a clear message instead of an IndexError (or a step
        # past end_date) on the first call to step().
        if self.start_tick > self.end_tick:
            raise ValueError(
                f"Zeitraum {self.start_date_str} bis {self.end_date_str} enthält nach Abzug "
                f"des Fensters (window_size={self.window_size}) keine Daten."
            )

    def _get_observation(self) -> Dict[str, np.ndarray]:
        """Return the window of rows before ``current_step`` plus the current weights."""
        end_idx = self.current_step
        start_idx = end_idx - self.window_size
        market_data = self._market_data_numpy[start_idx:end_idx]
        return {
            "market_data": market_data.astype(np.float32),
            "portfolio_weights": self.portfolio_weights.astype(np.float32),
        }

    def _calculate_reward(self) -> float:
        """Log return between the previous and the current portfolio value.

        Both values are floored at 1e-9 before the division; a non-finite
        result is replaced by a fixed penalty of -10.0.
        """
        safe_prev = max(self.prev_portfolio_value, 1e-9)
        safe_curr = max(self.portfolio_value, 1e-9)

        reward = np.log(safe_curr / safe_prev)

        if not np.isfinite(reward):
            return -10.0
        return float(reward)

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> tuple[dict, dict]:
        """Start a new episode at ``start_tick`` with the full balance in cash.

        Returns:
            (observation, info) where info is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = self.start_tick
        self.portfolio_value = self.initial_balance
        self.prev_portfolio_value = self.initial_balance

        self.portfolio_weights = np.zeros(self.num_portfolio_components, dtype=np.float32)
        self.portfolio_weights[0] = 1.0  # 100% cash

        self.done = False
        return self._get_observation(), {}

    def step(self, action: np.ndarray) -> tuple[dict, float, bool, bool, dict]:
        """Rebalance to the weights implied by ``action`` and apply one market step.

        Returns:
            (observation, reward, terminated, truncated, info).
            ``terminated`` is True only on bankruptcy; ``truncated`` is True
            when the date range is exhausted without bankruptcy.
        """
        self.prev_portfolio_value = self.portfolio_value

        # 1. Action -> weights. softplus = log(1 + exp(x)) keeps every entry
        #    positive; clipping first prevents overflow in exp.
        action_clipped = np.clip(action, -20, 20)
        alpha = np.log1p(np.exp(action_clipped)) + 1e-6
        target_weights = alpha / np.sum(alpha)

        old_actual_weights = self.portfolio_weights

        # 2. Transaction costs, charged on the change in asset weights only
        #    (the cash entry at index 0 is excluded from turnover).
        turnover = np.sum(np.abs(target_weights[1:] - old_actual_weights[1:]))
        costs = turnover * self.prev_portfolio_value * self.transaction_cost_pct
        portfolio_value_after_costs = self.prev_portfolio_value - costs

        # Keep the value strictly positive before the market update.
        portfolio_value_after_costs = max(portfolio_value_after_costs, 1e-9)

        self.portfolio_weights = target_weights

        # 3. Market update. Feature 0 is assumed to hold per-asset log returns.
        current_market_data = self._market_data_numpy[self.current_step]
        current_log_returns = current_market_data[:, 0]

        # exp(+/-2.3) is about 10 and 0.1, so this bounds the simple return of
        # a single step to roughly -90% ... +900%.
        clipped_log_returns = np.clip(current_log_returns, -2.3, 2.3)
        simple_returns = np.expm1(clipped_log_returns)

        # Cash (index 0) earns 0%; assets earn their simple return.
        returns_vector = np.concatenate(([0.0], simple_returns)).astype(np.float32)

        # 4. New portfolio value: value * sum(w_i * (1 + r_i)), weights sum to 1.
        self.portfolio_value = portfolio_value_after_costs * np.sum(
            self.portfolio_weights * (1 + returns_vector)
        )

        # 5. Bankruptcy is the only real terminal state of the task.
        terminated = False
        if self.portfolio_value <= 1e-6:
            self.portfolio_value = 1e-6
            terminated = True

        reward = self._calculate_reward()
        self.current_step += 1

        # Running out of data is a limit of the dataset, not of the task, so
        # it is reported as truncation rather than termination.
        truncated = (not terminated) and bool(self.current_step >= self.end_tick)
        self.done = terminated or truncated

        info = {
            'step': self.current_step,
            'portfolio_value': float(self.portfolio_value),
            'reward': float(reward),
            'transaction_costs': float(costs),
            'turnover': float(turnover),
            'is_bankrupt': terminated,
            'weight_cash': float(target_weights[0])
        }

        return self._get_observation(), reward, terminated, truncated, info

    def close(self):
        """Nothing to release; present for API completeness."""
        pass

print("--- PortfolioEnv Klasse (V6 Architecture + V5 Math Fix) definiert ---")

--- PortfolioEnv Klasse (V6 Architecture + V5 Math Fix) definiert ---


In [38]:
# 5. Angepasster Feature Extractor (Dynamische Architektur)
# ----------------------------------------------------------------------
class CustomCombinedExtractor(BaseFeaturesExtractor):
    """Recurrent encoder for the market window, concatenated with the weights.

    The "market_data" window is flattened per time step to
    ``num_assets * num_features`` values and fed through a single LSTM or GRU;
    the final hidden state is concatenated with "portfolio_weights".
    """

    def __init__(
        self,
        observation_space: spaces.Dict,

        # These two arrive via policy_kwargs["features_extractor_kwargs"].
        extractor_type: str = "LSTM",
        hidden_size: int = 64
    ):
        """Build the recurrent layer.

        Args:
            observation_space: Dict space with "market_data" of shape
                (window, assets, features) and "portfolio_weights".
            extractor_type: "LSTM" or "GRU" (case-insensitive).
            hidden_size: width of the recurrent hidden state.

        Raises:
            ValueError: for any other ``extractor_type``.
        """
        _, n_assets, n_feats = observation_space["market_data"].shape
        n_weights = observation_space["portfolio_weights"].shape[0]

        # The output width must be known before the base class is initialised.
        super().__init__(observation_space, hidden_size + n_weights)

        self.lstm_input_size = n_assets * n_feats
        self.lstm_hidden_size = hidden_size

        rnn_classes = {"LSTM": nn.LSTM, "GRU": nn.GRU}
        rnn_cls = rnn_classes.get(extractor_type.upper())
        if rnn_cls is None:
            raise ValueError(f"Unbekannter extractor_type: {extractor_type}")

        self.rnn = rnn_cls(
            input_size=self.lstm_input_size,
            hidden_size=self.lstm_hidden_size,
            batch_first=True
        )

        print(f"CustomCombinedExtractor (Typ: {extractor_type}, Hidden: {hidden_size}) initialisiert.")


    def forward(self, observations: dict) -> torch.Tensor:
        """Encode a batch of observations.

        Returns:
            Tensor of shape (batch, hidden_size + number of portfolio weights).
        """
        market = observations["market_data"]
        n_batch, n_steps = market.shape[0], market.shape[1]

        # Merge the asset and feature axes into one vector per time step.
        sequence = market.reshape(n_batch, n_steps, -1)
        _, state = self.rnn(sequence)

        # LSTM returns (h_n, c_n), GRU returns h_n only.
        h_n = state[0] if isinstance(state, tuple) else state

        return torch.cat([h_n[-1], observations["portfolio_weights"]], dim=1)

print("--- CustomCombinedExtractor Klasse (KORRIGIERT v4) definiert ---")

--- CustomCombinedExtractor Klasse (KORRIGIERT v4) definiert ---


In [39]:
# %% [code]
#
# 6. POLICY-KLASSEN & DER "UNIFIED CALLBACK"
# (FIXED: KeyError 'weight_0' -> 'weight_cash' behoben)
# ----------------------------------------------------------------------
import pandas as pd
import torch
import torch.nn as nn
from torch.distributions import Dirichlet
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.distributions import Distribution
from gymnasium import spaces
from typing import Callable

# --- Policy-Klassen (unverändert) ---

class CustomDirichletDistribution(Distribution):
    """Dirichlet action distribution over ``action_dim`` components.

    Concentrations are ``softplus(net_output) + 1``, so every alpha is > 1.
    """

    def __init__(self, action_dim: int):
        super().__init__()
        self.action_dim = action_dim

    def proba_distribution_net(self, latent_dim: int) -> nn.Module:
        """Linear head mapping the policy latent to one raw value per component."""
        return nn.Linear(latent_dim, self.action_dim)

    def proba_distribution(self, action_net_output: torch.Tensor) -> "CustomDirichletDistribution":
        """Set the internal Dirichlet from raw head outputs and return ``self``."""
        concentration = nn.functional.softplus(action_net_output) + 1.0
        self.distribution = Dirichlet(concentration)
        return self

    def log_prob(self, actions: torch.Tensor) -> torch.Tensor:
        """Log-density of ``actions`` after clamping them into the open simplex."""
        # Clamp away from 0/1 and renormalise before evaluating the density.
        eps = 1e-6
        bounded = actions.clamp(min=eps, max=1.0 - eps)
        on_simplex = bounded / bounded.sum(dim=-1, keepdim=True)
        return self.distribution.log_prob(on_simplex)

    def entropy(self) -> torch.Tensor:
        """Entropy of the current Dirichlet."""
        return self.distribution.entropy()

    def sample(self) -> torch.Tensor:
        """Reparameterised sample from the current Dirichlet."""
        return self.distribution.rsample()

    def mode(self) -> torch.Tensor:
        """Deterministic action; this returns the distribution mean."""
        return self.distribution.mean

    def actions_from_params(self, action_net_output: torch.Tensor, deterministic: bool = False) -> torch.Tensor:
        """Update the distribution from head outputs and draw an action."""
        self.proba_distribution(action_net_output)
        return self.mode() if deterministic else self.sample()

    def log_prob_from_params(self, action_net_output: torch.Tensor) -> (torch.Tensor, torch.Tensor):
        """Update the distribution, sample once, and return (actions, log_prob)."""
        self.proba_distribution(action_net_output)
        drawn = self.sample()
        return drawn, self.log_prob(drawn)

class CustomActorCriticPolicy(ActorCriticPolicy):
    """ActorCriticPolicy subclass that defines a Dirichlet distribution factory.

    NOTE(review): nothing in this notebook calls `_get_action_dist_from_space`.
    Confirm that the installed stable-baselines3 `ActorCriticPolicy` actually
    invokes a hook with this name; if it does not, the policy keeps the
    library's default distribution for Box action spaces and
    `CustomDirichletDistribution` is never used.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _get_action_dist_from_space(self, action_space: spaces.Box) -> CustomDirichletDistribution:
        """Return a Dirichlet distribution sized to a Box action space.

        Raises:
            ValueError: if ``action_space`` is not a ``spaces.Box``.
        """
        if isinstance(action_space, spaces.Box):
            return CustomDirichletDistribution(action_space.shape[0])
        raise ValueError("DirichletPolicy unterstützt nur Box Action Space.")


# --- [FINAL FIX] Unified Observability Callback ---
# Updates:
# 1. Silent Mode (keine Prints gegen Recursion)
# 2. Key-Fix: Nutzt 'weight_cash' statt 'weight_0'

class UnifiedObservabilityCallback(BaseCallback):
    """Collects per-step env infos, logs rollout aggregates and runs evaluations.

    At the end of every rollout the collected infos are aggregated into logger
    records. Once at least ``eval_freq`` callback steps have passed since the
    last evaluation, one deterministic episode is played on a private eval
    env, its trajectory is sent to W&B as a table, and the model is saved to
    ``log_path/best_model.zip`` when the episode return improves.
    """

    def __init__(self, eval_env_fn: Callable, eval_freq: int, log_path: str, verbose=0):
        """
        Args:
            eval_env_fn: zero-argument factory for the evaluation environment.
            eval_freq: minimum number of callback steps between evaluations;
                values <= 0 disable evaluation.
            log_path: existing directory in which the best model is stored.
            verbose: > 0 prints W&B table logging failures.
        """
        super().__init__(verbose)

        self.eval_env_fn = eval_env_fn
        self.eval_freq = eval_freq
        self.log_path = log_path
        self.best_mean_reward = -np.inf

        # Buffers, emptied after every rollout.
        self.rollout_step_infos = []
        self.rollout_episode_infos = []

        # n_calls value at the most recent evaluation.
        self._last_eval_call = 0
        # repr() of the last exception raised while logging the W&B table.
        self.last_table_error = None

    @staticmethod
    def _scalar_fields(info: dict) -> dict:
        """Keep only scalar entries of an info dict.

        Nested payloads (dicts, arrays) are dropped so that they neither end
        up as object columns in a DataFrame nor get sent to a W&B table.
        """
        scalar_types = (bool, int, float, str, np.generic)
        return {key: value for key, value in info.items() if isinstance(value, scalar_types)}

    def _init_callback(self):
        """Create the single-env evaluation VecEnv."""
        self.eval_env = DummyVecEnv([self.eval_env_fn])

    def _on_step(self) -> bool:
        """Buffer the infos of every env after each step; never stops training."""
        for info in self.locals.get('infos', []):
            if 'episode' in info:
                self.rollout_episode_infos.append(info['episode'])
            if "portfolio_value" in info:
                self.rollout_step_infos.append(self._scalar_fields(info))
        return True

    def _on_rollout_end(self) -> None:
        """Log rollout aggregates and evaluate if enough steps have passed."""
        self._log_rollout_aggregates()

        # Elapsed-steps check instead of `n_calls % eval_freq == 0`: the modulo
        # test only fires when eval_freq happens to divide the rollout length.
        if self.eval_freq > 0 and self.n_calls - self._last_eval_call >= self.eval_freq:
            self._last_eval_call = self.n_calls
            self._run_evaluation_and_log_table()

    def _log_rollout_aggregates(self):
        """Record means over the buffered infos (no printing), then clear the buffers."""
        # Episode metrics (return / length).
        if self.rollout_episode_infos:
            df_ep = pd.DataFrame(self.rollout_episode_infos)
            self.logger.record("rollout/ep_reward_mean", df_ep['r'].mean())
            self.logger.record("rollout/ep_len_mean", df_ep['l'].mean())

        # Step metrics (bankruptcy rate, turnover, cash weight).
        if self.rollout_step_infos:
            df_st = pd.DataFrame(self.rollout_step_infos)
            self.logger.record("live/bankrupt_rate", df_st['is_bankrupt'].mean())
            self.logger.record("live/avg_turnover", df_st['turnover'].mean())

            # The env reports 'weight_cash'; 'weight_0' is accepted as a fallback.
            if 'weight_cash' in df_st.columns:
                self.logger.record("live/avg_cash_weight", df_st['weight_cash'].mean())
            elif 'weight_0' in df_st.columns:
                self.logger.record("live/avg_cash_weight", df_st['weight_0'].mean())

            # Optional: mean of the per-step maximum asset weight, if any
            # other 'weight_*' columns are present.
            weight_cols = [c for c in df_st.columns if c.startswith('weight_') and c not in ['weight_cash', 'weight_0']]
            if weight_cols:
                max_w = df_st[weight_cols].max(axis=1).mean()
                self.logger.record("live/max_asset_weight", max_w)

        self.rollout_step_infos = []
        self.rollout_episode_infos = []

    def _run_evaluation_and_log_table(self):
        """Play one deterministic episode, log it and keep the best model."""
        trajectory_log = []
        obs = self.eval_env.reset()
        total_reward = 0.0

        while True:
            action, _ = self.model.predict(obs, deterministic=True)
            obs, rewards, dones, infos = self.eval_env.step(action)
            trajectory_log.append(self._scalar_fields(infos[0]))
            total_reward += float(rewards[0])
            if dones[0]:
                break

        # W&B table logging: one table per evaluation, best effort. A failure
        # must not abort training, but it is kept inspectable instead of being
        # dropped silently.
        try:
            table = wandb.Table(dataframe=pd.DataFrame(trajectory_log))
            wandb.log({f"eval/trajectory_step_{self.num_timesteps}": table}, step=self.num_timesteps)
        except Exception as exc:
            self.last_table_error = repr(exc)
            self.logger.record("eval/table_log_failed", 1)
            if self.verbose > 0:
                print(f"W&B table logging failed: {exc!r}")

        # Metrics of the single evaluation episode.
        self.logger.record("eval/mean_reward", total_reward)
        self.logger.record("eval/mean_ep_length", len(trajectory_log))

        # Save the model whenever the evaluation return improves.
        if total_reward > self.best_mean_reward:
            self.best_mean_reward = total_reward
            save_path = os.path.join(self.log_path, "best_model.zip")
            self.model.save(save_path)

    def _on_training_end(self):
        """Close the evaluation environment."""
        self.eval_env.close()

print("--- Alle Klassen (inkl. FIXED Callback V3) definiert ---")

--- Alle Klassen (inkl. FIXED Callback V3) definiert ---


In [None]:
# %% [code]
#
# 7. HAUPT-TRAININGSFUNKTION & START
# (OPTIMIERT: WandbCallback entfernt, verlässt sich auf sync_tensorboard)
# ----------------------------------------------------------------------

def create_env(data_config: dict, env_config: dict, start_date: str, end_date: str) -> gym.Env:
    """Build a PortfolioEnv for the given date range, wrapped in a Monitor."""
    return Monitor(
        PortfolioEnv(
            data_config=data_config,
            env_config=env_config,
            start_date=start_date,
            end_date=end_date,
        )
    )

def train(config):
    """Run one PPO training session described by ``config``.

    Starts a W&B run, builds the parallel training environments, the policy
    and (if ``save_model`` is set) the observability callback, then trains for
    ``total_timesteps``.

    Exceptions raised by ``model.learn`` are reported with a full traceback
    and not re-raised; exceptions raised while building the environments or
    the model propagate to the caller. In every case the training
    environments are closed and the W&B run is finished, with a non-zero exit
    code unless training completed.

    Args:
        config: mapping with the keys used below (see the config cell).
    """
    import traceback  # local import: only needed for crash reporting

    # --- 1. Start the W&B run ---
    run = wandb.init(
        project=config["project_name"],
        config=config,
        name=config["run_name"],
        sync_tensorboard=True,  # metrics reach W&B through the TensorBoard sync
        monitor_gym=True,
        save_code=True,
        reinit="finish_previous",
        dir=config["wandb_log_dir"]
    )
    config = wandb.config

    train_env = None
    exit_code = 1  # only set to 0 once learn() has returned normally
    try:
        # --- 2. Split the config for the environment ---
        data_cfg = {
            "feature_csv_path": config.feature_csv_path,
            "toy_data_csv_path": config.toy_data_csv_path
        }
        env_cfg = {
            "initial_balance": config.initial_balance,
            "window_size": config.window_size,
            "transaction_cost_pct": config.transaction_cost_pct
        }

        # --- 3. Environments ---
        print(f"Erstelle {config.num_cpu_cores} parallele TRAIN-Umgebungen...")
        train_env_partial = partial(
            create_env, data_config=data_cfg, env_config=env_cfg,
            start_date=config.train_start_date, end_date=config.train_end_date
        )
        train_env = SubprocVecEnv([train_env_partial for _ in range(config.num_cpu_cores)])

        # The evaluation env itself is created later, inside the callback.
        eval_env_partial = partial(
            create_env, data_config=data_cfg, env_config=env_cfg,
            start_date=config.eval_start_date, end_date=config.eval_end_date
        )

        # --- 4. Policy setup ---
        policy_kwargs = dict(
            features_extractor_class=CustomCombinedExtractor,
            features_extractor_kwargs=dict(
                 extractor_type=config.extractor_type,
                 hidden_size=config.extractor_hidden_size
            ),
            net_arch=dict(pi=config.policy_pi_arch, vf=config.policy_vf_arch)
        )

        # --- 5. Callbacks ---
        # No WandbCallback: only the custom callback is used.
        callback_list = []

        if config.save_model:
            save_path = os.path.join(config.model_save_dir, config.run_name)
            os.makedirs(save_path, exist_ok=True)
            print(f"Logs & Modelle in: {save_path}")

            eval_freq_steps = max(config.n_steps // config.num_cpu_cores, 1)

            unified_callback = UnifiedObservabilityCallback(
                eval_env_fn=eval_env_partial,
                eval_freq=eval_freq_steps,
                log_path=save_path,
                verbose=0  # silent mode
            )
            callback_list.append(unified_callback)

        # --- 6. Model ---
        model = PPO(
            policy=CustomActorCriticPolicy,
            env=train_env,
            n_steps=config.n_steps,
            batch_size=config.batch_size,
            n_epochs=config.n_epochs,
            learning_rate=config.learning_rate,
            gamma=config.gamma,
            gae_lambda=config.gae_lambda,
            clip_range=config.clip_range,
            vf_coef=config.vf_coef,
            ent_coef=config.ent_coef,
            policy_kwargs=policy_kwargs,
            tensorboard_log=f"runs/{run.id}",
            device="cuda" if torch.cuda.is_available() else "cpu",
            verbose=0
        )

        print(f"--- PPO Start (Device: {model.device}) ---")

        # --- 7. Learn ---
        try:
            model.learn(
                total_timesteps=config.total_timesteps,
                callback=CallbackList(callback_list),
                progress_bar=False
            )
            print("--- Training Done ---")
            exit_code = 0
        except Exception as e:
            print(f"CRASH: {e}")
            # The message alone rarely locates the failure; show the stack too.
            traceback.print_exc()
    finally:
        # Runs for setup failures as well, so worker processes and the W&B
        # run are never left dangling.
        if train_env is not None:
            train_env.close()
        run.finish(exit_code=exit_code)
        print("--- Cleanup Done ---")

if __name__ == "__main__":
    # Start training only when the config cell has defined a non-empty `config`.
    global_config = globals().get('config')
    if not global_config:
        print("FEHLER: Config fehlt.")
    else:
        train(global_config)

  self.scope.user = {"email": email}


Erstelle 2 parallele TRAIN-Umgebungen...




Logs & Modelle in: /content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/models/PPO_LSTM_CSV_a4fc52fd
CustomCombinedExtractor (Typ: LSTM, Hidden: 128) initialisiert.
--- PPO Start (Device: cuda) ---
(Worker) Lade Daten: /content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization/processed_data/features_cleaned.csv
(Worker) Daten geladen: (T=5284, A=503, F=3)


  self.FromDatetime(datetime.datetime.utcnow())
  return datetime.utcnow().replace(tzinfo=utc)
