In [1]:
# 1. Installationen
# ----------------------------------------------------------------------
# Wir installieren gymnasium, stable-baselines3, wandb und shimmy für Kompatibilität
!pip install gymnasium stable-baselines3[extra] wandb shimmy pandas numpy torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# 2. Imports & Konfiguration
# ----------------------------------------------------------------------
import os
import uuid
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import gymnasium as gym
from gymnasium import spaces
from functools import partial
from typing import Callable

# Stable Baselines 3
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback, CallbackList

# W&B
import wandb

# Runtime detection: Google Colab (Drive + secrets store) or a local machine.
ENV = "colab"
try:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    WANDB_API_KEY = userdata.get('WANDB_API_KEY')
    BASE_DIR = '/content/drive/MyDrive/01_Data/projects/PPO_portfolio_optimization'
    print("Umgebung: Google Colab. Drive gemountet.")
except ImportError:
    ENV = "local"
    # SECURITY: credentials must never be hard-coded in a notebook. The key is
    # taken from the WANDB_API_KEY environment variable instead. Any key that
    # was previously committed in this file has to be revoked in the W&B settings.
    WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
    BASE_DIR = "./"
    print("Lokale Umgebung.")

if WANDB_API_KEY:
    # Export the key for this process and log in non-interactively.
    os.environ['WANDB_API_KEY'] = WANDB_API_KEY
    wandb.login(key=WANDB_API_KEY)
else:
    # No key available: leave authentication to wandb's own login handling.
    print("Kein WANDB_API_KEY gesetzt - W&B Login wird nicht automatisch durchgeführt.")

# --- CONFIGURATION ---
# Single source of truth for the run; the whole dict is handed to train().
config = dict(
    # Experiment tracking and output locations
    project_name="PPO_Portfolio_SP500",
    run_name="PPO_LSTM_Transfer_" + uuid.uuid4().hex[:8],  # random suffix per run
    use_wandb=True,
    save_model=True,
    feature_csv_path=os.path.join(BASE_DIR, 'processed_data', 'features_cleaned.csv'),
    model_save_dir=os.path.join(BASE_DIR, 'models'),
    wandb_log_dir=os.path.join(BASE_DIR, 'data', 'wandb_logs'),

    # Resume from a W&B model artifact. Leave empty to start from scratch.
    # Format: 'entity/project/artifact_name:version' or ':alias'
    # Example: 'cb-ml/PPO_Portfolio_SP500/PPO_LSTM_CSV_f074b6dc_BEST:latest'
    load_artifact_path="",

    # Date ranges for the training and evaluation environments
    train_start_date='2006-01-01',
    train_end_date='2019-12-31',
    eval_start_date='2020-01-01',
    eval_end_date='2023-12-31',

    # Environment parameters
    initial_balance=10_000.0,
    window_size=30,
    transaction_cost_pct=5e-4,

    # Training budget and optimisation
    total_timesteps=30_000_000,
    learning_rate=3e-4,
    extractor_rnn_dropout=0.2,

    # PPO / policy hyper-parameters
    num_cpu_cores=4,
    n_steps=4096,
    batch_size=1024,
    n_epochs=5,
    ent_coef=1e-4,
    clip_range=0.2,
    extractor_type="LSTM",
    extractor_hidden_size=128,
    policy_pi_arch=[64],
    policy_vf_arch=[64],
)

# Make sure both output directories exist before anything writes to them.
os.makedirs(config["model_save_dir"], exist_ok=True)
os.makedirs(config["wandb_log_dir"], exist_ok=True)

2025-11-25 06:24:27.149666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764051867.167065    9414 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764051867.172632    9414 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1764051867.191642    9414 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764051867.191659    9414 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1764051867.191661    9414 computation_placer.cc:177] computation placer alr

Lokale Umgebung.


[34m[1mwandb[0m: Currently logged in as: [33mchristoph-bieritz[0m ([33mcb-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# 4. Environment Class (MIT SMART DATA LOADER)
# ----------------------------------------------------------------------
class PortfolioEnv(gym.Env):
    """Portfolio-allocation environment driven by a pre-computed feature CSV.

    Observation (dict):
        ``market_data``       the ``window_size`` rows preceding the current step,
                              shape ``(window_size, num_assets, num_features)``.
        ``portfolio_weights`` current weights, index 0 is cash, then one per asset.
    Action:
        One logit per portfolio component. ``step`` clips the logits to
        [-20, 20] and applies a softmax to obtain the target weights.
    Reward:
        Log change of the portfolio value for the step, multiplied by 100.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self, data_config, env_config, start_date, end_date):
        """Load the data and define the spaces.

        Args:
            data_config: mapping providing ``feature_csv_path``.
            env_config: mapping providing ``window_size``, ``initial_balance``
                and ``transaction_cost_pct``.
            start_date: first date of the episode range (parsed by pandas).
            end_date: last date of the episode range (parsed by pandas).

        Raises:
            ValueError: if the date range leaves no step to trade on.
        """
        super().__init__()
        self.data_config = data_config
        self.env_config = env_config
        self.start_date_str = start_date
        self.end_date_str = end_date

        self.window_size = self.env_config['window_size']
        self.initial_balance = self.env_config['initial_balance']
        self.transaction_cost_pct = self.env_config['transaction_cost_pct']

        # 1. Load data and derive the tick range of an episode
        self._load_and_reshape_data()
        self._set_time_slices()

        # 2. Portfolio state (component 0 is cash) and spaces
        self.num_portfolio_components = self.num_assets + 1
        self.portfolio_value = self.initial_balance
        self.prev_portfolio_value = self.initial_balance
        self.portfolio_weights = np.zeros(self.num_portfolio_components, dtype=np.float32)
        self.portfolio_weights[0] = 1.0
        # Defined here as well so that the attribute exists before the first reset().
        self.done = False

        self.action_space = spaces.Box(low=-20.0, high=20.0, shape=(self.num_portfolio_components,), dtype=np.float32)

        self.observation_space = spaces.Dict({
            "market_data": spaces.Box(low=-np.inf, high=np.inf, shape=(self.window_size, self.num_assets, self.num_features), dtype=np.float32),
            "portfolio_weights": spaces.Box(low=0.0, high=1.0, shape=(self.num_portfolio_components,), dtype=np.float32),
        })

    def _load_and_reshape_data(self):
        """Read the CSV and build a ``(time, assets, features)`` float32 array.

        Two layouts are handled: a long format with ``Date`` and ``Ticker``
        columns, and otherwise a wide file with a two-row column header.
        Sets ``_market_data_numpy``, ``full_data_index``, ``num_assets``,
        ``num_features`` and ``return_feature_idx``.
        """
        csv_path = self.data_config.get('feature_csv_path')
        print(f"(Worker) Lade Daten: {csv_path}")

        # Generic load first, the layout is detected from the columns
        df = pd.read_csv(csv_path)

        if 'Date' in df.columns and 'Ticker' in df.columns:
            # Long format: one row per (Date, Ticker)
            df['Date'] = pd.to_datetime(df['Date'])
            df.set_index(['Date', 'Ticker'], inplace=True)

            # Wide format: index = Date, columns = (feature, ticker)
            df_wide = df.unstack(level='Ticker')

            # Fill gaps forward, then backward for leading gaps.
            # NOTE(review): bfill copies later values into earlier dates - confirm
            # this is acceptable for the features in use.
            df_wide = df_wide.ffill().bfill()

            # Back to one row per (Date, Ticker) with every combination kept,
            # so the values reshape cleanly to (time, assets, features).
            stacked = df_wide.stack(level='Ticker', future_stack=True)

            # Take the names from the frame that actually supplies the values,
            # so return_feature_idx always points at the matching column.
            self.feature_names = stacked.columns.tolist()
            self.asset_names = df_wide.columns.get_level_values('Ticker').unique().tolist()

            total_timesteps = len(df_wide)
            self.num_assets = len(self.asset_names)
            self.num_features = len(self.feature_names)

            self._market_data_numpy = stacked.values.reshape(
                total_timesteps, self.num_assets, self.num_features
            ).astype(np.float32)
            self.full_data_index = df_wide.index

            # Locate the column used as per-asset return in step()
            try:
                self.return_feature_idx = next(i for i, name in enumerate(self.feature_names)
                                             if 'return' in name.lower() or 'pct' in name.lower() or 'change' in name.lower())
                print(f"Return-Feature gefunden an Index {self.return_feature_idx}: {self.feature_names[self.return_feature_idx]}")
            except StopIteration:
                print("WARNUNG: Kein explizites Return-Feature gefunden. Nutze Index 0.")
                self.return_feature_idx = 0

        else:
            # Fallback: CSV with a two-level column header (feature, asset)
            df = pd.read_csv(csv_path, header=[0, 1], index_col=0, parse_dates=True)
            df.sort_index(inplace=True)
            self.full_data_index = df.index
            self.num_features = df.columns.get_level_values(0).nunique()
            self.num_assets = df.columns.get_level_values(1).nunique()
            self._market_data_numpy = df.stack(level=1, future_stack=True).values.reshape(
                len(df), self.num_assets, self.num_features
            ).astype(np.float32)
            self.return_feature_idx = 0  # no names to search, use the first feature

    def _set_time_slices(self):
        """Translate the configured date range into ``start_tick`` / ``end_tick``.

        Raises:
            ValueError: if no step remains between ``start_tick`` and ``end_tick``.
        """
        start_ts = pd.to_datetime(self.start_date_str)
        end_ts = pd.to_datetime(self.end_date_str)

        # Clamp the requested range to the dates that are available
        if start_ts < self.full_data_index[0]: start_ts = self.full_data_index[0]
        if end_ts > self.full_data_index[-1]: end_ts = self.full_data_index[-1]

        start_idx = self.full_data_index.searchsorted(start_ts)
        end_idx = self.full_data_index.searchsorted(end_ts)

        if end_idx >= len(self.full_data_index): end_idx = len(self.full_data_index) - 1

        # The first observation needs window_size rows of history
        self.start_tick = max(start_idx, self.window_size)
        self.end_tick = end_idx

        if self.start_tick >= self.end_tick:
            raise ValueError(
                f"No tradable steps between {self.start_date_str} and {self.end_date_str} "
                f"(start_tick={self.start_tick}, end_tick={self.end_tick}, "
                f"window_size={self.window_size})."
            )

        self.current_step = self.start_tick

    def reset(self, seed=None, options=None):
        """Restart the episode at ``start_tick`` with everything held in cash.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = self.start_tick
        self.portfolio_value = self.initial_balance
        self.prev_portfolio_value = self.initial_balance
        self.portfolio_weights = np.zeros(self.num_portfolio_components, dtype=np.float32)
        self.portfolio_weights[0] = 1.0
        self.done = False
        return self._get_observation(), {}

    def _get_observation(self):
        """Return the window of rows before ``current_step`` and the weights."""
        end = self.current_step
        start = end - self.window_size
        return {
            "market_data": self._market_data_numpy[start:end],
            "portfolio_weights": self.portfolio_weights
        }

    def step(self, action):
        """Rebalance to the softmax of ``action`` and apply the step's returns.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` is set when the portfolio value collapses,
            ``truncated`` when the end of the configured date range is reached.
            ``info`` carries ``portfolio_value``, ``turnover``, ``weight_cash``
            and ``is_bankrupt``.
        """
        self.prev_portfolio_value = self.portfolio_value

        # 1. Softmax (action logits -> target weights)
        action_clipped = np.clip(action, -20, 20)
        exp_values = np.exp(action_clipped)
        target_weights = exp_values / np.sum(exp_values)

        # 2. Transaction costs on the traded asset weights (cash leg excluded)
        turnover = np.sum(np.abs(target_weights[1:] - self.portfolio_weights[1:]))
        costs = turnover * self.prev_portfolio_value * self.transaction_cost_pct

        # 3. Per-asset returns of the current row
        current_data = self._market_data_numpy[self.current_step]
        asset_returns = current_data[:, self.return_feature_idx]

        # NOTE(review): the value below is compounded as a simple return. If the
        # selected feature holds log returns (its name suggests so in the long
        # format), np.expm1(asset_returns) would be the exact conversion - confirm.
        weighted_return = np.sum(target_weights[1:] * asset_returns)

        val_after_cost = self.prev_portfolio_value - costs
        self.portfolio_value = val_after_cost * (1 + weighted_return)

        # Bankruptcy guard. A non-finite value (e.g. from a NaN action) is also
        # treated as a collapse, otherwise NaN would pass the "<" test and stick.
        terminated = False
        value_is_finite = bool(np.isfinite(self.portfolio_value))
        if not value_is_finite or self.portfolio_value < 1e-6:
            self.portfolio_value = 1e-6
            terminated = True

        # Reward: log return scaled by 100
        reward_scale = 100.0
        log_return = np.log(self.portfolio_value / max(self.prev_portfolio_value, 1e-6))
        reward = log_return * reward_scale

        # Fixed penalty whenever the computation broke down numerically
        if not value_is_finite or np.isnan(reward) or np.isinf(reward):
            reward = -10.0
        reward = float(reward)

        # Keep the stored weights in the dtype declared by the observation space
        self.portfolio_weights = target_weights.astype(np.float32)
        self.current_step += 1

        # Running out of data is a time limit, not a terminal state of the task:
        # report it as truncation so the learner can bootstrap the final value.
        truncated = (not terminated) and self.current_step >= self.end_tick
        self.done = terminated or truncated

        info = {
            "portfolio_value": self.portfolio_value,
            "turnover": turnover,
            "weight_cash": target_weights[0],
            "is_bankrupt": self.portfolio_value <= 1e-5
        }

        return self._get_observation(), reward, terminated, truncated, info

In [4]:
# 5. Custom Policy & Callbacks (MIT DROPOUT & CLOUD TRIGGER)
# ----------------------------------------------------------------------
import os
import numpy as np
import torch
import torch.nn as nn
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.callbacks import BaseCallback
import wandb # Wichtig für den Upload

# --- 1. Feature Extractor (LSTM mit Dropout) ---
class CustomCombinedExtractor(BaseFeaturesExtractor):
    """Encode the market window with a recurrent net and append the weights.

    The ``market_data`` window is flattened per time step to
    ``assets * features`` inputs and fed through a 2-layer recurrent network.
    The last layer's final hidden state is concatenated with
    ``portfolio_weights``, giving ``hidden_size + len(weights)`` features.

    Args:
        observation_space: dict space with ``market_data`` and ``portfolio_weights``.
        extractor_type: ``"LSTM"`` or ``"GRU"`` (case-insensitive).
        hidden_size: size of the recurrent hidden state.
        rnn_dropout: dropout between the two recurrent layers; values <= 0 disable it.

    Raises:
        ValueError: if ``extractor_type`` is not supported.
    """

    _RNN_CLASSES = {"LSTM": nn.LSTM, "GRU": nn.GRU}

    def __init__(self, observation_space, extractor_type="LSTM", hidden_size=128, rnn_dropout=0.0):
        market_space = observation_space["market_data"]
        weights_space = observation_space["portfolio_weights"]

        # The argument used to be ignored; reject typos instead of silently
        # building an LSTM.
        rnn_kind = str(extractor_type).upper()
        if rnn_kind not in self._RNN_CLASSES:
            raise ValueError(
                f"Unsupported extractor_type {extractor_type!r}; expected one of {sorted(self._RNN_CLASSES)}"
            )

        features_dim = hidden_size + weights_space.shape[0]
        super().__init__(observation_space, features_dim)

        # PyTorch applies recurrent dropout only between layers, hence num_layers=2.
        recurrent_net = self._RNN_CLASSES[rnn_kind](
            input_size=market_space.shape[1] * market_space.shape[2],
            hidden_size=hidden_size,
            num_layers=2,
            dropout=rnn_dropout if rnn_dropout > 0 else 0.0,
            batch_first=True
        )

        # Registered as ``self.lstm`` / ``self.gru``: the LSTM keeps its original
        # attribute name so parameter names of existing checkpoints still match.
        self._rnn_attr = rnn_kind.lower()
        setattr(self, self._rnn_attr, recurrent_net)

    def forward(self, observations):
        """Return ``[last hidden state, portfolio weights]`` per batch element."""
        market = observations["market_data"]
        weights = observations["portfolio_weights"]
        batch_size = market.shape[0]

        # (batch, window, assets, features) -> (batch, window, assets * features)
        market_flat = market.reshape(batch_size, market.shape[1], -1)

        _, state = getattr(self, self._rnn_attr)(market_flat)
        # LSTM returns (h_n, c_n), GRU returns h_n only
        hidden = state[0] if isinstance(state, tuple) else state

        # hidden[-1] is the final hidden state of the top layer
        return torch.cat([hidden[-1], weights], dim=1)

# --- 2. Callbacks ---

class TensorboardCallback(BaseCallback):
    """Collect turnover, cash weight and bankruptcy flags; log their rollout means."""

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.buffer = {"turnover": [], "cash": [], "bankrupt": []}

    def _on_step(self) -> bool:
        """Copy the tracked entries of every env's info dict into the buffer."""
        tracked = (("turnover", "turnover"), ("weight_cash", "cash"), ("is_bankrupt", "bankrupt"))
        for step_info in self.locals.get("infos", []):
            for info_key, buffer_key in tracked:
                if info_key in step_info:
                    self.buffer[buffer_key].append(step_info[info_key])
        return True

    def _on_rollout_end(self):
        """Record the means of the collected values, then start a fresh buffer."""
        collected = self.buffer
        if collected["turnover"]:
            for tag, buffer_key in (
                ("live/avg_turnover", "turnover"),
                ("live/avg_cash", "cash"),
                ("live/bankrupt_rate", "bankrupt"),
            ):
                self.logger.record(tag, np.mean(collected[buffer_key]))
        self.buffer = {"turnover": [], "cash": [], "bankrupt": []}

# [NEU] Dieser Callback lädt das beste Modell sofort hoch, wenn es gefunden wird
class TriggerWandbUploadCallback(BaseCallback):
    """Upload ``best_model.zip`` from ``save_path`` as a W&B model artifact.

    Intended to be passed to an EvalCallback as ``callback_on_new_best``.
    """

    def __init__(self, save_path, verbose=0):
        super().__init__(verbose)
        self.save_path = save_path

    def _on_step(self) -> bool:
        """Log the current best model as an artifact; upload errors are only printed."""
        best_model_path = os.path.join(self.save_path, "best_model.zip")

        # Nothing to upload without a saved model or an active W&B run
        if not os.path.exists(best_model_path) or wandb.run is None:
            return True

        try:
            # Artifact name: current run name + "_BEST"
            best_artifact = wandb.Artifact(name=f"{wandb.run.name}_BEST", type="model")
            best_artifact.add_file(best_model_path)
            wandb.log_artifact(best_artifact)
            if self.verbose > 0:
                print(" [Cloud] ☁️ Neues Highscore-Modell hochgeladen!")
        except Exception as upload_error:
            print(f" [Cloud] ❌ Upload fehlgeschlagen: {upload_error}")
        return True

class WandbCloudSaveCallback(BaseCallback):
    """Periodic checkpoint: save locally every ``save_freq`` calls and upload to W&B."""

    def __init__(self, save_freq, save_path, name_prefix):
        super().__init__(0)
        self.save_freq = save_freq
        self.save_path = save_path
        self.name_prefix = name_prefix

    def _on_step(self):
        """Save and upload a checkpoint when the call counter hits the interval."""
        if self.n_calls % self.save_freq != 0:
            return True

        checkpoint_stem = os.path.join(self.save_path, f"{self.name_prefix}_{self.num_timesteps}_steps")
        self.model.save(checkpoint_stem)

        # The upload is skipped when no W&B run is active
        if wandb.run is None:
            return True

        try:
            checkpoint_artifact = wandb.Artifact(name=f"{self.name_prefix}_ckpt", type="model")
            checkpoint_artifact.add_file(checkpoint_stem + ".zip")
            wandb.log_artifact(checkpoint_artifact)
        except Exception as upload_error:
            print(f"Checkpoint Upload Fehler: {upload_error}")
        return True

In [None]:
# 6. Training Loop (MIT W&B LOAD & SAVE)
# ----------------------------------------------------------------------

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """Build a schedule that scales ``initial_value`` by the remaining progress.

    Args:
        initial_value: value returned when ``progress_remaining`` is 1.0.

    Returns:
        A function mapping ``progress_remaining`` to
        ``progress_remaining * initial_value``.
    """
    return lambda progress_remaining: progress_remaining * initial_value

def train(config):
    """Run one PPO training session tracked in Weights & Biases.

    Creates the vectorised training environments, builds a new PPO model or
    restores one from the W&B artifact named in ``load_artifact_path``, wires
    the logging / evaluation / checkpoint callbacks and calls ``model.learn``.

    Args:
        config: mapping with the run settings (project and run name, paths,
            date ranges, environment parameters and PPO hyper-parameters).

    The W&B run and every environment created here are released on exit,
    including when setup or training raises.
    """
    import contextlib  # local import keeps this cell self-contained

    # W&B init
    run = wandb.init(
        project=config["project_name"],
        config=config,
        name=config["run_name"],
        sync_tensorboard=True,
        monitor_gym=True,
        dir=config["wandb_log_dir"]
    )

    # Callbacks registered on the stack run in reverse order on exit:
    # eval env, training envs, then the W&B run.
    with contextlib.ExitStack() as cleanup:
        cleanup.callback(run.finish)

        run.log_code(".", include_fn=lambda path: path.endswith(".py") or path.endswith(".ipynb"))
        cfg = wandb.config

        # Paths & env settings
        data_cfg = {"feature_csv_path": cfg.feature_csv_path}
        env_cfg = {
            "initial_balance": cfg.initial_balance,
            "window_size": cfg.window_size,
            "transaction_cost_pct": cfg.transaction_cost_pct
        }

        # Number of parallel workers comes from the config (was hard-coded to 4)
        num_cpu = max(1, int(cfg.get("num_cpu_cores", 4)))
        train_env = SubprocVecEnv([
            partial(create_env, data_cfg, env_cfg, cfg.train_start_date, cfg.train_end_date)
            for _ in range(num_cpu)
        ])
        cleanup.callback(train_env.close)

        callbacks = [TensorboardCallback()]
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # --- MODEL DEFINITION (architecture) ---
        policy_kwargs = dict(
            features_extractor_class=CustomCombinedExtractor,
            features_extractor_kwargs=dict(
                extractor_type=cfg.get("extractor_type", "LSTM"),
                hidden_size=cfg.extractor_hidden_size,
                rnn_dropout=cfg.get("extractor_rnn_dropout", 0.0)
            ),
            net_arch=dict(pi=cfg.policy_pi_arch, vf=cfg.policy_vf_arch)
        )

        # --- LOAD LOGIC (cloud restore) ---
        model = None

        if cfg.get("load_artifact_path"):
            print(f"--- Versuche Modell aus W&B Cloud zu laden: {cfg.load_artifact_path} ---")
            try:
                # 1. Download the artifact
                artifact = run.use_artifact(cfg.load_artifact_path, type='model')
                artifact_dir = artifact.download()
                print(f"Artifact heruntergeladen nach: {artifact_dir}")

                # 2. Load the first .zip file found in the artifact directory
                model_file = next((f for f in os.listdir(artifact_dir) if f.endswith(".zip")), None)
                if model_file:
                    full_path = os.path.join(artifact_dir, model_file)

                    # Values overridden for fine-tuning; everything else comes
                    # from the saved model.
                    custom_objects = {
                        "learning_rate": linear_schedule(cfg.learning_rate),
                        "ent_coef": cfg.ent_coef,
                        "clip_range": cfg.clip_range
                    }

                    model = PPO.load(
                        full_path,
                        env=train_env,
                        custom_objects=custom_objects,
                        device=device,
                        print_system_info=True
                    )
                    print(f"✅ ERFOLG: Modell '{model_file}' geladen und bereit für Training!")
                else:
                    print("❌ FEHLER: Keine .zip Datei im Artifact gefunden.")
            except Exception as e:
                # Best effort: a failed restore falls back to a fresh model
                print(f"❌ CRASH beim Laden aus Cloud: {e}")
                print("Starte stattdessen neues Modell...")
                model = None

        # Fallback: new model
        if model is None:
            print("Initialisiere NEUES PPO Modell...")
            model = PPO(
                "MultiInputPolicy",
                train_env,
                n_steps=cfg.n_steps,
                batch_size=cfg.batch_size,
                # n_epochs from the config was previously never passed on;
                # 10 mirrors the library default when the key is absent.
                n_epochs=cfg.get("n_epochs", 10),
                learning_rate=linear_schedule(cfg.learning_rate),
                ent_coef=cfg.ent_coef,
                clip_range=cfg.clip_range,
                policy_kwargs=policy_kwargs,
                tensorboard_log=f"runs/{run.id}",
                device=device,
                verbose=1
            )

        # --- CALLBACKS SETUP ---
        if cfg.save_model:
            save_path = os.path.join(cfg.model_save_dir, cfg.run_name)
            os.makedirs(save_path, exist_ok=True)

            # A) Uploads the best model as soon as a new one is found
            upload_trigger = TriggerWandbUploadCallback(save_path, verbose=1)

            # B) Evaluation env, closed on exit like the training envs
            eval_env = DummyVecEnv([
                partial(create_env, data_cfg, env_cfg, cfg.eval_start_date, cfg.eval_end_date)
            ])
            cleanup.callback(eval_env.close)

            # C) EvalCallback evaluates, saves the best model locally and
            #    triggers the upload callback on a new best
            eval_callback = EvalCallback(
                eval_env,
                best_model_save_path=save_path,
                log_path=save_path,
                eval_freq=cfg.n_steps,
                deterministic=True,
                verbose=1,
                callback_on_new_best=upload_trigger
            )
            callbacks.append(eval_callback)

            # D) Regular checkpoint saver (every 100_000 callback calls)
            callbacks.append(WandbCloudSaveCallback(100_000, save_path, cfg.run_name))

        # --- START ---
        print(f"Starte Training für {cfg.total_timesteps} Schritte...")
        model.learn(total_timesteps=cfg.total_timesteps, callback=CallbackList(callbacks))
        print("Training beendet.")

def create_env(data_config, env_config, start, end):
    """Build a PortfolioEnv for the given date range, wrapped in a Monitor."""
    return Monitor(PortfolioEnv(data_config, env_config, start, end))

if __name__ == "__main__":
    # Training needs the `config` dict from the configuration cell.
    if 'config' not in globals():
        print("Bitte Config-Zelle ausführen!")
    else:
        train(config)

2025-11-25 06:25:28.063875: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764051928.080081    9600 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-25 06:25:28.081966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-25 06:25:28.082307: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764051928.085433    9600 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-25 06:25:

Initialisiere NEUES PPO Modell...
Using cuda device
(Worker) Lade Daten: ./processed_data/features_cleaned.csv
Return-Feature gefunden an Index 0: LogReturns
Starte Training für 30000000 Schritte...
Logging to runs/wjhcgljh/PPO_1




Eval num_timesteps=16384, episode_reward=26.78 +/- 0.00
Episode length: 1006.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 1.01e+03 |
|    mean_reward     | 26.8     |
| time/              |          |
|    total_timesteps | 16384    |
---------------------------------
New best mean reward!
 [Cloud] ☁️ Neues Highscore-Modell hochgeladen!
-------------------------------------
| live/              |              |
|    avg_cash        | 0.0019833725 |
|    avg_turnover    | 1.0375264    |
|    bankrupt_rate   | 0            |
| rollout/           |              |
|    ep_len_mean     | 3.52e+03     |
|    ep_rew_mean     | -63.8        |
| time/              |              |
|    fps             | 877          |
|    iterations      | 1            |
|    time_elapsed    | 18           |
|    total_timesteps | 16384        |
-------------------------------------
Eval num_timesteps=32768, episode_reward=26.76 +/- 0.00
Episode length: