In [None]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from sklearn.preprocessing import MinMaxScaler
from stable_baselines3 import PPO
import matplotlib.pyplot as plt

# Load CSV data
df = pd.read_csv("cleaned_crypto_data.csv", parse_dates=['Date'])
df.set_index('Date', inplace=True)

# Print columns to check their names
print("CSV columns:", df.columns.tolist())

# Build the (time x asset) price matrix.
# When the file is in long format (one row per date and ticker, with a
# 'Crypto' ticker column and a 'Close' column), every ticker becomes one
# asset column holding its closing price. Blindly taking all numeric columns
# would instead treat Open/High/Low/Close/Volume as five separate "assets"
# and interleave rows of different tickers along the time axis.
if {'Crypto', 'Close'}.issubset(df.columns):
    prices_df = df.reset_index().pivot_table(
        index='Date', columns='Crypto', values='Close'
    )
    # Fill gaps with the last known price, then drop the leading rows where
    # at least one asset has no price yet.
    prices_df = prices_df.ffill().dropna()
else:
    # Fallback: the file is already wide (one numeric column per asset).
    prices_df = df.select_dtypes(include=[np.number])

if len(prices_df) < 2:
    raise ValueError("Need at least two rows of prices to compute returns")

price_columns = prices_df.columns.tolist()
print("Using price columns:", price_columns)

prices = prices_df.values

# Min-max scaled copy of the prices. Scaling does not preserve returns, so
# this is only suitable as a model feature, not as the traded price series.
scaler = MinMaxScaler()
prices_norm = scaler.fit_transform(prices)

# Create dummy micro indicators (zeros, or add your own indicators)
micro_indicators = np.zeros_like(prices)

# Create dummy regime (0 = bullish)
regime_classes = np.zeros(len(prices), dtype=int)

class PortfolioEnv(gym.Env):
    """Long-only portfolio allocation environment driven by a price matrix.

    Each step the agent proposes one non-negative score per asset; the scores
    are normalized into portfolio weights, a proportional transaction fee is
    charged on the weight turnover, and the portfolio value is then updated
    with the one-step return of the chosen weights.

    Observation layout (all float32):
        [price / first-row price for each asset,
         portfolio value / initial balance,
         0.5 (fixed risk-appetite placeholder),
         one-hot regime (``num_regimes`` entries),
         indicator row for the current step]

    Args:
        prices: 2-D array-like of shape (n_steps, n_assets), n_steps >= 2.
        regimes: Integer regime id per step, each in ``[0, num_regimes)``.
        indicators: 2-D array-like with one row of extra features per step.
        initial_balance: Starting portfolio value.
        fee: Proportional transaction fee applied to the traded weight.

    Raises:
        ValueError: If ``prices`` is not 2-D or has fewer than two rows.
    """

    def __init__(self, prices, regimes, indicators, initial_balance=10000, fee=0.001):
        super().__init__()
        self.prices = np.asarray(prices, dtype=np.float64)
        if self.prices.ndim != 2 or len(self.prices) < 2:
            raise ValueError("prices must be a 2-D array with at least two rows")
        self.regimes = np.asarray(regimes)
        self.indicators = np.asarray(indicators)
        self.initial_balance = initial_balance
        self.transaction_fee = fee

        self.n_assets = self.prices.shape[1]
        self.num_regimes = 3

        self.action_space = spaces.Box(low=0, high=1, shape=(self.n_assets,), dtype=np.float32)
        obs_dim = self.n_assets + 2 + self.num_regimes + self.indicators.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32)

    def reset(self):
        """Start a new episode at the first row with an equal-weight portfolio."""
        self.step_idx = 0
        self.portfolio_value = self.initial_balance
        self.weights = np.ones(self.n_assets) / self.n_assets
        # Same epsilon guard as the other divisions, so a zero price does not
        # produce infinite holdings.
        self.holdings = (self.portfolio_value * self.weights) / (self.prices[self.step_idx] + 1e-8)
        return self._get_obs()

    def _get_obs(self):
        """Build the observation vector for the current step."""
        regime_onehot = np.zeros(self.num_regimes)
        regime_onehot[self.regimes[self.step_idx]] = 1
        obs = np.concatenate([
            self.prices[self.step_idx] / (self.prices[0] + 1e-8),
            [self.portfolio_value / self.initial_balance],
            [0.5],  # risk appetite (fixed)
            regime_onehot,
            self.indicators[self.step_idx]
        ])
        # observation_space is declared float32; return a matching dtype.
        return obs.astype(np.float32)

    def step(self, action):
        """Rebalance to the weights implied by ``action`` and advance one row.

        Returns:
            Tuple ``(observation, reward, done, info)``. ``reward`` is the
            one-step portfolio return minus the transaction cost expressed as
            a fraction of the post-cost portfolio value.
        """
        action = np.clip(action, 0, 1)
        new_weights = action / (np.sum(action) + 1e-8)  # normalize weights

        # The fee is proportional to the total absolute change in weights.
        transaction_cost = np.sum(np.abs(self.weights - new_weights)) * self.portfolio_value * self.transaction_fee
        self.portfolio_value -= transaction_cost

        self.weights = new_weights
        next_prices = self.prices[self.step_idx + 1]
        current_prices = self.prices[self.step_idx]
        returns = (next_prices - current_prices) / (current_prices + 1e-8)  # avoid div by zero

        portfolio_return = np.dot(self.weights, returns)
        reward = portfolio_return - transaction_cost / max(self.portfolio_value, 1e-8)
        self.portfolio_value *= (1 + portfolio_return)

        self.step_idx += 1
        # The episode ends once the last row has been reached, i.e. when there
        # is no further row to read as "next prices". The previous bound of
        # len - 2 stopped one step early and never traded into the final row.
        done = self.step_idx >= len(self.prices) - 1
        return self._get_obs(), float(reward), done, {}

    def render(self, mode="human"):
        """Print the current step index and portfolio value."""
        print(f"Step {self.step_idx}, Portfolio Value: {self.portfolio_value:.2f}")

# Create and train environment.
# The environment derives returns from consecutive price rows, so it must be
# given the raw prices: min-max scaled prices do not preserve returns, and a
# scaled value of 0 makes the computed return explode (the denominator
# collapses to the 1e-8 guard).
env = PortfolioEnv(prices, regime_classes, micro_indicators)
model = PPO('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=50000)

# Evaluate (on the same data the model was trained on, i.e. in-sample)
obs = env.reset()
rewards, values = [], [env.portfolio_value]
done = False
while not done:
    # Use the policy's deterministic action so the evaluation curve is
    # reproducible rather than affected by exploration noise.
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    rewards.append(reward)
    values.append(env.portfolio_value)

# Plot portfolio value over time
plt.plot(values)
plt.title("Portfolio Value Over Time")
plt.xlabel("Days")
plt.ylabel("Value ($)")
plt.grid()
plt.show()


CSV columns: ['Crypto', 'Open', 'High', 'Low', 'Close', 'Volume']
Using price columns: ['Open', 'High', 'Low', 'Close', 'Volume']
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




-----------------------------
| time/              |      |
|    fps             | 634  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 408         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.006664174 |
|    clip_fraction        | 0.0578      |
|    clip_range           | 0.2         |
|    entropy_loss         | -7.09       |
|    explained_variance   | 0.117       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00835     |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00559    |
|    std                  | 0.998       |
|    value_loss           | 0.0379      |
----------------------------------