# Reinforcement Learning for Portfolio Optimisation

This notebook demonstrates how to optimise a financial portfolio using both classical methods (Modern Portfolio Theory) and modern reinforcement learning (RL) techniques, including Proximal Policy Optimisation (PPO).

It is intended as a portfolio project example for roles involving applied AI, machine learning in finance, and reinforcement learning in production environments.

The following steps will be covered:

1. Download and prepare financial data
2. Classical optimisation using Markowitz's theory
3. Construction of a custom RL environment
4. Training an RL agent (PPO)
5. Comparison of results
6. Backtesting and evaluation


In [None]:
# Install required packages
!pip install yfinance stable-baselines3 gym pandas matplotlib

## Step 1: Download Financial Data

In [None]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Asset universe. The order of this list is the canonical asset order for
# every positional weight vector used with these returns.
tickers = ['AAPL', 'GOOGL', 'MSFT', 'AMZN', 'TSLA']

# Historical closing prices for the requested tickers over the sample window.
data = yf.download(tickers, start="2020-01-01", end="2022-01-01")["Close"]

# yfinance does not guarantee that the returned columns follow the order of
# the requested list (they typically come back sorted alphabetically).
# Re-order them so that column i always corresponds to tickers[i]; otherwise
# weights derived from the covariance matrix would be attributed to the
# wrong ticker whenever they are paired with `tickers` by position.
data = data.reindex(columns=tickers)

# Simple day-over-day returns; the first row has no predecessor and is dropped.
daily_returns = data.pct_change().dropna()
# Per-asset mean daily return and the covariance matrix of daily returns.
expected_returns = daily_returns.mean()
cov_matrix = daily_returns.cov()

data.head()

## Step 2: Classical Portfolio Optimisation

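This step uses Markowitz's Modern Portfolio Theory to find the long-only weights that minimise risk (portfolio variance), subject to the weights summing to 1 and every asset receiving at least 5% of the portfolio.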

In [None]:
from scipy.optimize import minimize

def compute_portfolio_variance(weights, cov_matrix):
    """Return the portfolio variance, i.e. the quadratic form w^T . Sigma . w.

    Args:
        weights: NumPy array of asset weights.
        cov_matrix: Square covariance matrix of asset returns, ordered like
            ``weights``.

    Returns:
        The value of ``weights.T @ cov_matrix @ weights``.
    """
    # Evaluate right-to-left: first Sigma . w, then w^T . (Sigma . w).
    sigma_w = np.dot(cov_matrix, weights)
    return np.dot(weights.T, sigma_w)

def constraint(weights):
    """Equality-constraint residual: zero exactly when the weights sum to 1.

    Args:
        weights: Sequence or array of asset weights.

    Returns:
        ``sum(weights) - 1``.
    """
    total_allocation = np.sum(weights)
    return total_allocation - 1

# Asset labels taken from the covariance matrix itself, so that labels and
# optimised weights are guaranteed to be in the same order.
asset_names = list(cov_matrix.columns)
n_assets = len(asset_names)

# Long-only portfolio with a 5% minimum allocation per asset, expressed as
# box bounds rather than as a separate inequality constraint.
bounds = [(0.05, 1)] * n_assets

# Equal weights: a deterministic starting point that satisfies the sum-to-one
# constraint (and the 5% floor whenever there are at most 20 assets).
initial_weights = np.ones(n_assets) / n_assets

# The only remaining constraint is full investment (weights sum to 1).
constraints = [
    {'type': 'eq', 'fun': constraint},
]

result = minimize(compute_portfolio_variance, initial_weights, args=(cov_matrix,),
                  method='SLSQP', bounds=bounds, constraints=constraints)

# Do not silently report weights from a solve that did not converge.
if not result.success:
    raise RuntimeError(f"Portfolio optimisation did not converge: {result.message}")

optimized_weights = result.x
portfolio_return = np.dot(expected_returns, optimized_weights)
portfolio_variance_value = compute_portfolio_variance(optimized_weights, cov_matrix)

print("Optimised Weights:", optimized_weights)
print("Expected Return:", portfolio_return)
print("Portfolio Variance:", portfolio_variance_value)

# Plot: label each bar with the asset its weight actually belongs to.
plt.bar(asset_names, optimized_weights)
plt.title("Optimised Portfolio Weights (Classical)")
plt.xlabel("Stock")
plt.ylabel("Weight")
plt.show()

## Step 3: Create a Custom RL Environment
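This environment simulates daily portfolio rebalancing: at each step the agent observes the current day's asset returns, chooses a weight for every asset (clipped to at least 5% and normalised to sum to 1), and is rewarded with the portfolio's return on the following day.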

In [None]:
import gym
from gym import spaces

class PortfolioEnv(gym.Env):
    """Portfolio-rebalancing environment driven by a fixed table of asset returns.

    The observation is the vector of asset returns for the current row. The
    action is one raw weight per asset; it is clipped to ``[0.05, 1]`` and
    normalised to sum to 1. The reward is the return of the resulting
    portfolio on the next row, and ``nav`` is compounded with that same reward.

    The class follows the legacy ``gym`` API: ``reset()`` returns only the
    observation and ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, returns, initial_balance=1000):
        """Create the environment.

        Args:
            returns: 2-D table of per-period asset returns (rows are periods,
                columns are assets), e.g. a pandas DataFrame or NumPy array.
            initial_balance: Net asset value at the start of every episode.

        Raises:
            ValueError: If ``returns`` is not 2-D or has fewer than two rows
                (at least one observation row and one reward row are needed).
        """
        super().__init__()
        self.returns = np.asarray(returns, dtype=np.float64)
        if self.returns.ndim != 2 or self.returns.shape[0] < 2:
            raise ValueError("returns must be a 2-D table with at least two rows")

        self.n_assets = self.returns.shape[1]
        self.initial_balance = initial_balance
        # Number of transitions available: row 0 is only ever observed, and
        # every later row yields exactly one reward.
        self.max_steps = len(self.returns) - 1

        self.action_space = spaces.Box(low=0.05, high=1.0, shape=(self.n_assets,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.n_assets,), dtype=np.float32)

    def _observation(self):
        """Return the current row of returns in the dtype declared by observation_space."""
        return self.returns[self.current_step].astype(np.float32)

    def reset(self):
        """Start a new episode at row 0 with equal weights and the initial balance.

        Returns:
            The first observation (asset returns of row 0) as float32.
        """
        self.current_step = 0
        self.weights = np.ones(self.n_assets) / self.n_assets
        self.nav = self.initial_balance
        return self._observation()

    def step(self, action):
        """Rebalance to ``action`` and realise the next row's return.

        Args:
            action: Raw weight per asset; clipped to ``[0.05, 1]`` and then
                normalised so the weights sum to 1.

        Returns:
            Tuple ``(obs, reward, done, info)``: the new row of returns, the
            portfolio return earned on that row, whether the last row has
            been reached, and an empty info dict.
        """
        action = np.clip(action, 0.05, 1)
        self.weights = action / np.sum(action)
        self.current_step += 1

        # The chosen weights earn the return of the row just stepped into.
        reward = float(np.dot(self.returns[self.current_step], self.weights))
        # Compound NAV with exactly the return that was rewarded, so that
        # nav == initial_balance * prod(1 + reward) at every step.
        self.nav *= 1 + reward

        # max_steps is already len(returns) - 1, i.e. the index of the last
        # row; the episode ends once that row has been traded.
        done = self.current_step >= self.max_steps
        return self._observation(), reward, done, {}

## Step 4: Train PPO Agent on Portfolio Environment

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

# Build the trading environment from the daily-return table and run
# stable-baselines3's environment checker on it before training.
env = PortfolioEnv(returns=daily_returns)
check_env(env)

# PPO with the MLP policy, logging silenced, trained for 50,000 timesteps.
model = PPO(policy="MlpPolicy", env=env, verbose=0)
model.learn(total_timesteps=50_000)

## Step 5: Evaluate PPO Agent

In [None]:
# Roll the trained policy through one episode and record the per-step
# portfolio returns.
obs = env.reset()
rewards = []

for _ in range(env.max_steps):
    # deterministic=True: evaluate the policy without action sampling so the
    # reported performance is repeatable for a given trained model.
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    rewards.append(reward)
    if done:
        break

# Growth of one unit of capital under the PPO policy.
rl_growth = np.cumprod(1 + np.array(rewards))
# Derive the final NAV from the realised rewards so the printed figure
# always matches the plotted curve.
final_nav = env.initial_balance * rl_growth[-1]

print(f"Final NAV: £{final_nav:.2f}")
print(f"Mean Daily Return (RL): {np.mean(rewards):.4f}")

# Classical benchmark over the same days: rewards[k] is earned on row k + 1
# of daily_returns (row 0 is only the initial observation), so apply the
# fixed classical weights to rows 1 .. len(rewards).
classical_daily = daily_returns.values[1:len(rewards) + 1] @ optimized_weights
classical_growth = np.cumprod(1 + classical_daily)

# Plot performance: both strategies as cumulative-growth curves.
plt.plot(rl_growth, label="PPO Agent")
plt.plot(classical_growth, color='r', linestyle='--', label="Classical")
plt.title("Cumulative Return: PPO vs Classical")
plt.xlabel("Time Step")
plt.ylabel("Portfolio Growth")
plt.legend()
plt.show()