# Learn when to buy/sell given observations of recent price history only (the network is left to decide which features are important to extract from the price history)

The autocorrelogram is, in a sense, the "best" feature to use, since autocorrelation is the only statistical difference between fractional Brownian motion and regular Brownian motion (for which no trading strategy should be profitable on average).

Below we provide the network policy approximator with the price history only, *not* the autocorrelogram. The performance of the network is therefore essentially driven by its ability to identify autocorrelation as an important feature and to reconstruct it from the data.

In [None]:
from time import perf_counter

from pprint import pprint
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from env.rl_trading_env import rl_trading_env
from agent import *

# Device toggle shared by the notebook: forwarded to the model (`use_cuda=`)
# and to the agent configuration (`'use_cuda'`).
USE_CUDA = False

In [None]:
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

### Load CSV data

$H = 0.7$ (Hurst exponent above $0.5$, so increments are positively correlated): trend-following signal

In [None]:
# Read the CSV that backs the training environment created below.
# NOTE(review): presumably simulated fBm price paths with H = 0.7 (going by
# the file name and the heading above) — confirm against the data generator.
fname = 'fbm_0700.csv'
dfs = pd.read_csv(fname)

### Load Gym environment

In [None]:
# Training environment built on the loaded data frame, configured with a
# lag window of 10, 'return' observations and 'pnl' rewards.
env = rl_trading_env(
    dfs,
    n_lag=10,
    observe_type='return',
    reward_mode='pnl',
)

### Load model

In [None]:
# Policy network sized from the environment: first dimension of the
# observation space in, number of discrete actions out.
model = MLPModel(
    env.observation_space.shape[0],
    env.action_space.n,
    use_cuda=USE_CUDA,
)

### Load agent

In [None]:
# Hyper-parameters kept as named globals so they are easy to tweak and re-run.
learning_rate, gamma, seed = 0.01, 0.9, 1235

# Everything the agent needs is handed over in a single dictionary.
config = dict(
    env=env,
    learning_rate=learning_rate,
    seed=seed,
    gamma=gamma,
    verbose=10,
    max_episode_length=250,
    use_mean_baseline=True,
    use_cuda=USE_CUDA,
    model=model,
)

# Echo the configuration so the run is self-documenting in the notebook output.
print("Current config is:")
pprint(config)

agent = REINFORCE(config)

## Train

In [None]:
# Wall-clock the complete training run and report the elapsed seconds.
t1_start = perf_counter()
agent.train(n_update=2000, n_trajectories=15)
t1_stop = perf_counter()
print('Elapsed time during training: {:.2f}s'.format(t1_stop - t1_start))

In [None]:
# Render the training environment in its current (post-training) state; what
# is drawn is defined by `rl_trading_env.render`, which is not shown here.
env.render()

In [None]:
# Keep a second name bound to the trained agent. This is an alias to the same
# object, not a copy, so it survives `agent` being rebound later on.
agent_trained = agent

## Test on new price paths

In [None]:
# Build a fresh environment and agent on the held-out test CSV.
fname = 'fbm_0700_test.csv'
dfs = pd.read_csv(fname)

# Pass the same settings as the training environment explicitly instead of
# relying on the constructor defaults, so the policy is evaluated on the same
# kind of observations and rewards it was trained with.
env = rl_trading_env(dfs,
                     n_lag=10,
                     observe_type='return',
                     reward_mode='pnl',
                    )

# Only the environment is swapped; every other entry of `config`, including
# the `model` object given to the training agent, is left untouched.
# NOTE(review): assumes REINFORCE updates `config['model']` in place during
# training, so this new agent reuses the trained weights — confirm.
config['env'] = env
agent = REINFORCE(config)

In [None]:
# Roll the policy out on 10 episodes of the test environment and plot the
# cumulative per-step reward of each episode on a shared figure.

# Explicit import: `torch` was previously reachable only if the star import
# from `agent` happened to re-export it.
import torch

for _ in range(10):
    state = torch.FloatTensor(env.reset())
    done = False
    PnL = []  # per-step rewards of the current episode

    # Pure evaluation: no parameter update follows, so skip gradient tracking.
    with torch.no_grad():
        while not done:
            action = int(agent.model.select_action(state))
            state, reward, done, _ = env.step(action)
            state = torch.FloatTensor(state)
            PnL.append(reward)

    # One curve per episode.
    plt.plot(np.array(PnL).cumsum())

plt.xlabel('step')
plt.ylabel('cumulative reward')