In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from copy import deepcopy
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

import gym
import gym.spaces
import numpy as np
from tqdm import tqdm


# Random process

In [3]:
class OrnsteinUhlenbeckProcess:
    """
    Ornstein-Uhnlenbeck process
    Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
    """

    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.X = np.ones(self.action_dim) * self.mu

    def reset(self):
        self.X = np.ones(self.action_dim) * self.mu

    def sample(self):
        dx = self.theta * (self.mu - self.X)
        dx = dx + self.sigma * np.random.randn(len(self.X))
        self.X = self.X + dx
        return self.X


In [4]:
class GaussianNoise:
    """
    Simple Gaussian noise
    """

    def __init__(self, action_dim, sigma=0.2):
        self.action_dim = action_dim
        self.sigma = sigma

    def sample(self):
        s = np.random.normal(scale=self.sigma, size=self.action_dim)
        return s

# Models

In [5]:
class RLNN(nn.Module):

    def __init__(self, state_dim, action_dim, max_action):
        super(RLNN, self).__init__()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

    def set_params(self, params):
        """
        Set the params of the network to the given parameters
        """
        cpt = 0
        for param in self.parameters():
            tmp = np.product(param.size())

            if torch.cuda.is_available():
                param.data.copy_(torch.from_numpy(
                    params[cpt:cpt + tmp]).view(param.size()).cuda())
            else:
                param.data.copy_(torch.from_numpy(
                    params[cpt:cpt + tmp]).view(param.size()))
            cpt += tmp

    def get_params(self):
        """
        Returns parameters of the actor
        """
        return deepcopy(np.hstack([to_numpy(v).flatten() for v in
                                   self.parameters()]))

    def get_grads(self):
        """
        Returns the current gradient
        """
        return deepcopy(np.hstack([to_numpy(v.grad).flatten() for v in self.parameters()]))

    def get_size(self):
        """
        Returns the number of parameters of the network
        """
        return self.get_params().shape[0]

    def load_model(self, filename, net_name):
        """
        Loads the model
        """
        if filename is None:
            return

        self.load_state_dict(
            torch.load('{}/{}.pkl'.format(filename, net_name),
                       map_location=lambda storage, loc: storage)
        )

    def save_model(self, output, net_name):
        """
        Saves the model
        """
        torch.save(
            self.state_dict(),
            '{}/{}.pkl'.format(output, net_name)
        )


# ES

In [6]:
class sepCEM:

    """
    Cross-entropy methods.
    """

    def __init__(self, num_params,
                 mu_init=None,
                 sigma_init=1e-3,
                 pop_size=256,
                 damp=1e-3,
                 damp_limit=1e-5,
                 parents=None,
                 elitism=False,
                 antithetic=False):

        # misc
        self.num_params = num_params

        # distribution parameters
        if mu_init is None:
            self.mu = np.zeros(self.num_params)
        else:
            self.mu = np.array(mu_init)
        self.sigma = sigma_init
        self.damp = damp
        self.damp_limit = damp_limit
        self.tau = 0.95
        self.cov = self.sigma * np.ones(self.num_params)

        # elite stuff
        self.elitism = elitism
        self.elite = np.sqrt(self.sigma) * np.random.rand(self.num_params)
        self.elite_score = None

        # sampling stuff
        self.pop_size = pop_size
        self.antithetic = antithetic

        if self.antithetic:
            assert (self.pop_size % 2 == 0), "Population size must be even"
        if parents is None or parents <= 0:
            self.parents = pop_size // 2
        else:
            self.parents = parents
        self.weights = np.array([np.log((self.parents + 1) / i)
                                 for i in range(1, self.parents + 1)])
        self.weights /= self.weights.sum()

    def ask(self, pop_size):
        """
        Returns a list of candidates parameters
        """
        if self.antithetic and not pop_size % 2:
            epsilon_half = np.random.randn(pop_size // 2, self.num_params)
            epsilon = np.concatenate([epsilon_half, - epsilon_half])

        else:
            epsilon = np.random.randn(pop_size, self.num_params)

        inds = self.mu + epsilon * np.sqrt(self.cov)
        if self.elitism:
            inds[-1] = self.elite

        return inds

    def tell(self, solutions, scores):
        """
        Updates the distribution
        """
        scores = np.array(scores)
        scores *= -1
        idx_sorted = np.argsort(scores)

        old_mu = self.mu
        self.damp = self.damp * self.tau + (1 - self.tau) * self.damp_limit
        self.mu = self.weights @ solutions[idx_sorted[:self.parents]]

        z = (solutions[idx_sorted[:self.parents]] - old_mu)
        self.cov = 1 / self.parents * self.weights @ (
            z * z) + self.damp * np.ones(self.num_params)

        self.elite = solutions[idx_sorted[0]]
        self.elite_score = scores[idx_sorted[0]]
        print(self.cov)

    def get_distrib_params(self):
        """
        Returns the parameters of the distrubtion:
        the mean and sigma
        """
        return np.copy(self.mu), np.copy(self.cov)


class Control:

    """
    Cross-entropy methods.
    """

    def __init__(self, num_params, mu_init, pop_size=256, sigma_init=1e-3):

        # misc
        self.num_params = num_params
        self.pop = np.sqrt(sigma_init) * np.random.randn(pop_size, num_params) + mu_init
        self.mu = np.zeros(num_params)

    def ask(self, pop_size):
        """
        Returns a list of candidates parameters
        """
        return self.pop

    def tell(self, solutions, scores):
        """
        Updates the distribution
        """
        self.mu = solutions[np.argmax(scores)]
        self.pop = solutions
        np.random.shuffle(self.pop)


# Memory

In [7]:

class Memory():

    def __init__(self, memory_size, state_dim, action_dim):

        # params
        self.memory_size = memory_size
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.pos = 0
        self.full = False

        if USE_CUDA:
            self.states = torch.zeros(self.memory_size, self.state_dim).cuda()
            self.actions = torch.zeros(
                self.memory_size, self.action_dim).cuda()
            self.n_states = torch.zeros(
                self.memory_size, self.state_dim).cuda()
            self.rewards = torch.zeros(self.memory_size, 1).cuda()
            self.dones = torch.zeros(self.memory_size, 1).cuda()

        else:
            self.states = torch.zeros(self.memory_size, self.state_dim)
            self.actions = torch.zeros(self.memory_size, self.action_dim)
            self.n_states = torch.zeros(self.memory_size, self.state_dim)
            self.rewards = torch.zeros(self.memory_size, 1)
            self.dones = torch.zeros(self.memory_size, 1)

    def size(self):
        if self.full:
            return self.memory_size
        return self.pos

    def get_pos(self):
        return self.pos

    # Expects tuples of (state, next_state, action, reward, done)

    def add(self, datum):

        state, n_state, action, reward, done = datum

        self.states[self.pos] = FloatTensor(state)
        self.n_states[self.pos] = FloatTensor(n_state)
        self.actions[self.pos] = FloatTensor(action)
        self.rewards[self.pos] = FloatTensor([reward])
        self.dones[self.pos] = FloatTensor([done])

        self.pos += 1
        if self.pos == self.memory_size:
            self.full = True
            self.pos = 0

    def sample(self, batch_size):

        upper_bound = self.memory_size if self.full else self.pos
        batch_inds = torch.LongTensor(
            np.random.randint(0, upper_bound, size=batch_size))

        return (self.states[batch_inds],
                self.n_states[batch_inds],
                self.actions[batch_inds],
                self.rewards[batch_inds],
                self.dones[batch_inds])

    def get_reward(self, start_pos, end_pos):

        tmp = 0
        if start_pos <= end_pos:
            for i in range(start_pos, end_pos):
                tmp += self.rewards[i]
        else:
            for i in range(start_pos, self.memory_size):
                tmp += self.rewards[i]

            for i in range(end_pos):
                tmp += self.rewards[i]

        return tmp

    def repeat(self, start_pos, end_pos):

        if start_pos <= end_pos:
            for i in range(start_pos, end_pos):

                self.states[self.pos] = self.states[i].clone()
                self.n_states[self.pos] = self.n_states[i].clone()
                self.actions[self.pos] = self.actions[i].clone()
                self.rewards[self.pos] = self.rewards[i].clone()
                self.dones[self.pos] = self.dones[i].clone()

                self.pos += 1
                if self.pos == self.memory_size:
                    self.full = True
                    self.pos = 0

        else:
            for i in range(start_pos, self.memory_size):

                self.states[self.pos] = self.states[i].clone()
                self.n_states[self.pos] = self.n_states[i].clone()
                self.actions[self.pos] = self.actions[i].clone()
                self.rewards[self.pos] = self.rewards[i].clone()
                self.dones[self.pos] = self.dones[i].clone()

                self.pos += 1
                if self.pos == self.memory_size:
                    self.full = True
                    self.pos = 0

            for i in range(end_pos):

                self.states[self.pos] = self.states[i].clone()
                self.n_states[self.pos] = self.n_states[i].clone()
                self.actions[self.pos] = self.actions[i].clone()
                self.rewards[self.pos] = self.rewards[i].clone()
                self.dones[self.pos] = self.dones[i].clone()

                self.pos += 1
                if self.pos == self.memory_size:
                    self.full = True
                    self.pos = 0



# parameters

In [8]:
parser = argparse.ArgumentParser()

parser.add_argument('-f')
parser.add_argument('--mode', default='train', type=str,)
parser.add_argument('--env', default='HalfCheetah-v2', type=str)
parser.add_argument('--start_steps', default=10000, type=int)

# DDPG parameters
parser.add_argument('--actor_lr', default=0.001, type=float)
parser.add_argument('--critic_lr', default=0.001, type=float)
parser.add_argument('--batch_size', default=100, type=int)
parser.add_argument('--discount', default=0.99, type=float)
parser.add_argument('--reward_scale', default=1., type=float)
parser.add_argument('--tau', default=0.005, type=float)
parser.add_argument('--layer_norm', dest='layer_norm', action='store_true')

# TD3 parameters
parser.add_argument('--use_td3', dest='use_td3', action='store_true')
parser.add_argument('--policy_noise', default=0.2, type=float)
parser.add_argument('--noise_clip', default=0.5, type=float)
parser.add_argument('--policy_freq', default=2, type=int)

# Gaussian noise parameters
parser.add_argument('--gauss_sigma', default=0.1, type=float)

# OU process parameters
parser.add_argument('--ou_noise', dest='ou_noise', action='store_true')
parser.add_argument('--ou_theta', default=0.15, type=float)
parser.add_argument('--ou_sigma', default=0.2, type=float)
parser.add_argument('--ou_mu', default=0.0, type=float)

# ES parameters
parser.add_argument('--pop_size', default=10, type=int)
parser.add_argument('--elitism', dest="elitism",  action='store_true')
parser.add_argument('--n_grad', default=5, type=int)
parser.add_argument('--sigma_init', default=1e-3, type=float)
parser.add_argument('--damp', default=1e-3, type=float)
parser.add_argument('--damp_limit', default=1e-5, type=float)
parser.add_argument('--mult_noise', dest='mult_noise', action='store_true')

# Training parameters
parser.add_argument('--n_episodes', default=1, type=int)
parser.add_argument('--max_steps', default=1000000, type=int)
parser.add_argument('--mem_size', default=1000000, type=int)
parser.add_argument('--n_noisy', default=0, type=int)

# Testing parameters
parser.add_argument('--filename', default="", type=str)
parser.add_argument('--n_test', default=1, type=int)

# misc
parser.add_argument('--output', default='/content/results/', type=str)
parser.add_argument('--period', default=5000, type=int)
parser.add_argument('--n_eval', default=10, type=int)
parser.add_argument('--save_all_models',
                    dest="save_all_models", action="store_true")
parser.add_argument('--debug', dest='debug', action='store_true')
parser.add_argument('--seed', default=-1, type=int)
parser.add_argument('--render', dest='render', action='store_true')

_StoreTrueAction(option_strings=['--render'], dest='render', nargs=0, const=True, default=False, type=None, choices=None, help=None, metavar=None)

In [9]:
import os

USE_CUDA = torch.cuda.is_available()


def prRed(prt):
    print("\033[91m{}\033[00m" .format(prt))


def prGreen(prt):
    print("\033[92m{}\033[00m" .format(prt))


def prYellow(prt):
    print("\033[93m{}\033[00m" .format(prt))


def prLightPurple(prt):
    print("\033[94m{}\033[00m" .format(prt))


def prPurple(prt):
    print("\033[95m{}\033[00m" .format(prt))


def prCyan(prt):
    print("\033[96m{}\033[00m" .format(prt))


def prLightGray(prt):
    print("\033[97m{}\033[00m" .format(prt))


def prBlack(prt):
    print("\033[98m{}\033[00m" .format(prt))


def to_numpy(var):
    return var.cpu().data.numpy() if USE_CUDA else var.data.numpy()


def to_tensor(x, dtype="float"):
    """
    Numpy array to tensor
    """

    FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
    LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
    ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

    if dtype == "float":
        x = np.array(x, dtype=np.float64).tolist()
        return FloatTensor(x)
    elif dtype == "long":
        x = np.array(x, dtype=np.long).tolist()
        return LongTensor(x)
    elif dtype == "byte":
        x = np.array(x, dtype=np.byte).tolist()
        return ByteTensor(x)
    else:
        x = np.array(x, dtype=np.float64).tolist()

    return FloatTensor(x)


def soft_update(target, source, tau):
    """
    Performs a soft target update
    """
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + param.data * tau
        )


def hard_update(target, source):
    """
    Performs a hard target update
    """
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)

def get_output_folder(parent_dir, env_name):
    """Return save folder.
    Assumes folders in the parent_dir have suffix -run{run
    number}. Finds the highest run number and sets the output folder
    to that number + 1. This is just convenient so that if you run the
    same script multiple times tensorboard can plot all of the results
    on the same plots with different names.
    Parameters
    ----------
    parent_dir: str
      Path of the directory containing all experiment runs.
    Returns
    -------
    parent_dir/run_dir
      Path to this run's save directory.
    """
    os.makedirs(parent_dir, exist_ok=True)
    experiment_id = 0
    for folder_name in os.listdir(parent_dir):
        if not os.path.isdir(os.path.join(parent_dir, folder_name)):
            continue
        try:
            folder_name = int(folder_name.split('-run')[-1])
            if folder_name > experiment_id:
                experiment_id = folder_name
        except:
            pass
    experiment_id += 1

    parent_dir = os.path.join(parent_dir, env_name)
    parent_dir = parent_dir + '-run{}'.format(experiment_id)
    os.makedirs(parent_dir, exist_ok=True)
    return parent_dir

In [10]:
args = parser.parse_args()
args.output = get_output_folder(args.output, args.env)
with open(args.output + "/parameters.txt", 'w') as file:
    for key, value in vars(args).items():
        file.write("{} = {}\n".format(key, value))

# Hope :)))

In [11]:
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
else:
    FloatTensor = torch.FloatTensor

In [12]:
def evaluate(actor, env, memory=None, n_episodes=1, random=False, noise=None, render=False):
    """
    Computes the score of an actor on a given number of runs,
    fills the memory if needed
    """

    if not random:
        def policy(state):
            state = FloatTensor(state.reshape(-1))
            action = actor(state).cpu().data.numpy().flatten()

            if noise is not None:
                action += noise.sample()

            return np.clip(action, -max_action, max_action)

    else:
        def policy(state):
            return env.action_space.sample()

    scores = []
    steps = 0

    for _ in range(n_episodes):

        score = 0
        obs = deepcopy(env.reset())
        done = False

        while not done:

            # get next action and act
            action = policy(obs)
            n_obs, reward, done, _ = env.step(action)
            # done_bool = 0 if steps + \
                # 1 == env._max_episode_steps else float(done)
            score += reward
            steps += 1

            # adding in memory
            if memory is not None:
                memory.add((obs.flatten(), n_obs.flatten(), action, reward, done))
            obs = n_obs

            # render if needed
            if render:
                env.render()

            # reset when done
            if done:
                
                env.reset()
                break
            
            # print(f"step {steps}, score: ",score)

        scores.append(score)
        
    # print("MEAN SCORE: ",np.mean(scores))


    return np.mean(scores), steps

In [13]:
class Actor(RLNN):

    def __init__(self, state_dim, action_dim, max_action, args):
        super(Actor, self).__init__(state_dim, action_dim, max_action)

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        if args.layer_norm:
            self.n1 = nn.LayerNorm(400)
            self.n2 = nn.LayerNorm(300)
        self.layer_norm = args.layer_norm

        self.optimizer = torch.optim.Adam(self.parameters(), lr=args.actor_lr)
        self.tau = args.tau
        self.discount = args.discount
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

    def forward(self, x):

        if not self.layer_norm:
            x = torch.tanh(self.l1(x))
            x = torch.tanh(self.l2(x))
            x = self.max_action * torch.tanh(self.l3(x))

        else:
            x = torch.tanh(self.n1(self.l1(x)))
            x = torch.tanh(self.n2(self.l2(x)))
            x = self.max_action * torch.tanh(self.l3(x))

        return x

    def update(self, memory, batch_size, critic, actor_t):

        # Sample replay buffer
        states, _, _, _, _ = memory.sample(batch_size)

        # Compute actor loss
        if args.use_td3:
            actor_loss = -critic(states, self(states))[0].mean()
        else:
            actor_loss = -critic(states, self(states)).mean()

        # Optimize the actor
        self.optimizer.zero_grad()
        actor_loss.backward()
        self.optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.parameters(), actor_t.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1 - self.tau) * target_param.data)
            
class Critic(RLNN):
    def __init__(self, state_dim, action_dim, max_action, args):
        super(Critic, self).__init__(state_dim, action_dim, 1)

        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        if args.layer_norm:
            self.n1 = nn.LayerNorm(400)
            self.n2 = nn.LayerNorm(300)

        self.layer_norm = args.layer_norm
        self.optimizer = torch.optim.Adam(self.parameters(), lr=args.critic_lr)
        self.tau = args.tau
        self.discount = args.discount
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

    def forward(self, x, u):

        if not self.layer_norm:
            x = F.leaky_relu(self.l1(torch.cat([x, u], 1)))
            x = F.leaky_relu(self.l2(x))
            x = self.l3(x)

        else:
            x = F.leaky_relu(self.n1(self.l1(torch.cat([x, u], 1))))
            x = F.leaky_relu(self.n2(self.l2(x)))
            x = self.l3(x)

        return x

    def update(self, memory, batch_size, actor_t, critic_t):

        # Sample replay buffer
        states, n_states, actions, rewards, dones = memory.sample(batch_size)

        # Q target = reward + discount * Q(next_state, pi(next_state))
        with torch.no_grad():
            target_Q = critic_t(n_states, actor_t(n_states))
            target_Q = rewards + (1 - dones) * self.discount * target_Q

        # Get current Q estimate
        current_Q = self(states, actions)

        # Compute critic loss
        critic_loss = nn.MSELoss()(current_Q, target_Q)

        # Optimize the critic
        self.optimizer.zero_grad()
        critic_loss.backward()
        self.optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.parameters(), critic_t.parameters()):
            target_param.data.copy_(
                self.tau * param.data + (1 - self.tau) * target_param.data)

# trading env

In [14]:
%cd /content/drive/MyDrive/UIT/Mạng Neural và Giải thuật di truyền/Project/CEMRL

/content/drive/MyDrive/UIT/Mạng Neural và Giải thuật di truyền/Project/CEMRL


In [15]:
!git clone https://github.com/notadamking/Stock-Trading-Environment

fatal: destination path 'Stock-Trading-Environment' already exists and is not an empty directory.


In [16]:
%cd /content/drive/MyDrive/UIT/Mạng Neural và Giải thuật di truyền/Project/CEMRL/Stock-Trading-Environment

/content/drive/MyDrive/UIT/Mạng Neural và Giải thuật di truyền/Project/CEMRL/Stock-Trading-Environment


# Run

In [17]:
# import gym_anytrading
from env.StockTradingEnv import StockTradingEnv
# del StockTradingEnv

In [18]:
df = pd.read_csv('/content/drive/MyDrive/UIT/Mạng Neural và Giải thuật di truyền/Project/CEMRL/Stock-Trading-Environment/data/AAPL.csv')
df['Date'] = pd.to_datetime(df['Date'])
# df.dtypes
# df.set_index('Date', inplace=True)
df.tail()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume
5250,5250,2018-11-12,199.0,199.85,193.79,194.17,51135518.0
5251,5251,2018-11-13,191.63,197.18,191.4501,192.23,46882936.0
5252,5252,2018-11-14,193.9,194.48,185.93,186.8,60800957.0
5253,5253,2018-11-15,188.39,191.97,186.9,191.41,46478801.0
5254,5254,2018-11-16,190.5,194.9695,189.46,193.53,36186440.0


In [19]:
env = StockTradingEnv(df)




In [20]:
state_dim = env.observation_space.shape[0] * env.observation_space.shape[1]
action_dim = env.action_space.shape[0]
max_action = int(env.action_space.high[0])


In [21]:
memory = Memory(args.mem_size, state_dim, action_dim)

In [22]:
critic = Critic(state_dim, action_dim, max_action, args)
critic_t = Critic(state_dim, action_dim, max_action, args)

In [23]:
actor = Actor(state_dim, action_dim, max_action, args)
actor_t = Actor(state_dim, action_dim, max_action, args)

In [24]:
a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)

In [25]:
if USE_CUDA:
    print("Use Cuda")
    critic.cuda()
    critic_t.cuda()
    actor.cuda()
    actor_t.cuda()

# CEM
es = sepCEM(actor.get_size(), 
            mu_init=actor.get_params(), 
            sigma_init=args.sigma_init, 
            damp=args.damp, 
            damp_limit=args.damp_limit, 
            pop_size=args.pop_size, 
            antithetic=not args.pop_size % 2, parents=args.pop_size // 2, elitism=args.elitism)

Use Cuda


In [26]:
step_cpt = 0
total_steps = 0
actor_steps = 0
df = pd.DataFrame(columns=["total_steps", "average_score",
                            "average_score_rl", "average_score_ea", "best_score"])

In [None]:
while total_steps < args.max_steps:

    fitness = []
    fitness_ = []
    es_params = es.ask(args.pop_size)

    # udpate the rl actors and the critic
    if total_steps > args.start_steps:

        for i in range(args.n_grad):

            # set params
            actor.set_params(es_params[i])
            actor_t.set_params(es_params[i])
            actor.optimizer = torch.optim.Adam(
                actor.parameters(), lr=args.actor_lr)

            # critic update
            for _ in tqdm(range(actor_steps // args.n_grad)):
                critic.update(memory, args.batch_size, actor, critic_t)

            # actor update
            for _ in tqdm(range(actor_steps)):
                actor.update(memory, args.batch_size,
                                critic, actor_t)

            # get the params back in the population
            es_params[i] = actor.get_params()
    actor_steps = 0

    # evaluate noisy actor(s)
    for i in range(args.n_noisy):
        actor.set_params(es_params[i])
        f, steps = evaluate(actor, env, memory=memory, n_episodes=args.n_episodes, render=args.render, noise=a_noise)
        actor_steps += steps
        prCyan('Noisy actor {} fitness:{}'.format(i, f))

    # evaluate all actors
    for params in es_params:

        actor.set_params(params)
        f, steps = evaluate(actor, env, memory=memory, n_episodes=args.n_episodes, render=args.render)
        actor_steps += steps
        fitness.append(f)

        # print scores
        prLightPurple('Actor fitness:{}'.format(f))

    # update es
    es.tell(es_params, fitness)

    # update step counts
    total_steps += actor_steps
    step_cpt += actor_steps

    # save stuff
    if step_cpt >= args.period:

        # evaluate mean actor over several runs. Memory is not filled
        # and steps are not counted
        actor.set_params(es.mu)
        f_mu, _ = evaluate(actor, env, memory=None, n_episodes=args.n_eval,
                            render=args.render)
        prRed('Actor Mu Average Fitness:{}'.format(f_mu))

        df.to_pickle(args.output + "/log.pkl")
        res = {"total_steps": total_steps,
                "average_score": np.mean(fitness),
                "average_score_half": np.mean(np.partition(fitness, args.pop_size // 2 - 1)[args.pop_size // 2:]),
                "average_score_rl": np.mean(fitness[:args.n_grad]),
                "average_score_ea": np.mean(fitness[args.n_grad:]),
                "best_score": np.max(fitness),
                "mu_score": f_mu}

        if args.save_all_models:
            os.makedirs(args.output + "/{}_steps".format(total_steps),
                        exist_ok=True)
            critic.save_model(
                args.output + "/{}_steps".format(total_steps), "critic")
            actor.set_params(es.mu)
            actor.save_model(
                args.output + "/{}_steps".format(total_steps), "actor_mu")
        else:
            critic.save_model(args.output, "critic")
            actor.set_params(es.mu)
            actor.save_model(args.output, "actor")
        df = df.append(res, ignore_index=True)
        step_cpt = 0
        print(res)

    print("Total steps", total_steps)



[94mActor fitness:193299.44425220048[00m
[94mActor fitness:1105954.6379850004[00m
[94mActor fitness:3989.8767462166393[00m
[94mActor fitness:60471.880861721525[00m
[94mActor fitness:35265.53269394196[00m
[94mActor fitness:20265.469026603372[00m
[94mActor fitness:25991.28825629195[00m
[94mActor fitness:21652.68408475084[00m
[94mActor fitness:98552.62403664511[00m
[94mActor fitness:23968.671349353823[00m
[0.00100398 0.00126551 0.00099924 ... 0.00144615 0.00125553 0.00105746]
[91mActor Mu Average Fitness:114770.35930721062[00m
{'total_steps': 47277, 'average_score': 158941.2109292726, 'average_score_half': 298708.82396590186, 'average_score_rl': 279796.2745078161, 'average_score_ea': 38086.14735072902, 'best_score': 1105954.6379850004, 'mu_score': 114770.35930721062}
Total steps 47277


100%|██████████| 9455/9455 [00:39<00:00, 239.58it/s]
100%|██████████| 47277/47277 [03:24<00:00, 230.64it/s]
100%|██████████| 9455/9455 [00:41<00:00, 230.29it/s]
100%|██████████| 47277/47277 [03:14<00:00, 243.21it/s]
100%|██████████| 9455/9455 [00:37<00:00, 251.04it/s]
100%|██████████| 47277/47277 [03:36<00:00, 218.61it/s]
100%|██████████| 9455/9455 [00:38<00:00, 248.36it/s]
100%|██████████| 47277/47277 [03:31<00:00, 223.85it/s]
100%|██████████| 9455/9455 [00:38<00:00, 242.72it/s]
100%|██████████| 47277/47277 [03:15<00:00, 241.74it/s]


[94mActor fitness:113.92291497949694[00m
[94mActor fitness:113.90846661473444[00m
[94mActor fitness:-1588.3456906219344[00m
[94mActor fitness:113.95252121159118[00m
[94mActor fitness:3514.0205413655194[00m
[94mActor fitness:2107.2112110853545[00m
[94mActor fitness:35269.168236479214[00m
[94mActor fitness:41221.71350532527[00m
[94mActor fitness:14725.009621807369[00m
[94mActor fitness:464.2987920451782[00m
[0.00108084 0.00107813 0.00131796 ... 0.14634938 0.00110256 0.02896247]
[91mActor Mu Average Fitness:-2188.031492099604[00m
{'total_steps': 57978, 'average_score': 9605.48601202918, 'average_score_half': 19367.424623212548, 'average_score_rl': 453.4917507098815, 'average_score_ea': 18757.48027334848, 'best_score': 41221.71350532527, 'mu_score': -2188.031492099604}
Total steps 57978


100%|██████████| 2140/2140 [00:08<00:00, 247.71it/s]
100%|██████████| 10701/10701 [00:42<00:00, 248.95it/s]
100%|██████████| 2140/2140 [00:08<00:00, 250.31it/s]
100%|██████████| 10701/10701 [01:00<00:00, 177.23it/s]
100%|██████████| 2140/2140 [00:08<00:00, 261.93it/s]
100%|██████████| 10701/10701 [00:42<00:00, 249.14it/s]
100%|██████████| 2140/2140 [00:08<00:00, 255.95it/s]
100%|██████████| 10701/10701 [00:40<00:00, 265.81it/s]
100%|██████████| 2140/2140 [00:08<00:00, 264.74it/s]
100%|██████████| 10701/10701 [00:40<00:00, 265.37it/s]
  prev_cost + additional_cost) / (self.shares_held + shares_bought)


[94mActor fitness:488142.0538782354[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:11350.748723180051[00m
[94mActor fitness:16612.81565234455[00m
[94mActor fitness:11629.216032497465[00m
[94mActor fitness:446.75298373379667[00m
[94mActor fitness:-1422.4047523166369[00m
[0.00110146 0.00164202 0.001434   ... 0.02504692 0.00117624 0.00734977]
[91mActor Mu Average Fitness:6889312.5[00m
{'total_steps': 102106, 'average_score': 2808400.9182517673, 'average_score_half': 5609078.4107756475, 'average_score_rl': 5609078.4107756475, 'average_score_ea': 7723.425727887845, 'best_score': 6889312.5, 'mu_score': 6889312.5}
Total steps 102106


100%|██████████| 8825/8825 [00:32<00:00, 267.85it/s]
100%|██████████| 44128/44128 [02:46<00:00, 264.93it/s]
100%|██████████| 8825/8825 [00:34<00:00, 258.75it/s]
100%|██████████| 44128/44128 [02:52<00:00, 255.52it/s]
100%|██████████| 8825/8825 [00:33<00:00, 259.71it/s]
100%|██████████| 44128/44128 [02:51<00:00, 257.36it/s]
100%|██████████| 8825/8825 [00:33<00:00, 266.68it/s]
100%|██████████| 44128/44128 [02:48<00:00, 262.42it/s]
100%|██████████| 8825/8825 [00:34<00:00, 252.58it/s]
100%|██████████| 44128/44128 [02:45<00:00, 266.89it/s]


[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:551.2746755010025[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[94mActor fitness:6889312.5[00m
[0.00880868 0.0076386  0.00759963 ... 0.01138739 0.00105706 0.00104953]
[91mActor Mu Average Fitness:6889312.5[00m
{'total_steps': 149618, 'average_score': 6200436.37746755, 'average_score_half': 6889312.5, 'average_score_rl': 5511560.2549351, 'average_score_ea': 6889312.5, 'best_score': 6889312.5, 'mu_score': 6889312.5}
Total steps 149618


100%|██████████| 9502/9502 [00:35<00:00, 268.28it/s]
100%|██████████| 47512/47512 [02:55<00:00, 269.96it/s]
100%|██████████| 9502/9502 [00:35<00:00, 270.27it/s]
 13%|█▎        | 6383/47512 [00:23<02:31, 272.05it/s]