# Imports 

In [1]:
import time
import copy
import numpy as np, random
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from math import sin,pi
import matplotlib.pyplot as plt
import gym
from copy import deepcopy
from gym import spaces
import collections
import tensorflow_probability as tfp




# Create Environment #

### stockMEnv ###

In [2]:
run stockMarketEnvironmentREDUX

In [3]:
# Create one environment and keep the observation returned by reset().
# NOTE(review): stockMEnv is presumably defined by the script run in the previous cell — confirm.
env = stockMEnv()
observation = env.reset()
# Uncomment to inspect the initial observation:
#print(observation)

In [4]:
# Take a single step with action 1; env.step returns a 5-tuple.
# `observation` is overwritten here and later cells read its shape to build the network.
# NOTE(review): the agent classes below unpack the 4th/5th items as (cash, _) instead of (info, cash) — confirm the actual order.
observation, reward, done, info, cash = env.step(1)

# Create Actor-Critic model

In [5]:
class PositionEmbedding(tf.keras.layers.Layer):
    """Add a learned position embedding to every time step of a sequence.

    Args:
        maxlen: Number of rows in the position-embedding table. Positions
            0..seq_len-1 are looked up, so the input's axis-1 length should
            not exceed this value.
        embed_dim: Size of each position vector. It is added to the input,
            so it must match the size of the input's last axis.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, maxlen, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Kept so that get_config() can rebuild the layer.
        self.maxlen = maxlen
        self.embed_dim = embed_dim
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the embedding of each position index along axis 1."""
        seq_len = tf.shape(x)[1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        # The (seq_len, embed_dim) embedding broadcasts over the batch axis,
        # so no explicit repeat per batch element is needed.
        return x + self.pos_emb(positions)

    def get_config(self):
        """Return the constructor arguments so Keras can serialize/clone the layer."""
        config = super().get_config()
        config.update({"maxlen": self.maxlen, "embed_dim": self.embed_dim})
        return config

In [6]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one encoder block: self-attention then a position-wise feed-forward part.

    Each half is followed by dropout/layer normalization and wrapped in a
    residual connection. The output has the same last-axis size as ``inputs``.

    Args:
        inputs: Keras tensor the block is applied to.
        head_size: ``key_dim`` of the multi-head attention layer.
        num_heads: Number of attention heads.
        ff_dim: Number of filters of the first kernel-size-1 convolution.
        dropout: Dropout rate used inside attention and in both Dropout layers.
    """
    channels = inputs.shape[-1]

    # --- self-attention half ---
    attention = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended = attention(inputs, inputs)
    attended = tf.keras.layers.Dropout(dropout)(attended)
    attended = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attended)
    attention_out = attended + inputs

    # --- feed-forward half (1x1 convolutions act on each time step) ---
    expand = tf.keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")
    hidden = expand(attention_out)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)
    project = tf.keras.layers.Conv1D(filters=channels, kernel_size=1)
    hidden = project(hidden)
    hidden = tf.keras.layers.LayerNormalization(epsilon=1e-6)(hidden)
    return hidden + attention_out

In [7]:
# Lapan recommends putting the actor and the critic in the same network so that
# they share low-level features. The critic's value acts as a state-dependent
# baseline V(s) that stabilises the gradient. In REINFORCE the baseline was a
# single value used for every case; the improvement here is that it depends on
# each state.
def AC(obs_shape,hidden_size,actions_n,head_size=64,num_heads=2,ff_dim=2,num_transformer_blocks=2,dropout=0.4):
    """Build the shared actor-critic Keras model.

    The observation goes through a position embedding, a stack of transformer
    encoder blocks, a flatten layer and three ReLU dense layers. Two heads are
    attached to the last dense layer.

    Args:
        obs_shape: Shape of one observation as ``(time_steps, features)``
            (without the batch axis).
        hidden_size: Units of each of the three shared dense layers.
        actions_n: Number of discrete actions (size of the softmax head).
        head_size, num_heads, ff_dim, dropout: Forwarded to
            ``transformer_encoder``.
        num_transformer_blocks: How many encoder blocks are stacked.

    Returns:
        A ``tf.keras.Model`` whose outputs are ``[action_probabilities, value]``.
    """
    flatten = tf.keras.layers.Flatten()
    dense1 = tf.keras.layers.Dense(units=hidden_size, activation='relu')
    dense2 = tf.keras.layers.Dense(units=hidden_size, activation='relu')
    dense3 = tf.keras.layers.Dense(units=hidden_size, activation='relu')
    dense_action = tf.keras.layers.Dense(units=actions_n, activation='softmax')
    dense_value = tf.keras.layers.Dense(units=1, activation='linear')

    # The batch axis is left out of the input shape.
    input_model = tf.keras.Input(shape=obs_shape)

    # Size the position embedding from the function's own argument rather than
    # from notebook-level globals, so the model always matches obs_shape.
    x = PositionEmbedding(maxlen=obs_shape[0], embed_dim=obs_shape[1])(input_model)
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)
    x = flatten(x)
    x = dense1(x)
    x = dense2(x)
    x = dense3(x)
    action = dense_action(x)
    value = dense_value(x)
    return tf.keras.Model(inputs=input_model,outputs=[action,value])


# Hyper-parameters and training objects shared by the cells below.
hidden_size = 100                       # units of each shared dense layer
gamma = 0.99                            # discount applied to future rewards
AC_net_weights_file = 'AC_REDUX_MW.h5'  # where the network weights are saved

ac = AC(
    obs_shape=observation.shape,
    hidden_size=hidden_size,
    actions_n=env.nActions,
)

# Choose the learning rate carefully: the loss tends to explode.
opt = tf.keras.optimizers.Adam(learning_rate=6e-6, epsilon=1e-3)

In [8]:
# Print the layer-by-layer summary of the actor-critic model built above.
ac.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 61, 2)]      0           []                               
                                                                                                  
 position_embedding (PositionEm  (None, 61, 2)       122         ['input_1[0][0]']                
 bedding)                                                                                         
                                                                                                  
 multi_head_attention (MultiHea  (None, 61, 2)       1410        ['position_embedding[0][0]',     
 dAttention)                                                      'position_embedding[0][0]']     
                                                                                              

# Create Agent

In [9]:
class Agent():
    """Single-environment agent that samples actions from the actor-critic policy.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` (returning a
            5-tuple) and ``render()``.
        model: Optional callable mapping a batch of states to
            ``(action_probabilities, state_values)``. When omitted, the
            notebook-level ``ac`` network is looked up at call time.
    """

    def __init__(self, env, model=None):
        self.env = env
        self.model = model
        self._reset()
        self.render = False

    def _reset(self):
        """Reset the agent's own environment and switch rendering off."""
        self.obs = self.env.reset()
        self.render = False

    def play_step(self, state):
        """Sample one action for ``state`` and apply it to the environment.

        When the step ends the episode, the environment is rendered (only if
        ``self.render`` is set) and then reset.

        Returns:
            ``(reward, next_state, action, done)``.
        """
        policy = self.model if self.model is not None else ac
        # The network has two heads; only the action probabilities are needed here.
        probs, _ = policy(np.array([state]))
        prob = np.asarray(probs)[0]
        action = np.random.choice(a=len(prob), p=prob)
        next_state, reward, done, cash, _ = self.env.step(action)

        if done:
            if self.render:
                self.env.render()
            self._reset()
        return reward, next_state, action, done

In [10]:
class multiEnvAgent:
    """Agent that steps several environments in lock-step with one policy call.

    Args:
        envs: Sequence of environments, each exposing ``reset()``,
            ``step(action)`` (returning a 5-tuple) and ``render()``.
        model: Optional callable mapping a batch of states to
            ``(action_probabilities, state_values)``. When omitted, the
            notebook-level ``ac`` network is looked up at call time.
    """

    def __init__(self, envs, model=None):
        self.envs = envs
        self.numEnvs = len(envs)
        self.model = model
        self.render = False
        self.rewards = np.zeros(self.numEnvs)
        self.obs = np.array([env.reset() for env in self.envs])

    def _reset(self):
        """Reset every environment and switch rendering off."""
        self.obs = np.array([env.reset() for env in self.envs])
        self.render = False

    def play_step(self, states):
        """Sample and apply one action per environment.

        Args:
            states: Array holding the current state of ``envs[i]`` at index ``i``.

        Returns:
            ``(rewards, next_states, actions, values, done)``. ``done`` is the
            flag reported by the last environment.
        """
        policy = self.model if self.model is not None else ac
        # Action probabilities for each env, shape (num_envs, num_actions).
        probs, values = policy(states)
        probs = np.array(probs)
        values = np.reshape(values, (values.shape[0],))  # (num_envs, 1) -> (num_envs,)
        actions = np.array(
            [np.random.choice(a=len(prob), p=prob) for prob in probs]
        ).astype(np.int32)

        # self.obs only serves as a buffer with the right shape and dtype.
        next_states = self.obs.copy()
        rewards = np.zeros(self.numEnvs)
        done = False
        # Use the agent's own environments rather than notebook-level globals.
        for i, single_env in enumerate(self.envs):
            next_states[i], rewards[i], done, cash, _ = single_env.step(actions[i])

        # NOTE(review): assumes all environments finish on the same step — confirm.
        if done:
            if self.render:
                self.envs[0].render()
            self._reset()
        return rewards, next_states, actions, values, done

In [11]:
# Number of environments stepped in parallel, and the environments themselves.
numEnvs = 20
envs = [stockMEnv() for _ in range(numEnvs)]

# Train Functions

In [12]:
import pdb

In [13]:
def _discounted_returns(rewards, discount):
    """Return the discounted reward-to-go along axis 0 (time) for each column."""
    rewards = np.asarray(rewards, dtype=np.float32)
    returns = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[1:], dtype=np.float32)
    # Walk backwards in time so each step accumulates everything after it.
    for t in range(rewards.shape[0] - 1, -1, -1):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns


def train_ac_multiEnvs(states, rewards, actions,values):
    """Run one actor-critic gradient step on a batch of parallel episodes.

    Args:
        states: Array whose first axis is the time step, second axis the
            environment, and last two axes one observation.
        rewards: Array of shape ``(time_steps, num_envs)``.
        actions: Integer array of shape ``(time_steps, num_envs)``;
            ``actions[t, e]`` must be the action taken in ``states[t, e]``.
        values: Critic estimates recorded while playing. Accepted for interface
            compatibility only: the critic is re-evaluated under the gradient
            tape so that the value head receives a gradient.

    Side effects:
        Applies one optimizer update to ``ac`` and saves its weights to
        ``AC_net_weights_file``.
    """
    returns = _discounted_returns(rewards, gamma).reshape(-1)
    states = np.reshape(states, (-1, states.shape[-2], states.shape[-1]))
    actions = np.reshape(actions, -1).astype(np.int32)

    with tf.GradientTape() as tape:
        probs, critic_values = ac(states)
        critic_values = tf.reshape(critic_values, [-1])
        returns_t = tf.convert_to_tensor(returns, dtype=critic_values.dtype)

        # Clip before the log so a zero probability cannot produce -inf/NaN.
        log_probs = tf.math.log(tf.clip_by_value(probs, 1e-10, 1.0))
        taken_log_probs = tf.gather(log_probs, actions, axis=1, batch_dims=1)

        advantage = returns_t - critic_values
        # Policy loss: REINFORCE with a state-dependent baseline. The advantage
        # is treated as a constant so this term only trains the actor.
        policy_loss = -taken_log_probs * tf.stop_gradient(advantage)
        # Critic loss: squared error between the return and V(s).
        critic_loss = tf.square(advantage)
        # Entropy bonus (beta = 0.001) to encourage exploration: minimizing
        # sum(p * log p) maximizes the entropy of the action distribution.
        neg_entropy = tf.reduce_sum(probs * log_probs, axis=1)

        # Summed over the batch; tape.gradient sums non-scalar targets anyway,
        # so this only makes the reduction explicit.
        loss = tf.reduce_sum(policy_loss + critic_loss + 0.001 * neg_entropy)

    grads = tape.gradient(loss, ac.trainable_variables)
    opt.apply_gradients(zip(grads, ac.trainable_variables))
    ac.save_weights(AC_net_weights_file)

In [14]:
def play_demo_episode(env, single_env_agent):
    """Play one full episode with ``single_env_agent``, starting from ``env.reset()``.

    The agent is stepped until its ``play_step`` reports that the episode is
    done. Nothing is returned.
    """
    current_state = env.reset()
    while True:
        _, current_state, _, finished = single_env_agent.play_step(current_state)
        if finished:
            break

# Train

In [15]:
# Training-loop configuration.
EPOCH_NUM = 500       # the training loop below runs while EPOCH_NUM > epoch
epoch = 0             # epoch counter, incremented at the start of each iteration
SHOW_LOG_FREQ = 100   # a demo episode is played when (epoch + 1) % SHOW_LOG_FREQ == 0

In [None]:
# Multi-environment actor-critic training loop.
# EPOCH_NUM and SHOW_LOG_FREQ come from the configuration cell above.
magent = multiEnvAgent(envs)
single_env_agent = Agent(env)
lenEpisode = env.lenEpisode - env.number_past_steps + 1 #141
numActions = 19
epoch = 0

while epoch < EPOCH_NUM:
    epoch += 1
    print(epoch, end=' ')

    if (epoch + 1) % SHOW_LOG_FREQ == 0:
        # Periodically play (and render) one demo episode instead of training.
        # The flag goes on the agent that actually plays the demo.
        single_env_agent.render = True
        play_demo_episode(env, single_env_agent)
        single_env_agent.render = False
        continue

    state = np.array([magent.envs[i].reset() for i in range(magent.numEnvs)])

    # Per-episode buffers, one entry per time step. Entry t holds the state the
    # agent saw at step t together with the action, reward and value for that
    # state, so the four arrays are aligned. Fresh lists each epoch also keep
    # data from a previous episode out of the batch.
    episode_states, episode_rewards = [], []
    episode_actions, episode_values = [], []

    done = False
    while not done:
        step_rewards, next_state, step_actions, step_values, done = magent.play_step(state)
        episode_states.append(state)
        episode_rewards.append(step_rewards)
        episode_actions.append(step_actions)
        episode_values.append(step_values)
        state = next_state

    # Arrays are time-major: (time_steps, num_envs, ...).
    train_ac_multiEnvs(
        np.array(episode_states),
        np.array(episode_rewards),
        np.array(episode_actions, dtype=np.int32),
        np.array(episode_values, dtype=np.float32),
    )

1 

In [None]:
import pdb

In [None]:
# Scratch check: natural log of 1e-10.
# NOTE(review): presumably used to judge how negative log(prob) becomes for tiny probabilities — confirm or delete this cell.
np.log(1.e-10)
