In [None]:
import copy
import datetime
import math
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
from IPython.display import clear_output
from keras_radam import RAdam
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers

In [None]:
# --- Run control -----------------------------------------------------------
TRAIN = True          # when False the loop still collects data but skips fitting
MAX_EP = 1000         # number of episodes to run
EP_LENGTH = 400       # environment steps per episode
ACTION_INTERVAL = 1   # query the actor once every ACTION_INTERVAL steps

# --- PPO hyperparameters ---------------------------------------------------
GAMMA = 0.99              # discount applied to future reward (author's note: raise it?)
EPSILON = 0.2             # probability ratio is clipped to [1 - EPSILON, 1 + EPSILON]
ENTROPY_WHEIGHT = 0.01    # weight of the policy-entropy term in the actor loss
BATCH = 32                # base unit for how often a learning update is triggered
ACTOR_LR = 1e-5           # actor learning rate
CRITIC_LR = 2e-5          # critic learning rate
ACTOR_EPOCH = 10          # fit epochs per actor update
CRITIC_EPOCH = 10         # fit epochs per critic update

# --- Environment -----------------------------------------------------------
env = gym.make("Pendulum-v0")
SampleState = env.reset()  # one observation, used only to measure its length

STATE_SIZE = len(SampleState)
ACTION_SIZE = 1
CTN_ACTION_RANGE = 2       # actions are kept within [-CTN_ACTION_RANGE, CTN_ACTION_RANGE]

print("state Size = ",STATE_SIZE)
print("state Sample = ",SampleState)
print("CTN_ACTION_RANGE = ",CTN_ACTION_RANGE)

In [None]:
class buffer(object):
    """Rollout storage made of three parallel lists: states, actions, rewards."""

    def __init__(self):
        # Same initial state as a freshly cleared buffer.
        self.clearBuffer()

    def clearBuffer(self):
        """Start over with brand-new lists.

        The attributes are rebound rather than emptied in place, so a list
        previously handed out by a getter keeps its contents.
        """
        self.states, self.actions, self.rewards = [], [], []

    # ---- read access ----
    def getStates(self):
        """Return the stored states (the live list, not a copy)."""
        return self.states

    def getActions(self):
        """Return the stored actions (the live list, not a copy)."""
        return self.actions

    def getRewards(self):
        """Return the stored rewards (the live list, not a copy)."""
        return self.rewards

    # ---- write access ----
    def saveState(self, state):
        """Append one state."""
        self.states.append(state)

    def saveAction(self, action):
        """Append one action."""
        self.actions.append(action)

    def saveReward(self, reward):
        """Append one reward."""
        self.rewards.append(reward)

    def saveBuffers(self, state, action, reward):
        """Append one (state, action, reward) transition in a single call."""
        targets = (self.states, self.actions, self.rewards)
        for store, item in zip(targets, (state, action, reward)):
            store.append(item)


class PPO(object):
    """Clipped-surrogate PPO agent with a Gaussian policy over one continuous action.

    The actor network outputs ``[mu, sigma]`` per state; the critic outputs a
    scalar state value. A second actor copy (``oldactor``) holds the policy
    used to compute the probability ratio during an update.
    """

    def __init__(self,stateSize,actionSize,actionRange,criticLR,actorLR,gamma,epsilon,entropyWeight,loadNN,saveDir,loadDir):
        """Store hyperparameters and either build fresh networks or load saved ones.

        Args:
            stateSize: length of one state vector (network input width).
            actionSize: stored on the instance; the network builders in this
                class always emit a single action dimension.
            actionRange: actions and ``mu`` are limited to ``[-actionRange, actionRange]``.
            criticLR: critic learning rate.
            actorLR: actor learning rate.
            gamma: reward discount factor.
            epsilon: clip half-width for the probability ratio.
            entropyWeight: weight of the entropy bonus in the actor loss.
            loadNN: if truthy, load networks from ``loadDir`` instead of building.
            saveDir: path prefix under which ``saveNN`` writes checkpoints.
            loadDir: directory holding ``actor.h5`` and ``critic.h5`` (used only
                when ``loadNN`` is truthy).
        """
        self.stateSize = stateSize
        self.actionSize = actionSize
        self.actionRange = actionRange
        self.criticLR = criticLR
        self.actorLR = actorLR
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.saveDir = saveDir
        self.entropyWeight = entropyWeight

        if loadNN:
            # load Model
            self.actor,self.oldactor,self.critic = self.loadNN(loadDir)
        else:
            # critic NN
            self.critic = self.buildCriticNet(self.stateSize,1)

            # actor & oldActor NN
            self.oldactor = self.buildActorNet(self.stateSize,self.actionRange)
            self.actor = self.buildActorNet(self.stateSize,self.actionRange)

    # Compile helpers (shared by the builders and by loadNN)
    def _compileActor(self,model):
        """Compile an actor model with RAdam and the PPO actor loss; return it."""
        model.compile(optimizer = RAdam(self.actorLR),loss = self.aLoss())
        return model

    def _compileCritic(self,model):
        """Compile a critic model with Adam and the critic loss; return it."""
        model.compile(optimizer = optimizers.Adam(learning_rate=self.criticLR),loss = self.cLoss())
        return model

    # Build Net
    def buildActorNet(self,inputSize,continuousActionRange):
        """Build and compile the actor: state -> concatenated ``[mu, sigma]``.

        Args:
            inputSize: width of the state input.
            continuousActionRange: scale applied to the tanh output so that
                ``mu`` lies in ``[-continuousActionRange, continuousActionRange]``.

        Returns:
            A compiled Keras model whose output has two columns: mu, sigma.
        """
        stateInput = layers.Input(shape = (inputSize,),name ='stateInput')

        dense1 = layers.Dense(200,activation='relu',name = 'dense1',)(stateInput)
        dense2 = layers.Dense(50,activation='relu',name = 'dense2')(dense1)
        # mu: mean of the normal distribution, squashed by tanh then scaled
        mu = continuousActionRange * layers.Dense(1,activation='tanh',name = 'muOut')(dense2)
        # sigma: standard deviation of the normal distribution (softplus keeps it non-negative)
        sigma = layers.Dense(1,activation='softplus',name = 'sigmaOut')(dense2)
        muSig = layers.concatenate([mu,sigma],name = 'muSigOut')
        model = keras.Model(inputs = stateInput,outputs = muSig)
        return self._compileActor(model)

    def buildCriticNet(self,inputSize,outputSize):
        """Build and compile the critic: state -> value estimate.

        Args:
            inputSize: width of the state input.
            outputSize: width of the linear output layer.

        Returns:
            A compiled Keras model.
        """
        stateInput = keras.Input(shape = (inputSize,))
        dense1 = layers.Dense(200,activation='relu')(stateInput)
        dense2 = layers.Dense(50,activation='relu')(dense1)
        output = layers.Dense(outputSize)(dense2)
        model = keras.Model(inputs = stateInput,outputs = output)
        return self._compileCritic(model)

    # loss Function
    def cLoss(self):
        """Return the critic loss: mean squared error between target and prediction."""
        def loss(y_true, y_pred):
            # y_true: discounted returns
            # y_pred: critic output for the same states
            advantage = y_true - y_pred
            return tf.reduce_mean(tf.square(advantage))
        return loss

    def aLoss(self):
        """Return the PPO clipped-surrogate actor loss with an entropy bonus.

        The returned function expects ``y_true`` rows packed as
        ``[action, advantage, old policy density]`` (see ``trainActor``) and
        ``y_pred`` rows as ``[mu, sigma]``.
        """
        def loss(y_true,y_pred):
            actions = y_true[:,0] # shape : (length,)
            advantage = y_true[:,1] # shape : (length,)
            oldpiProb = y_true[:,2] # shape : (length,)

            dist = tfp.distributions.Normal(y_pred[:,0],y_pred[:,1])
            piProb = dist.prob(actions)

            # small constant guards against division by a zero old density
            ratio = piProb/(oldpiProb+1e-6)
            surr = ratio * advantage
            clipValue = tf.clip_by_value(ratio,1. - self.EPSILON,1. + self.EPSILON ) * advantage

            entropy = tf.reduce_mean(dist.entropy())

            # Keras minimises this value, so both terms to be maximised (the
            # surrogate objective and the entropy bonus) enter with a minus sign.
            return -tf.reduce_mean(tf.minimum(surr,clipValue)) - self.entropyWeight * entropy
        return loss

    # get Action&V
    def chooseAction(self,state):
        """Sample an action from the actor's normal distribution for ``state``.

        Args:
            state: input passed straight to the actor; only the first row of
                the actor output is used.

        Returns:
            ``(action, mu, sigma)`` where ``action`` is the sample clipped to
            ``[-actionRange, actionRange]``.

        Raises:
            FloatingPointError: if the actor produced NaN for mu or sigma.
        """
        muSigma = self.actor(state) # get mu & sigma
        mu = muSigma[0][0]
        sigma = muSigma[0][1]
        if math.isnan(mu) or math.isnan(sigma):
            # Fail immediately: continuing would only feed NaN actions to the environment.
            print("mu or sigma is nan")
            raise FloatingPointError("mu or sigma is nan")
        sample = np.random.normal(loc=mu,scale=sigma) # draw one action from the normal distribution
        action = np.clip(sample,-self.actionRange,self.actionRange)
        return action,mu,sigma

    def getCriticV(self,state):
        """Return the critic's prediction for ``state`` (no training)."""
        return self.critic.predict(state)

    # Other
    def discountReward(self,nextState,rewards):
        """Compute discounted returns bootstrapped from the critic's value of ``nextState``.

        Args:
            nextState: the single state following the last reward; its critic
                value seeds the backward recursion.
            rewards: sequence of rewards in time order.

        Returns:
            float32 array of shape ``(len(rewards), 1)``; works for any
            length, including 0 and 1.
        """
        # nextState is expected to be one state, so the critic output has one element.
        running = float(np.squeeze(self.getCriticV(nextState)))
        discountedRewards = []
        for r in rewards[::-1]:
            running = r + self.GAMMA*running
            discountedRewards.append(running)
        discountedRewards.reverse()
        # reshape (rather than squeeze + expand_dims) keeps the column shape
        # even when there is only one reward.
        return np.asarray(discountedRewards,dtype=np.float32).reshape(-1,1)

    def distProb(self,mu,sig,x):
        """Evaluate the Normal(mu, sig) density at ``x``, element-wise.

        Inputs are flattened before evaluation.

        Returns:
            Array of shape ``(size of x, 1)``.
        """
        mu = np.reshape(mu,(np.size(mu),))
        sig = np.reshape(sig,(np.size(sig),))
        x = np.reshape(x,(np.size(x),))

        dist = tfp.distributions.Normal(mu,sig)
        prob = dist.prob(x)
        return np.reshape(prob,(np.size(x),1))

    # Train Functions
    def trainCritcActor(self,states,actions,rewards,nextState,criticEpochs,actorEpochs):
        """Run one critic update followed by one actor update on a collected batch.

        Args:
            states: buffered states.
            actions: buffered actions.
            rewards: buffered rewards, not yet discounted.
            nextState: the single state following the last buffered step.
            criticEpochs: fit epochs for the critic.
            actorEpochs: fit epochs for the actor.

        Returns:
            ``(actorMeanLoss, criticMeanLoss)`` averaged over the fit epochs.
        """
        discountedR = self.discountReward(nextState,rewards)

        # The critic is fitted first, so the actor's advantages use the updated critic.
        criticMeanLoss = self.trainCritic(states,discountedR,criticEpochs)
        actorMeanLoss = self.trainActor(states,actions,discountedR,actorEpochs)
        print("A_Loss:",actorMeanLoss,"C_Loss:",criticMeanLoss)
        return actorMeanLoss,criticMeanLoss

    def trainCritic(self,states,discountedR,epochs):
        """Fit the critic towards the discounted returns.

        Args:
            states: buffered states.
            discountedR: discounted returns, one row per state.
            epochs: number of fit epochs.

        Returns:
            Mean of the per-epoch losses.
        """
        # Keras needs x and y of matching container kinds: a Python list for x
        # with an ndarray for y fails with "Failed to find data adapter".
        # Converting both to arrays avoids that.
        states = np.asarray(states,dtype=np.float32)
        discountedR = np.asarray(discountedR,dtype=np.float32)
        his = self.critic.fit(x = states,y = discountedR,epochs=epochs,verbose = 0)
        return np.mean(his.history['loss'])

    def trainActor(self,states,actions,discountedR,epochs):
        """Fit the actor with the clipped-surrogate loss.

        Args:
            states: buffered states.
            actions: buffered actions, one per state.
            discountedR: discounted returns, shape ``(n, 1)``.
            epochs: number of fit epochs.

        Returns:
            Mean of the per-epoch losses.
        """
        states = np.asarray(states,dtype=np.float32)
        actions = np.asarray(actions,dtype=np.float32).reshape(-1,1)

        # density of the taken actions under the old (pre-update) policy
        oldPiMuSig = self.oldactor.predict(states)
        oldPiProb = self.distProb(oldPiMuSig[:,0],oldPiMuSig[:,1],actions)

        criticV = self.critic.predict(states)
        advantage = discountedR - criticV

        # pack [[action, advantage, oldPiProb]] as y_true for aLoss
        y_true = np.concatenate([actions,advantage,oldPiProb],axis=1)

        his = self.actor.fit(x = states,y = y_true,epochs = epochs,verbose = 0)
        return np.mean(his.history['loss'])

    def updateOldActor(self):
        """Copy the current actor weights into the old actor."""
        self.oldactor.set_weights(self.actor.get_weights())

    # save&load
    def saveNN(self,score):
        """Save actor and critic as HDF5 files in one timestamped directory.

        The directory is ``saveDir + HHMMSS + "_" + round(score)``.

        Args:
            score: number appended (rounded) to the directory name.
        """
        score = "_" + str(round(score))
        # Take the timestamp once so both files always share a directory.
        runDir = self.saveDir+datetime.datetime.now().strftime("%H%M%S")+score
        # Create it explicitly rather than relying on the saver to create parents.
        os.makedirs(runDir,exist_ok=True)
        self.actor.save(runDir+"/actor.h5")
        self.critic.save(runDir+"/critic.h5")
        print("Model Saved")

    def loadNN(self,loadDir):
        """Load actor, old actor and critic from ``loadDir`` and recompile them.

        The files were saved from models compiled with a custom loss closure
        (and, for the actor, the RAdam optimizer), which ``load_model`` cannot
        rebuild without ``custom_objects``. They are therefore loaded
        uncompiled and compiled again with this instance's settings.

        Args:
            loadDir: directory containing ``actor.h5`` and ``critic.h5``.

        Returns:
            ``(actor, oldActor, critic)``; the two actors are separate model
            objects loaded from the same file.
        """
        actorDir = loadDir+"/actor.h5"
        criticDir = loadDir+"/critic.h5"
        actor_net_loaded = self._compileActor(tf.keras.models.load_model(actorDir,compile=False))
        old_Actor_net_loaded = self._compileActor(tf.keras.models.load_model(actorDir,compile=False))
        critic_net_loaded = self._compileCritic(tf.keras.models.load_model(criticDir,compile=False))

        print("Model Loaded")
        return actor_net_loaded,old_Actor_net_loaded,critic_net_loaded
        

In [None]:
# Directory prefix for this run's checkpoints and figures (month-day-hour-minute stamped).
modelDir = "PPO-Model/"+datetime.datetime.now().strftime("%m%d-%H%M")+"/"
allEpRewardsHis = []
allEpSigmaHis = []
agent = PPO(STATE_SIZE,ACTION_SIZE,CTN_ACTION_RANGE,CRITIC_LR,ACTOR_LR,GAMMA,EPSILON,ENTROPY_WHEIGHT,loadNN=False,saveDir=modelDir,loadDir = "NAH")
bestScore = 200.      # an episode must beat this total reward to count as a record
stopTrainCounter = 0  # episodes left during which training stays paused after a record

# Per-episode aggregates, plotted against epHis.
TotalRewardHis = []
TotalSigmaHis = []
TotalActorLossHis = []
TotalCriticLossHis = []
epHis = []
for ep in range(MAX_EP):
    stopTrainCounter -= 1
    epHis.append(ep)
    s = env.reset()
    s = s.tolist()
    epBuffer = buffer()
    epTotalReward = 0

    # Per-step histories for the current episode.
    epStepHis = []
    epRewardHis = []
    epSigmaHis = []
    epActorLossHis = []
    epCriticLossHis = []
    for t in range(EP_LENGTH):
        env.render()
        if t%ACTION_INTERVAL ==0: # take action every ACTION_INTERVAL steps
            a,mu,sigma = agent.chooseAction(np.expand_dims(s,0))
            a = [a]

            nextS,r,done,_ = env.step(a) # Take Action
            nextS = np.squeeze(nextS).tolist()
            r = (r+8)/8 # normalize reward
            epTotalReward +=r

            # Save Buffers & History
            epBuffer.saveBuffers(s,a,r)
            epRewardHis.append(r)
            epStepHis.append(t)
            epSigmaHis.append(sigma)
        else: # take 0 action between ACTION_INTERVAL steps
            nextS,r,done,_ = env.step([0])
            nextS = np.squeeze(nextS).tolist()
            r = (r+8)/8 # normalize reward
            epTotalReward +=r

        s = nextS

        # Time to update PPO: every BATCH*2 steps and at the end of the episode.
        # The parentheses matter: `%` and `*` share precedence and associate
        # left to right, so `(t+1) % BATCH*2` would test `(t+1) % BATCH` instead.
        if (t+1) % (BATCH*2) == 0 or t == EP_LENGTH-1:
            bs = epBuffer.getStates()
            ba = epBuffer.getActions()
            br = epBuffer.getRewards()
            epBuffer.clearBuffer()
            agent.updateOldActor()
            # Nothing to fit on if no action step fell inside this window.
            if TRAIN and bs:
                epActorLoss, epCriticLoss= agent.trainCritcActor(bs,ba,br,np.expand_dims(s,0),CRITIC_EPOCH,ACTOR_EPOCH)
                epActorLossHis.append(epActorLoss)
                epCriticLossHis.append(epCriticLoss)

    # Save histories. Episodes with training paused have no losses; record NaN
    # for them (what np.mean of an empty list yields, minus the warning).
    TotalActorLossHis.append(np.mean(epActorLossHis) if epActorLossHis else np.nan)
    TotalCriticLossHis.append(np.mean(epCriticLossHis) if epCriticLossHis else np.nan)
    allEpRewardsHis.append(epTotalReward)
    TotalRewardHis.append(epTotalReward)
    TotalSigmaHis.append(np.mean(epSigmaHis))

    clear_output()
    print('Ep %i Over:' % ep,'TotalReward:%i' % epTotalReward)

    # figure
    plt.figure(figsize=(21,13))
    plt.subplot(3,2,1)
    plt.plot(epStepHis,epRewardHis)
    plt.title("this ep reward his")
    plt.subplot(3,2,2)
    plt.scatter(epHis,TotalRewardHis)
    plt.title("total epReward his")
    plt.subplot(3,2,3)
    plt.plot(epStepHis,epSigmaHis)
    plt.title("this ep Sigma his")
    plt.subplot(3,2,4)
    plt.plot(epHis,TotalSigmaHis)
    plt.title("Average Sigma his")
    plt.subplot(3,2,5)
    plt.plot(epHis,TotalActorLossHis)
    plt.title("Average ActorLoss his")
    plt.subplot(3,2,6)
    plt.plot(epHis,TotalCriticLossHis)
    plt.title("Average CriticLoss his")
    # when a new record is reached
    if (bestScore < epTotalReward):
        figDir = modelDir+datetime.datetime.now().strftime("%H%M%S")+"_"+str(round(epTotalReward))+".png"
        bestScore = epTotalReward
        # save NN & pic
        agent.saveNN(epTotalReward)
        plt.savefig(figDir)
        if ep != 0:
            # pause training for the next few episodes after a record
            TRAIN = False
            stopTrainCounter = 5
    plt.show()
    if stopTrainCounter <= 0:
        TRAIN = True