In [1]:
# Locate the RL_Note project root from the current working directory and put
# it on sys.path so the project-local `pys` package can be imported.
import os, sys
cwd = os.getcwd()
pos = cwd.find('RL_Note')
if pos == -1:
    # str.find returns -1 when the marker is missing; slicing with that value
    # would silently drop the last character of cwd and build a bogus path.
    # Fall back to the current directory and tell the user about it.
    print("Warning: 'RL_Note' not found in " + cwd
          + "; using the current directory as project root", file=sys.stderr)
    root_path = cwd
else:
    root_path = cwd[0:pos] + 'RL_Note'
# Avoid stacking duplicate entries when this cell is re-run.
if root_path not in sys.path:
    sys.path.append(root_path)
print(root_path)
# os.path.join uses the platform separator instead of a hard-coded backslash.
workspace_path = os.path.join(root_path, "pys")

e:\MyNote\RL_Note


In [2]:
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
from pys.env_config  import env_configs
from pys.agent.ddpg_agent   import DDPGAgent
from pys.agent.td3_agent    import TD3Agent
from pys.agent.sac_agent    import SACAgent

In [3]:
def done_function(state):
    """Episode-termination predicate that never signals termination.

    The `state` argument is accepted but not inspected; the result is
    always False.
    """
    is_terminal = False
    return is_terminal

def reward_function(state, action, next_state, done):
    """Recompute the pendulum reward for a single transition.

    Args:
        state: indexable observation read as (cos(theta), sin(theta),
            angular velocity) at indices 0, 1 and 2.
        action: indexable; only action[0] is used (the control input u).
        next_state: unused; kept so the callback signature stays the same.
        done: unused; kept so the callback signature stays the same.

    Returns:
        The negative quadratic cost
        -(theta**2 + 0.1 * thdot**2 + 0.001 * u**2), which is never positive.
    """
    costh   = state[0]
    sinth   = state[1]
    # arctan2 recovers the angle in [-pi, pi] from its cosine and sine.
    th      = np.arctan2(sinth,costh)
    thdot   = state[2]
    u       = action[0]
    # The velocity penalty must be quadratic: a linear term would depend on
    # the spin direction and could make the "cost" negative.
    costs   = th ** 2 + 0.1 * (thdot ** 2) + 0.001 * (u ** 2)
    return -costs

Set Environment and Agent

In [4]:
# Experiment configuration: environment id, RL algorithm, replay type,
# the hindsight-replay settings, and the mini-batch size.
cfg = {
    "ENV": "Pendulum-v0",
    # "ENV": "LunarLanderContinuous-v2",
    "RL": "DDPG",
    "ER": "ER",
    "HER": {
        "REPLAY_N": 8,
        "STRATEGY": "RANDOM",
        "REWARD_FUNC": reward_function,
        "DONE_FUNC": done_function,
    },
    "BATCH_SIZE": 32,
}
env_config = env_configs[cfg["ENV"]]

# Output file stem: <env>_<algorithm>_<replay>, plus the HER strategy when
# hindsight replay is selected.
name_parts = [cfg["ENV"], cfg["RL"], cfg["ER"]]
if cfg["ER"] == "HER":
    name_parts.append(cfg["HER"]["STRATEGY"])
FILENAME = '_'.join(name_parts)

# Per-environment training limits looked up from env_configs.
EPISODES = env_config["EPISODES"]
END_SCORE = env_config["END_SCORE"]

Train Model

In [5]:
%matplotlib tk

figure = plt.gcf()
figure.set_size_inches(8,6)
env = gym.make(cfg["ENV"])
if cfg["RL"] == "DDPG":
    agent = DDPGAgent(env, cfg)
elif cfg["RL"] == "TD3":
    agent = TD3Agent(env, cfg)
elif cfg["RL"] == "SAC":
    agent = SACAgent(env, cfg)

if __name__ == "__main__":
    # Per-episode histories: plotted after every episode, saved at the end.
    scores_avg, scores_raw, episodes = [], [], []
    critic_mean, actor_mean = [], []
    score_avg = 0
    show_media_info = True
    # Goal vector stored with every transition.
    goal = np.array([1.0,0.0,0.0])

    # Build the output locations with the platform separator and make sure
    # they exist, so savefig / np.save / save_model do not fail on a fresh
    # checkout.
    result_path = os.path.join(workspace_path, "result")
    img_path = os.path.join(result_path, "img")
    data_path = os.path.join(result_path, "data")
    # The trailing empty component keeps the trailing separator the original
    # path passed to save_model had.
    model_path = os.path.join(result_path, "save_model", "")
    for out_dir in (img_path, data_path, model_path):
        os.makedirs(out_dir, exist_ok=True)

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        critic_losses = []
        actor_losses = []
        while not done:
            # if e%100 == 0:
            #     env.render()
            # Interact with env.
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.remember(state, action, reward, next_state, done, goal)
            critic_loss, actor_loss = agent.train_model()
            state = next_state
            # Accumulate the episode return and the per-step losses.
            score += reward
            critic_losses.append(critic_loss)
            actor_losses.append(actor_loss)
            if show_media_info:
                # One-time sanity print of the transition shapes.
                print("-------------- Variable shapes --------------")
                print("State Shape : ", np.shape(state))
                print("Action Shape : ", np.shape(action))
                print("Reward Shape : ", np.shape(reward))
                print("done Shape : ", np.shape(done))
                print("---------------------------------------------")
                show_media_info = False

        # The episode is over here (the while loop only exits once done is set).
        # Seed the moving average with the first episode's score; testing the
        # history list avoids misreading an average of exactly 0 as "unset".
        score_avg = 0.9 * score_avg + 0.1 * score if scores_avg else score
        print("episode: {0:3d} | score avg: {1:3.2f} | mem size {2:6d} |"
            .format(e, score_avg, len(agent.memory)))

        episodes.append(e)
        scores_avg.append(score_avg)
        scores_raw.append(score)
        critic_mean.append(np.mean(critic_losses))
        actor_mean.append(np.mean(actor_losses))
        # View data
        plt.clf()
        plt.subplot(311)
        plt.plot(episodes, scores_avg, 'b')
        plt.plot(episodes, scores_raw, 'b', alpha=0.8, linewidth=0.5)
        plt.xlabel('episode'); plt.ylabel('average score'); plt.grid()
        plt.title(cfg["ENV"] +'_' + cfg["RL"] +'_' + cfg["ER"])
        plt.subplot(312)
        plt.plot(episodes, critic_mean, 'b.',markersize=3)
        plt.xlabel('episode'); plt.ylabel('critic loss'); plt.grid()
        plt.subplot(313)
        plt.plot(episodes, actor_mean, 'b.',markersize=3)
        plt.xlabel('episode'); plt.ylabel('actor loss'); plt.grid()
        plt.savefig(os.path.join(img_path, FILENAME + "_TF.jpg"), dpi=100)

        # Stop once the moving-average score exceeds END_SCORE; the model is
        # only saved when that target is reached.
        if score_avg > END_SCORE:
            agent.save_model(model_path)
            break

    # Always release the environment and persist the histories, including
    # when EPISODES ran out before END_SCORE was reached.
    env.close()
    np.save(os.path.join(data_path, FILENAME + "_TF_epi"),  episodes)
    np.save(os.path.join(data_path, FILENAME + "_TF_scores_avg"),scores_avg)
    np.save(os.path.join(data_path, FILENAME + "_TF_scores_raw"),scores_raw)
    np.save(os.path.join(data_path, FILENAME + "_TF_critic_mean"),critic_mean)
    np.save(os.path.join(data_path, FILENAME + "_TF_actor_mean"),actor_mean)
    print("End")

Pendulum-v0_DDPG_HER_RANDOM
States 3, Actions 1
1 th Action space -2.00 ~ 2.00
-------------- Variable shapes --------------
State Shape :  (3,)
Action Shape :  (1,)
Reward Shape :  ()
done Shape :  ()
---------------------------------------------
episode:   0 | score avg: -1411.33 | mem size   1800 |
Start to train, check batch shapes
**** shape of mini_batch (32, 6) <class 'list'>
**** shape of states (32, 3) <class 'tensorflow.python.framework.ops.EagerTensor'>
**** shape of actions (32, 1) <class 'tensorflow.python.framework.ops.EagerTensor'>
**** shape of rewards (32, 1) <class 'tensorflow.python.framework.ops.EagerTensor'>
**** shape of next_states (32, 3) <class 'tensorflow.python.framework.ops.EagerTensor'>
**** shape of dones (32, 1) <class 'tensorflow.python.framework.ops.EagerTensor'>
**** shape of goals (32, 3) <class 'tensorflow.python.framework.ops.EagerTensor'>
episode:   1 | score avg: -1360.30 | mem size   3600 |
episode:   2 | score avg: -1375.54 | mem size   5400 |
e

Test Model

In [6]:
# Build a fresh environment and agent, then restore the saved weights.
env = gym.make(cfg["ENV"])
if cfg["RL"] == "DDPG":
    agent = DDPGAgent(env, cfg)
elif cfg["RL"] == "TD3":
    agent = TD3Agent(env, cfg)
elif cfg["RL"] == "SAC":
    agent = SACAgent(env, cfg)
else:
    # Fail loudly instead of evaluating a stale agent left in the kernel.
    raise ValueError("Unsupported RL algorithm: {}".format(cfg["RL"]))
# The trailing empty component keeps the trailing separator on the directory
# path while using the platform separator throughout.
agent.load_model(os.path.join(workspace_path, "result", "save_model", ""))

if __name__ == "__main__":
    # Roll out 10 rendered episodes and report the moving-average score.
    score_avg = 0
    for e in range(10):
        done = False
        score = 0
        state = env.reset()
        while not done:
            env.render()
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            state = next_state
            score += reward
        # The episode is over here (the while loop only exits once done is set).
        # Seed the moving average with the first episode's score; keying on
        # the episode index avoids misreading an average of exactly 0 as
        # "unset".
        score_avg = 0.9 * score_avg + 0.1 * score if e > 0 else score
        print("episode: {0:3d} | score avg: {1:3.2f} |"
            .format(e+1, score_avg))
    # Release the environment (and its render window) after evaluation.
    env.close()

Pendulum-v0_DDPG_HER_RANDOM
States 3, Actions 1
1 th Action space -2.00 ~ 2.00
episode:   1 | score avg: -319.16 |
episode:   2 | score avg: -299.62 |
episode:   3 | score avg: -282.13 |
episode:   4 | score avg: -266.06 |
episode:   5 | score avg: -263.20 |
episode:   6 | score avg: -249.10 |
episode:   7 | score avg: -236.56 |
episode:   8 | score avg: -225.00 |
episode:   9 | score avg: -238.33 |
episode:  10 | score avg: -226.28 |
