In [1]:
import os
import glob
import time
from datetime import datetime

from State import State

import torch
import numpy as np
import pandas as pd
from PPO import PPO

Device set to : NVIDIA GeForce RTX 3070


In [2]:
################################### Training ###################################
def train():
    """Train a discrete-action PPO agent on the ``State`` environment.

    Creates the environment and a ``PPO`` agent, then runs episodes of at most
    ``max_ep_len`` steps while the global step counter is not greater than
    ``max_training_timesteps``. The agent is updated every ``update_timestep``
    steps, the average reward of the episodes completed since the previous
    report is printed every ``print_freq`` steps, and the policy weights are
    written to ``./RLmodel2.pt`` once the loop ends.

    Takes no arguments and returns ``None``.
    """
    print("============================================================================================")

    env = State()
    # NOTE(review): the meaning of this initial configuration is defined by
    # State.create_state, which is not visible here.
    env.create_state([-1, 0, 1, 1, 2, 2])

    has_continuous_action_space = False  # continuous action space; else discrete

    num_of_episodes = 25000
    max_ep_len = 108                     # max timesteps in one episode
    # Derived from max_ep_len (instead of a second hard-coded 108) so the two
    # cannot drift apart; training stops once time_step exceeds this value.
    max_training_timesteps = max_ep_len * num_of_episodes

    print_freq = max_ep_len * 10         # print avg reward in the interval (in num timesteps)

    action_std = 0.6                     # starting std for action distribution (Multivariate Normal)
    action_std_decay_rate = 0.03         # linearly decay action_std (action_std = action_std - action_std_decay_rate)
    min_action_std = 0.03                # minimum action_std (stop decay after action_std <= min_action_std)
    action_std_decay_freq = int(1e5)     # action_std decay frequency (in num timesteps)
    #####################################################

    ## Note : print/log frequencies should be > than max_ep_len

    ################ PPO hyperparameters ################
    # Integer floor division: with true division this would be a float and, for
    # an odd max_ep_len, `time_step % update_timestep == 0` would never be true,
    # so the policy would silently never be updated.
    update_timestep = max(1, max_ep_len // 2)   # update policy every n timesteps
    K_epochs = 20                        # update policy for K epochs in one PPO update

    eps_clip = 0.2                       # clip parameter for PPO
    gamma = 0.8                          # discount factor

    lr_actor = 0.00005                   # learning rate for actor network
    lr_critic = 0.0001                   # learning rate for critic network

    random_seed = 0                      # seed for torch and numpy (always applied, including 0)
    #####################################################

    # NOTE(review): presumably these must match the observation size and the
    # number of discrete actions exposed by State - confirm against State/PPO.
    state_dim = 9
    action_dim = 512

    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    ################# training procedure ################

    # initialize a PPO agent
    ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, has_continuous_action_space, action_std)

    # track total training time
    start_time = datetime.now().replace(microsecond=0)
    print("Started training at (GMT) : ", start_time)

    print("============================================================================================")

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    time_step = 0
    i_episode = 0

    # training loop
    while time_step <= max_training_timesteps:

        state = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, done = env.step(action)

            # saving reward and is_terminals
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # if continuous action space; then decay action std of output action distribution
            if has_continuous_action_space and time_step % action_std_decay_freq == 0:
                ppo_agent.decay_action_std(action_std_decay_rate, min_action_std)

            # printing average reward
            if time_step % print_freq == 0:

                # Average over episodes finished since the last report; the
                # episode currently in progress is not included. Guard against
                # an empty window so a change of print_freq cannot crash the run.
                if print_running_episodes > 0:
                    print_avg_reward = round(print_running_reward / print_running_episodes, 2)
                else:
                    print_avg_reward = float("nan")

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        i_episode += 1

    # Persist the trained policy weights once training has finished.
    torch.save(ppo_agent.policy.state_dict(), "./RLmodel2.pt")


In [3]:
# Entry point: start the training run when this cell/module is executed as the
# main program.
if __name__ == "__main__":
    train()

Started training at (GMT) :  2023-06-26 22:59:23
Episode : 9 		 Timestep : 1080 		 Average Reward : 68831.27
Episode : 19 		 Timestep : 2160 		 Average Reward : 72137.8
Episode : 29 		 Timestep : 3240 		 Average Reward : 77029.76
Episode : 39 		 Timestep : 4320 		 Average Reward : 77289.36
Episode : 49 		 Timestep : 5400 		 Average Reward : 74277.48
Episode : 59 		 Timestep : 6480 		 Average Reward : 77264.78
Episode : 69 		 Timestep : 7560 		 Average Reward : 71692.68
Episode : 79 		 Timestep : 8640 		 Average Reward : 71116.14
Episode : 89 		 Timestep : 9720 		 Average Reward : 81590.06
Episode : 99 		 Timestep : 10800 		 Average Reward : 83631.46
Episode : 109 		 Timestep : 11880 		 Average Reward : 85065.24
Episode : 119 		 Timestep : 12960 		 Average Reward : 85331.34
Episode : 129 		 Timestep : 14040 		 Average Reward : 85569.8
Episode : 139 		 Timestep : 15120 		 Average Reward : 87705.4
Episode : 149 		 Timestep : 16200 		 Average Reward : 79729.26
Episode : 159 		 Timestep : 1