In [1]:
import os
# Step up one level from the notebook's folder (presumably to the repository
# root, so that the project-local `IRL` package can be imported -- confirm
# against the repo layout).
# The starting directory is remembered on first execution so that re-running
# this cell in the same kernel always lands in the same place instead of
# climbing one more level each time.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
os.getcwd()

'e:\\github_clone\\Deep-Inverse-Reinforcement-Learning'

In [2]:
from IRL import PPO
import gym
import torch
import numpy as np

In [4]:
# ---- environment -----------------------------------------------------------
env_name = "LunarLander-v2"
has_continuous_action_space = False
action_std = None                     # passed to PPO as-is; None for this discrete-action run

# ---- run length ------------------------------------------------------------
max_ep_len = 400                      # cap on timesteps within a single episode
max_training_timesteps = 100_000      # training loop stops once time_step exceeds this

# ---- reporting / checkpoint cadence (all measured in timesteps) ------------
print_freq = 4 * max_ep_len           # how often the average reward is printed
log_freq = 2 * max_ep_len             # how often the average reward is written to the log file
save_model_freq = 20_000              # how often the model weights are saved

# ---- PPO hyperparameters ---------------------------------------------------
update_timestep = 4 * max_ep_len      # policy is updated every this many timesteps
K_epochs = 40                         # optimisation epochs per policy update
eps_clip = 0.2                        # PPO clipping parameter
gamma = 0.99                          # discount factor

lr_actor = 0.0003                     # learning rate of the actor network
lr_critic = 0.001                     # learning rate of the critic network

# ---- reproducibility -------------------------------------------------------
random_seed = 0

In [5]:
print("training environment name : " + env_name)

env = gym.make(env_name)

# Size of the observation vector: first entry of the observation space's shape.
state_dim = env.observation_space.shape[0]

# Size of the action space: read from `shape` when the continuous flag is set,
# otherwise from the space's `n` attribute.
action_dim = (
    env.action_space.shape[0]
    if has_continuous_action_space
    else env.action_space.n
)

training environment name : LunarLander-v2


In [6]:
# Logs for each environment are kept in PPO_logs/<env_name>/.
# makedirs creates the missing parent folder too, and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
log_dir = "PPO_logs" + '/' + env_name + '/'
os.makedirs(log_dir, exist_ok=True)

In [7]:
# The run number is the count of files already sitting in the log directory,
# so every run gets a fresh, increasing index.
run_num = len(next(os.walk(log_dir))[2])

#### create new log file for each run
# os.path.join does not add a second separator when log_dir already ends in
# one, which avoids paths such as ".../LunarLander-v2//PPO_...".
log_f_name = os.path.join(log_dir, 'PPO_' + env_name + "_log_" + str(run_num) + ".csv")

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

current logging run number for LunarLander-v2 :  0
logging at : PPO_logs/LunarLander-v2//PPO_LunarLander-v2_log_0.csv


In [8]:
run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

# Checkpoints for each environment are kept in PPO_preTrained/<env_name>/.
# makedirs creates the missing parent folder too, and exist_ok=True makes the
# cell safe to re-run without a separate existence check.
directory = "PPO_preTrained" + '/' + env_name + '/'
os.makedirs(directory, exist_ok=True)

# File name encodes the environment, the random seed and the run number.
checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, random_seed, run_num_pretrained)
print("save checkpoint path : " + checkpoint_path)

save checkpoint path : PPO_preTrained/LunarLander-v2/PPO_LunarLander-v2_0_0.pth


In [9]:
# Seed every random number generator used by the run.
# A seed of 0 is falsy, so with random_seed = 0 nothing is seeded.
if random_seed:
    print("--------------------------------------------------------------------------------------------")
    print("setting random seed to ", random_seed)
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    # gym >= 0.26 (the API this notebook uses: reset() returns a pair and
    # step() returns five values) no longer has Env.seed(); the environment's
    # generator is seeded through reset(seed=...). Later reset() calls without
    # a seed keep using that generator.
    env.reset(seed=random_seed)

In [10]:
# Build the PPO agent from the dimensions and hyperparameters defined above.
# Arguments are passed positionally, in the order the original call used.
ppo_agent = PPO(
    state_dim,
    action_dim,
    lr_actor,
    lr_critic,
    gamma,
    K_epochs,
    eps_clip,
    has_continuous_action_space,
    action_std,
)

In [11]:
from datetime import datetime

# NOTE: datetime.now() returns naive local time, so the "(GMT)" label in the
# message below is only accurate on a machine whose clock is set to UTC.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)

# Accumulators for the console report: reset each time the average is printed.
print_running_reward = 0
print_running_episodes = 0

# Accumulators for the CSV log: reset each time a row is written.
log_running_reward = 0
log_running_episodes = 0

time_step = 0
i_episode = 0

log_f = open(log_f_name, "w+")
try:
    log_f.write('episode,timestep,reward\n')

    # Whole episodes are run until the timestep budget is exceeded, so the
    # final episode may overshoot max_training_timesteps.
    while time_step <= max_training_timesteps:

        state, _ = env.reset()
        current_ep_reward = 0

        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, terminated, truncated, _ = env.step(action)

            # The episode is over when the environment reports either a
            # terminal state or a truncation (e.g. a time limit); in both
            # cases the next transition belongs to a new episode.
            done = bool(terminated or truncated)

            # saving reward and is_terminals
            # NOTE(review): an episode cut short by max_ep_len is not flagged
            # here; confirm how PPO.update() uses is_terminals before deciding
            # whether that cut should be marked as a boundary as well.
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step += 1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # log in logging file (skipped when no episode finished in the
            # window, which would otherwise divide by zero)
            if time_step % log_freq == 0 and log_running_episodes > 0:

                # average reward over the episodes finished since the last row
                log_avg_reward = round(log_running_reward / log_running_episodes, 4)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward (same zero-episode guard as above)
            if time_step % print_freq == 0 and print_running_episodes > 0:

                # average reward over the episodes finished since the last print
                print_avg_reward = round(print_running_reward / print_running_episodes, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                print("--------------------------------------------------------------------------------------------")
                print("saving model at : " + checkpoint_path)
                ppo_agent.save(checkpoint_path)
                print("model saved")
                print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
                print("--------------------------------------------------------------------------------------------")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1

finally:
    # Runs on normal completion, on errors and on a manual kernel interrupt,
    # so the log file is always closed and the environment released.
    log_f.close()
    env.close()

Started training at (GMT) :  2024-08-06 20:36:09


  if not isinstance(terminated, (bool, np.bool8)):


Episode : 17 		 Timestep : 1600 		 Average Reward : -163.36
Episode : 34 		 Timestep : 3200 		 Average Reward : -128.01
Episode : 53 		 Timestep : 4800 		 Average Reward : -151.8
Episode : 72 		 Timestep : 6400 		 Average Reward : -138.29
Episode : 88 		 Timestep : 8000 		 Average Reward : -174.23
Episode : 105 		 Timestep : 9600 		 Average Reward : -195.23
Episode : 124 		 Timestep : 11200 		 Average Reward : -159.34
Episode : 141 		 Timestep : 12800 		 Average Reward : -167.01
Episode : 155 		 Timestep : 14400 		 Average Reward : -99.92
Episode : 174 		 Timestep : 16000 		 Average Reward : -110.32
Episode : 193 		 Timestep : 17600 		 Average Reward : -110.16
Episode : 212 		 Timestep : 19200 		 Average Reward : -118.54
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/LunarLander-v2/PPO_LunarLander-v2_0_0.pth
model saved
Elapsed Time  :  0:00:35
----------------------------------------------------------------