In [1]:
# Imports
import numpy as np
from Gym_new import ImprovedLTYEnv
import time
from ACNet import ActorNetwork, CriticNetwork
from Utilities import normalize_observation, discount_rewards_2, calculate_gaes, plot_learning_curve
from PPO_Agent import PPOAgent
import matplotlib.pyplot as plt

pygame 2.5.2 (SDL 2.28.2, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html
completed test gym with car


2023-11-23 14:19:11.347129: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-23 14:19:11.762065: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-23 14:19:11.765235: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class Trainer():
    """Drive PPO training or evaluation of an actor/critic pair.

    With ``train=True``, :meth:`work` collects experience from ``env``,
    updates the agent every ``update_frequency`` environment steps and plots
    the learning curves. With ``train=False`` it loads saved models and
    renders ten evaluation episodes on a newly created ``ImprovedLTYEnv``
    for ``map``.
    """

    def __init__(self, 
                 actor_model, critic_model, 
                 env, map='map3',
                 max_steps=200, num_episodes=300,
                 print_freq=10, batch_size=5,
                 epochs=4, update_frequency=10,
                 epsilon=0.2, target_kl_div=0.01,
                 max_policy_iters=1, max_value_iters=1,
                 policy_lr=0.0003, value_lr=0.0003,
                 max_grad_norm=None, train=False,
                 checkpoint_dir='new_models/'):
        """Store the configuration and build the PPO agent.

        Args:
            actor_model: Actor network handed to ``PPOAgent``.
            critic_model: Critic network handed to ``PPOAgent``.
            env: Environment used for training rollouts. Evaluation does not
                use it; it creates its own environment from ``map``.
            map: Map name passed to ``ImprovedLTYEnv`` in evaluation mode.
            max_steps: Maximum number of steps per training episode.
            num_episodes: Number of training episodes.
            print_freq: Print the average reward every this many episodes.
            batch_size: Number of stored samples per minibatch.
            epochs: Passes over the stored memory per update.
            update_frequency: Run an update every this many environment
                steps, counted across episodes.
            epsilon, target_kl_div, max_policy_iters, max_value_iters,
            policy_lr, value_lr, max_grad_norm: Forwarded unchanged to
                ``PPOAgent``.
            train: ``True`` to train, ``False`` to evaluate saved models.
            checkpoint_dir: Checkpoint directory used when ``train`` is
                ``True``; evaluation always uses ``'models/'``.
        """

        # Init Params
        self.max_steps = max_steps
        self.num_episodes = num_episodes
        self.print_freq = print_freq
        self.batch_size = batch_size
        self.epochs = epochs
        self.update_frequency = update_frequency
        self.epsilon =  epsilon
        self.target_kl_div = target_kl_div
        self.max_policy_iters = max_policy_iters
        self.max_value_iters = max_value_iters
        self.policy_lr = policy_lr
        self.value_lr = value_lr
        self.max_grad_norm = max_grad_norm
        self.train = train
        self.map = map

        # Init Models and Environment
        self.actor_model = actor_model
        self.critic_model =  critic_model
        self.env = env

        # Training writes to the caller-supplied directory; evaluation always
        # reads the pre-trained weights from 'models/'.
        if self.train:
            self.checkpoint_dir = checkpoint_dir
        else:
            self.checkpoint_dir = 'models/'
    
        # Initialize PPO Agent
        self.ppo = PPOAgent(self.actor_model,
                            self.critic_model,
                            self.epsilon,
                            self.target_kl_div,
                            self.max_policy_iters,
                            self.max_value_iters,
                            self.policy_lr,
                            self.value_lr,
                            self.max_grad_norm,
                            checkpoint_dir=self.checkpoint_dir)

    def work(self):
        """Train the agent when ``self.train`` is set, otherwise evaluate it."""
        if self.train:
            self._train()
        else:
            self._evaluate()

    @staticmethod
    def _loss_to_scalar(loss):
        """Return ``loss.numpy()`` for tensor-like losses, ``float(loss)`` otherwise."""
        to_numpy = getattr(loss, 'numpy', None)
        if callable(to_numpy):
            return to_numpy()
        # No update has produced a tensor yet: the loss is still the plain
        # numeric placeholder, which has no .numpy() method.
        return float(loss)

    @staticmethod
    def _plot_curve(values, ylabel):
        """Plot one per-episode series with ``plot_learning_curve`` and show it."""
        x = [i + 1 for i in range(len(values))]
        plot_learning_curve(x, values)
        plt.ylabel(ylabel)
        plt.xlabel('Episodes')
        plt.show()

    def _update_agent(self, pl, vl):
        """Run ``self.epochs`` passes of minibatch updates, then clear the memory.

        Returns:
            The policy and value loss of the last minibatch trained, or the
            given ``pl`` and ``vl`` unchanged when no minibatch was trained.
        """
        for _ in range(self.epochs):
            # Access the memory
            states, actions, values, old_log_probs, rewards = self.ppo.access_memory()

            # Calculate advantages
            advantages = calculate_gaes(rewards, values)

            # Calculate returns
            returns = discount_rewards_2(advantages, values)

            # Shuffle the data and generate batches
            n_states = len(states)
            indices = np.arange(n_states, dtype=np.int64)
            np.random.shuffle(indices)
            batches = [indices[start:start + self.batch_size]
                       for start in range(0, n_states, self.batch_size)]

            for batch in batches:
                pl = self.ppo.train_policy(states[batch], actions[batch], advantages[batch], old_log_probs[batch])
                vl = self.ppo.train_value(states[batch], returns[batch])

        self.ppo.clear_memory()
        return pl, vl

    def _train(self):
        """Run the training episodes, save the best goal-reaching models and plot curves."""
        # Counts environment steps across all episodes, so an update can be
        # triggered in the middle of an episode and the memory can span
        # episode boundaries.
        num_steps = 0
        ep_rewards = []
        policy_losses = []
        value_losses = []
        max_ep_reward = 0
        # Latest losses; they stay at 0 until the first minibatch is trained.
        pl = 0
        vl = 0

        for episode in range(self.num_episodes):
            obs = np.array(self.env.reset())
            done = False
            goal = False
            loop_count = 0
            ep_reward = 0

            while not done:
                loop_count += 1
                obs = normalize_observation(obs)

                action, value, log_prob = self.ppo.get_action_value(obs)

                new_obs, reward, done, goal, info = self.env.step(np.squeeze(action))

                num_steps += 1

                ep_reward += reward

                self.ppo.store_memory(obs, np.squeeze(action), np.squeeze(value), np.squeeze(log_prob), reward)

                # Online Training
                if num_steps % self.update_frequency == 0:
                    pl, vl = self._update_agent(pl, vl)

                if loop_count >= self.max_steps:
                    break

                obs = np.array(new_obs)

            # Store losses after every episode
            policy_losses.append(self._loss_to_scalar(pl))
            value_losses.append(self._loss_to_scalar(vl))

            # Saves the model weight if the model reaches the goal
            if goal:
                if ep_reward >= max_ep_reward:
                    max_ep_reward = ep_reward
                    self.ppo.save_models()

            ep_rewards.append(ep_reward)

            if (episode + 1) % self.print_freq == 0:
                print('Episode {} | Avg Reward {:.1f}'.format(
                    episode + 1, np.mean(ep_rewards[-self.print_freq:])
                ))

        # Plot Rewards, Policy Loss and Value Loss
        self._plot_curve(ep_rewards, 'Total Rewards')
        self._plot_curve(policy_losses, 'Policy Loss')
        self._plot_curve(value_losses, 'Value Loss')

    def _evaluate(self):
        """Load the saved models and render ten episodes on ``self.map``."""
        env = ImprovedLTYEnv(map=self.map)
        try:
            self.ppo.load_models()

            for _ in range(10):
                obs = np.array(env.reset())
                done = False
                goal = False
                total_reward = 0
                step = 0

                while not done:

                    obs = normalize_observation(obs)
                    action, value, log_prob = self.ppo.get_action_value(obs)
                    new_obs, reward, done, goal, info = env.step(np.squeeze(action))
                    total_reward = total_reward + reward
                    env.render()
                    time.sleep(0.01)
                    step +=1
                    obs = np.array(new_obs)

                print("no. step : ", step)
                print(" ")
                print("total reward :", total_reward)
        finally:
            # Close the environment even when loading or stepping fails.
            env.close()


In [3]:
# Build the actor and critic networks and the environment that Trainer requires.
action_space = 2
hidden_layer_size = 32
num_hidden_layers_actor = 2
num_hidden_layers_critic = 3

actor_model = ActorNetwork(action_space, hidden_layer_size, num_hidden_layers_actor)
critic_model = CriticNetwork(hidden_layer_size, num_hidden_layers_critic)

env = ImprovedLTYEnv()

# Required arguments: actor_model, critic_model, env.
# Optional arguments: see Trainer.__init__.
# Map options: map='map1a' or map='map3' (default).
# The trained model works on both maps. Set train=False to load the
# pre-trained model, and pick the map to test it on.
trainer = Trainer(actor_model, critic_model, env, train=False, map='map3')

trainer.work()



....loading_models....
We have reached goal.
no. step :  156
 
total reward : 1003.2574206068216
no. step :  156
 
total reward : 302.5381948179604
We have reached goal.
no. step :  153
 
total reward : 702.7608104569317
We have reached goal.
no. step :  155
 
total reward : 1004.3727052265406
We have reached goal.
no. step :  152
 
total reward : 702.9112717116923
no. step :  156
 
total reward : 303.74487334853416
We have reached goal.
no. step :  156
 
total reward : 1004.6289124450683
no. step :  153
 
total reward : 304.0649804366231
We have reached goal.
no. step :  291
 
total reward : 1438.2303991474805
We have reached goal.
no. step :  154
 
total reward : 702.237750428021
