In [7]:

!pip install roboschool==1.0.48 gym==0.15.4  #install compatible versions of roboschool and  gym 

!pip install box2d-py # 2d rigid body physics engine

!pip install pybullet # for physics simulation and robotics



**Import libraries and select the compute device (GPU when available, otherwise CPU)**


In [8]:
import os  #provides functions for interacting with the operating system
import glob
import time
import torch
import torch.nn as nn          #pytorch neural networks package
from torch.distributions import MultivariateNormal  #prob distibution for continious action space 
import numpy as np 
import gym           # a toolkit for developing and comparing reinforcement learning algorithms
import roboschool

In [9]:
# Pick the compute device: the first CUDA GPU when one is available, the CPU otherwise
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [10]:

class PPOMemory:
    """Rollout buffer holding five parallel lists: actions, states, probs, rewards, dones.

    `store_memory` fills the first three; `rewards` and `dones` are plain
    list attributes that callers append to directly.
    """

    def __init__(self):
        # A fresh buffer is exactly what a reset produces: five empty lists.
        self.clear_memory()

    def store_memory(self, state, action, action_logprob):
        """Append one state, the action taken in it, and that action's log-probability."""
        pairs = (
            (self.states, state),
            (self.actions, action),
            (self.probs, action_logprob),
        )
        for bucket, item in pairs:
            bucket.append(item)

    def clear_memory(self):
        """Reset the buffer by rebinding every field to a new empty list."""
        self.actions, self.states, self.probs = [], [], []
        self.rewards, self.dones = [], []


class ActorCritic(nn.Module):
    """Actor and critic MLPs for a Gaussian policy with a fixed diagonal covariance.

    The actor maps a state to an action mean (final Tanh, so each component
    lies in [-1, 1]); the critic maps a state to a single scalar value.
    The two networks share no layers.
    """

    def __init__(self, state_dim, action_dim, action_std_init):
        super().__init__()

        self.action_dim = action_dim  # size of the action vector
        # Per-dimension variance (std squared) of the policy, placed on the file-level `device`.
        variance = action_std_init * action_std_init
        self.action_var = torch.full((action_dim,), variance).to(device)

        hidden = 64

        # actor network: state -> action mean
        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, action_dim),
            nn.Tanh(),
        )

        # critic network: state -> one scalar value estimate for that state
        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, 1),
        )

    def act(self, state):
        """Sample an action for `state`; return (action, log_prob), both detached."""
        mean = self.actor(state)
        # Diagonal covariance built from the stored variances, with a leading axis of size 1.
        covariance = torch.diag(self.action_var).unsqueeze(dim=0)
        policy = MultivariateNormal(mean, covariance)
        sampled = policy.sample()
        return sampled.detach(), policy.log_prob(sampled).detach()

    def criticize(self, state, action):
        """Evaluate `action` under the current policy.

        Returns a tuple (log-probability of `action`, critic output for
        `state`, entropy of the action distribution).
        """
        mean = self.actor(state)
        # One diagonal covariance matrix per row of `mean`.
        covariance = torch.diag_embed(self.action_var.expand_as(mean)).to(device)
        policy = MultivariateNormal(mean, covariance)
        return policy.log_prob(action), self.critic(state), policy.entropy()

#PPO algorithm
class PPO:
    """PPO agent with a clipped surrogate objective.

    Holds two `ActorCritic` copies: `policy` is the one being optimised and
    `policy_old` is the one used to collect data; `policy_old` is refreshed
    from `policy` at the end of every `update`.

    Args:
        state_dim: size of the state vector fed to the networks.
        action_dim: size of the action vector.
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: discount factor used for the Monte Carlo returns.
        K_epochs: number of optimisation passes over the buffer per update.
        eps_clip: clip range for the probability ratio.
        action_std_init: standard deviation of the Gaussian policy.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,  action_std_init=0.1):

        self.action_std = action_std_init

        self.gamma = gamma  # discount factor
        self.eps_clip = eps_clip  # epsilon, clip parameter
        self.K_epochs = K_epochs

        self.buffer = PPOMemory()

        self.policy = ActorCritic(state_dim, action_dim,  action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim,  action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from `policy_old` for `state` and record it in the buffer.

        The state tensor, the sampled action and its log-probability are stored;
        the caller is responsible for appending the matching reward and done flag
        to `self.buffer.rewards` / `self.buffer.dones`.

        Returns:
            The sampled action as a flattened NumPy array.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.store_memory(state, action, action_logprob)

        return action.detach().cpu().numpy().flatten()

    def update(self):
        """Run `K_epochs` optimisation passes over the buffered rollout, then clear it.

        The loss is the negated clipped surrogate, plus 0.5 * MSE between the
        critic output and the discounted returns, minus 0.01 * policy entropy.
        """
        # Monte Carlo estimate of returns. Walk the rollout backwards, append,
        # and reverse once at the end (list.insert(0, ...) would make this quadratic).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.dones)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Return normalisation is intentionally left disabled:
        # rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)

        # Convert the buffered lists to tensors with explicit shapes. A bare
        # torch.squeeze() would also drop the batch axis when only one transition
        # is buffered, and the feature axis when the action/state space is 1-D.
        n_steps = len(self.buffer.states)
        old_states = torch.stack(self.buffer.states, dim=0).detach().to(device).reshape(n_steps, -1)
        old_actions = torch.stack(self.buffer.actions, dim=0).detach().to(device).reshape(n_steps, -1)
        old_probs = torch.stack(self.buffer.probs, dim=0).detach().to(device).reshape(n_steps)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluate the stored actions under the current policy
            probs, state_values, dist_entropy = self.policy.criticize(old_states, old_actions)

            # Drop only the trailing size-1 axis of the critic output so it lines up with `rewards`
            state_values = state_values.squeeze(-1)

            # Probability ratio pi_theta / pi_theta_old (old_probs is already detached)
            ratios = torch.exp(probs - old_probs)

            # Clipped surrogate terms
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear_memory()

    def save(self, checkpoint_path):
        """Write the weights of `policy_old` to `checkpoint_path`."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load weights from `checkpoint_path` into both `policy_old` and `policy`."""
        # Read the file once; load_state_dict copies the values into each module.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)


# **Training**

**Environment & algorithm hyperparameters**

In [11]:
# --- environment and run length ---
env_name = "RoboschoolAtlasForwardWalk-v1"
max_ep_len = 1000                       # max timesteps in one episode
max_training_timesteps = 2 * 10**6      # break training loop if timesteps > max_training_timesteps

# --- reporting / checkpoint cadence (all in number of timesteps) ---
print_freq = 10 * max_ep_len            # print avg reward at this interval
log_freq = 2 * max_ep_len               # log avg reward at this interval
save_model_freq = 10000                 # save model at this interval

# --- PPO settings ---
action_std = 0.1                        # starting std for action distribution (Multivariate Normal)
update_timestep = 4 * max_ep_len        # update policy every n timesteps
# Number of optimisation passes per PPO update: the same batch of trajectories
# is reused K_epochs times to improve the current policy.
K_epochs = 80
eps_clip = 0.2                          # clip parameter for PPO
gamma = 0.99                            # discount factor
lr_actor = 1e-3                         # learning rate for actor network
lr_critic = 1e-3                        # learning rate for critic network

# random seed for the run; 0 means no seeding (the value also appears in the checkpoint file name)
random_seed = 0



In [None]:
# Training
env = gym.make(env_name)  # create the environment
# state space dimension
state_dim = env.observation_space.shape[0]
# action space dimension
action_dim = env.action_space.shape[0]

# Apply the configured seed (0 means "do not seed") so a non-zero random_seed
# actually makes the run repeatable.
if random_seed:
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)
    env.seed(random_seed)


# logging

# One log file per run, so logs from earlier runs are NOT overwritten.
log_dir = "Logs"
os.makedirs(log_dir, exist_ok=True)  # create the log directory if it is missing

#### number of files already in the log directory gives the index of this run
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)

#### create new log file for each run
log_f_name = log_dir + '/PPO_' + env_name + "_log_" + str(run_num) + ".csv"
print("logging at : " + log_f_name)


# checkpointing

run_num_pretrained = 0  # change this to prevent overwriting weights in same env_name folder

# Checkpoints/<env_name>/ ; makedirs creates the parent directory as well
directory = "Checkpoints" + '/' + env_name + '/'
os.makedirs(directory, exist_ok=True)

checkpoint_path = directory + "checkpoint_{}_{}.pth".format(random_seed, run_num_pretrained)
print("save checkpoint path : " + checkpoint_path)
print("state space dimension : ", state_dim)
print("action space dimension : ", action_dim)

# training procedure

Agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip,  action_std)  # initialize a PPO agent

# open the log file for this run
log_f = open(log_f_name, "w+")

# try/finally so the log file and the environment are released even if
# training is interrupted (e.g. the cell is stopped by hand).
try:
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0
    log_running_reward = 0
    log_running_episodes = 0
    time_step = 0  # number of timesteps taken so far
    episode_n = 0  # number of finished episodes so far

    # training loop
    while time_step <= max_training_timesteps:  # while training is not done
        # new episode
        state = env.reset()  # reset the environment; returns an initial observation
        current_ep_reward = 0  # reset reward for each episode

        # max_ep_len + 1 so an episode may run for the full max_ep_len steps
        for t in range(1, max_ep_len + 1):

            # select action with policy
            action = Agent.select_action(state)
            state, reward, done, _ = env.step(action)  # returns observation, reward, done and info

            Agent.buffer.rewards.append(reward)  # saving reward
            Agent.buffer.dones.append(done)  # saving is_terminals

            time_step += 1  # the agent took a step
            current_ep_reward += reward  # accumulate the reward for each episode

            # update PPO agent each update_timestep
            if time_step % update_timestep == 0:
                Agent.update()

            # log in logging file; skipped when no episode has finished since the
            # last log line, because the average would be a division by zero
            if time_step % log_freq == 0 and log_running_episodes > 0:

                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                log_avg_reward = round(log_avg_reward, 4)

                log_f.write('{},{},{}\n'.format(episode_n, time_step, log_avg_reward))
                log_f.flush()
                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward (same zero-episode guard as above)
            if time_step % print_freq == 0 and print_running_episodes > 0:

                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(episode_n, time_step, print_avg_reward))

                print_running_reward = 0
                print_running_episodes = 0

            # save model weights
            if time_step % save_model_freq == 0:
                Agent.save(checkpoint_path)
                print("model saved")

            # break; if the episode is over
            if done:
                break

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        episode_n += 1  # increment number of episodes
finally:
    log_f.close()
    env.close()



logging at : Logs/PPO_RoboschoolAtlasForwardWalk-v1_log_1.csv
save checkpoint path : Checkpoints/RoboschoolAtlasForwardWalk-v1/checkpoint_0_0.pth
state space dimension :  70
action space dimension :  30
Episode : 422 		 Timestep : 10000 		 Average Reward : 49.33
model saved
Episode : 848 		 Timestep : 20000 		 Average Reward : 49.85
model saved
Episode : 1271 		 Timestep : 30000 		 Average Reward : 51.04
model saved
Episode : 1692 		 Timestep : 40000 		 Average Reward : 51.01
model saved
Episode : 2109 		 Timestep : 50000 		 Average Reward : 52.33
model saved
Episode : 2529 		 Timestep : 60000 		 Average Reward : 53.01
model saved
Episode : 2940 		 Timestep : 70000 		 Average Reward : 55.69
model saved
Episode : 3347 		 Timestep : 80000 		 Average Reward : 56.85
model saved
Episode : 3759 		 Timestep : 90000 		 Average Reward : 57.08
model saved
Episode : 4167 		 Timestep : 100000 		 Average Reward : 58.23
model saved
Episode : 4572 		 Timestep : 110000 		 Average Reward : 58.98
model 

**plotting learning curve**

In [None]:
import pandas as pd # data manipulation
import numpy as np #for mathematical operations on arrays
import matplotlib.pyplot as plt #for curves plotting 
# Read the training log. The path is relative, matching the "Logs" directory the training cell writes to,
# so it does not depend on an absolute /content/ working directory.
data= pd.read_csv('Logs/PPO_RoboschoolAtlasForwardWalk-v1_log_1.csv')

In [None]:
# Show the loaded log table (the training cell writes the columns episode, timestep, reward)
data

In [None]:
# Extract the x / y values of the learning curve from the log table as plain Python lists
episodes = data["episode"].tolist()
rewards = data["reward"].tolist()

In [None]:
# Plot the logged average reward against the episode count, with axis labels
# and a title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(episodes, rewards)
ax.set_xlabel("episode")
ax.set_ylabel("average reward")
ax.set_title("PPO learning curve")
plt.show()
