# PPO performance on Procgen
In this notebook we are going to implement a Proximal Policy Optimization (PPO) agent. The purpose of the experiment is to evaluate the generalization capabilities of the PPO agent on a subset of the environments provided by the Procgen benchmark.\
For our experiment we will use the following seeds and games:\
Seeds: 42,1377,47981\
Games: bigfish, dodgeball, fruitbot, maze, ninja, starpilot

# Useful links
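- [The 37 Implementation Details of Proximal Policy Optimization](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details)
- [OpenAI Spinning Up: Proximal Policy Optimization](https://spinningup.openai.com/en/latest/algorithms/ppo.html)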

In [None]:
#packages that need to be installed to run Procgen through the Gym API
#!pip install procgen gym==0.26.2 pygame ufal.pybox2d

# Importing

In [5]:
import numpy as np
import torch
from torch import nn
import random
import pandas as pd
import pickle
from collections import deque
import copy
import os,json 
import gym
from tqdm import tqdm
from torch.distributions import Categorical

## Reproducibility

In [6]:
# Experiment configuration: the seed and the Procgen game used by this run
SEED = 1377
GAME = 'starpilot'


def fix_seed(seed: int) -> None:
    """Seed the random number generators used in this notebook.

    The same value is handed to Python's ``random`` module, to NumPy's
    global generator and to PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``).

    Args:
        seed: the seed to use.
    """
    # The four calls are independent of each other, so their order is irrelevant
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)


# Seed everything once, before any network or environment is created
fix_seed(SEED)


# Network Architecture

The backbone architecture of both networks is inspired by the IMPALA paper ([https://arxiv.org/pdf/1802.01561](https://arxiv.org/pdf/1802.01561)) and is also used in the Procgen paper ([https://arxiv.org/pdf/1912.01588](https://arxiv.org/pdf/1912.01588)).


In [32]:
class ResidualBlock(nn.Module):
    """Pre-activation residual unit: ``x + conv2(relu(conv1(relu(x))))``.

    Both convolutions are 3x3, stride 1, padding 1 and keep the number of
    channels, so the output has the same shape as the input.

    Args:
        in_channels: number of channels of the input (and of the output).
    """

    def __init__(self, in_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels=in_channels, out_channels=in_channels, **conv_kwargs)
        # Registered but never applied in forward(); kept so the state_dict keys do not change
        self.batch_norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``x`` plus the output of the two-convolution residual branch."""
        branch = self.conv1(self.relu(x))
        branch = self.conv2(self.relu(branch))
        return branch + x
    
class ImpalaBlock(nn.Module):
    """Convolution, 3x3/stride-2 max-pooling, then two residual passes.

    Note that a single ``ResidualBlock`` instance is stored in ``self.res``
    and applied twice, so both residual passes share the same weights.

    Args:
        in_channels: number of channels of the input.
        out_channels: number of channels produced by the block.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=1, padding=1)
        # Registered but never applied in forward(); kept so the state_dict keys do not change
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.res = ResidualBlock(out_channels)

    def forward(self, x):
        """Return the down-sampled feature map (stride-2 pooling) after the residual passes."""
        features = nn.functional.max_pool2d(self.conv(x), kernel_size=3, stride=2, padding=1)
        # Same module applied twice: the two passes share parameters
        for _ in range(2):
            features = self.res(features)
        return features

class ActorImpalaModel(nn.Module):
    """Policy network: four Impala blocks, a 1x1 convolution and global average pooling.

    The forward pass returns, for every observation of the batch, a
    probability distribution (softmax) over ``output_size`` actions.

    Args:
        in_channels: number of channels of the observations.
        output_size: number of actions.
    """

    def __init__(self, in_channels, output_size):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.block4 = ImpalaBlock(in_channels=32, out_channels=64)
        self.relu = nn.ReLU()
        # `fc` and `out` are registered but not used by forward(); kept so the
        # state_dict keys do not change
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=256)
        self.out = nn.Linear(in_features=256, out_features=output_size)
        self.conv = nn.Conv2d(in_channels=64, out_channels=output_size, kernel_size=1, stride=1)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map channels-last observations ``(n_batch, size1, size2, n_channels)`` to action probabilities."""
        # Channels-last -> channels-first, then rescale the pixel values
        x = self.std(x.movedim(-1, 1))
        for block in (self.block1, self.block2, self.block3, self.block4):
            x = block(x)
        # 1x1 convolution gives one feature map per action; global average
        # pooling collapses every map to a single score
        scores = self.gap(self.conv(self.relu(x)))
        return self.softmax(torch.flatten(scores, 1))

    def std(self, x):
        """Scale the input by 1/255."""
        return x / 255.0

class CriticImpalaModel(nn.Module):
    """Value network: three Impala blocks followed by two fully connected layers.

    ``fc`` expects ``32 * 8 * 8`` input features, i.e. a 32-channel 8x8 map
    after the three stride-2 poolings (which is what 64x64 observations give).

    Args:
        in_channels: number of channels of the observations.
        output_size: size of the output layer.
    """

    def __init__(self, in_channels, output_size):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=128)
        self.out = nn.Linear(in_features=128, out_features=output_size)
        # `softmax` and `output_dim` are not used by forward()
        self.softmax = nn.Softmax(dim=1)
        self.output_dim = 1

    def forward(self, x):
        """Map channels-last observations to the output of the final linear layer."""
        # Channels-last -> channels-first, then rescale the pixel values
        x = self.std(x.movedim(-1, 1))
        for block in (self.block1, self.block2, self.block3):
            x = block(x)
        # Flatten the convolutional features for the fully connected head
        hidden = torch.flatten(self.relu(x), 1)
        hidden = self.tanh(self.fc(hidden))
        return self.out(hidden)

    def std(self, x):
        """Scale the input by 1/255."""
        return x / 255.0




        
        
        

## Model summary

In [8]:
"""
import torch
from torchsummary import summary
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
impala = ActorImpalaModel(3,15).to(device)
summary(impala, (64,64,3))
"""


"\nimport torch\nfrom torchsummary import summary\ndevice = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\nimpala = ActorImpalaModel(3,15).to(device)\nsummary(impala, (64,64,3))\n"

# PPO implementation

In [9]:
class PPOAgent():
    """Proximal Policy Optimization agent with separate actor and critic networks.

    The actor (`ActorImpalaModel`) outputs action probabilities, the critic
    (`CriticImpalaModel`) outputs one value per observation. Each network has
    its own AdamW optimizer.
    """

    def __init__(self,
                 in_ch,             #input channels
                 n_actions,         #action space size
                 n_envs = 48,       #n° of parallel environments
                 batch_size = 256,  #size of the batch
                 gamma = 0.99,      #PPO discount parameter
                 lam = 0.95,        #GAE parameter
                 epsilon = 0.2,     #PPO clipping parameter
                 lr_a = 5e-4,       #actor learning rate
                 lr_c = 5e-4,       #critic learning rate
                 epochs = 3,        #n° of epochs in PPO 
                 n_minibatch = 6,   #number of minibatch
                 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
                 critic_criterion=torch.nn.MSELoss(reduction='mean'),
                 entropy_coef = 0.01):  #weight of the entropy bonus in the actor loss

        self.in_ch = in_ch
        self.n_actions = n_actions
        self.n_envs = n_envs
        self.batch_size = batch_size
        self.gamma = gamma
        self.lam = lam
        self.epsilon = epsilon
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.epochs = epochs
        self.n_minibatch = n_minibatch
        self.device = device
        self.critic_criterion = critic_criterion
        self.entropy_coef = entropy_coef
        self.actor, self.critic = self.get_networks()
        self.actor_optimizer = torch.optim.AdamW(self.actor.parameters(), lr=self.lr_a, weight_decay=0.01)
        self.critic_optimizer = torch.optim.AdamW(self.critic.parameters(), lr=self.lr_c, weight_decay=0.01)

    def get_networks(self):
        """Build the actor and the critic and move them to ``self.device``.

        Returns:
            The tuple ``(actor_net, critic_net)``.
        """
        actor_net = ActorImpalaModel(self.in_ch, self.n_actions).to(self.device)
        critic_net = CriticImpalaModel(self.in_ch, 1).to(self.device)
        return actor_net, critic_net

    def get_action(self, state):
        """Sample one action for a single (un-batched) observation.

        Args:
            state: a single observation; it is converted to a float32 tensor
                and given a leading batch dimension of size 1.

        Returns:
            The tuple ``(prob, action)``: the probability the actor assigns to
            the sampled action, and the sampled action as a tensor.
        """
        #unsqueeze adds a dimension of size 1 to simulate a batch
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        probabilities = self.actor(state).squeeze()
        action = Categorical(probabilities).sample()
        return probabilities[action], action

    def play_step(self, obs, vec_env=None):
        """Sample one action per environment and advance the environments by one step.

        Args:
            obs: batch of observations, one per environment (n_env x (64,64,3)).
            vec_env: environment to step. When omitted, the module-level
                ``env`` is used, as in the original implementation.

        Returns:
            The tuple ``(next_states, rewards, dones, log_probs, actions)``.
            ``dones`` is the element-wise maximum of the two end-of-episode
            flags returned by ``step``; ``log_probs`` and ``actions`` are
            tensors that carry no gradient.
        """
        step_env = env if vec_env is None else vec_env
        # Rollout data never needs gradients: avoid building a graph at every step
        with torch.no_grad():
            dist = Categorical(self.actor(obs))
            actions = dist.sample()
            log_probs = dist.log_prob(actions)
        next_states, rewards, terminated, truncated, infos = step_env.step(actions.cpu().numpy())
        # An episode is over when it either terminated or was truncated
        dones = np.maximum(terminated, truncated)
        return next_states, rewards, dones, log_probs, actions

    #compute advantages and returns using GAE
    def compute_advs_rts(self, values, rewards, dones, next_values):
        """Compute GAE advantages and the corresponding returns.

        Args:
            values: critic values of the visited states, indexed by time step
                along the first dimension.
            rewards: rewards, indexed like ``values``.
            dones: boolean tensor indexed like ``values``; a true entry stops
                both the bootstrap and the advantage accumulation at that step.
            next_values: critic values of the states reached after the last step.

        Returns:
            The tuple ``(advantages, returns)``: two lists of float32 tensors
            with one entry per time step, in chronological order.
        """
        values = torch.cat([values, next_values.unsqueeze(0)])
        advantages = []
        returns = []
        gae = 0
        with torch.no_grad():
            for t in reversed(range(len(rewards))):
                not_done = ~dones[t]
                delta = rewards[t] + self.gamma * values[t + 1] * not_done - values[t]
                gae = delta + self.gamma * self.lam * gae * not_done
                advantages.append(gae.to(torch.float32))
                returns.append((gae + values[t]).to(torch.float32))
        # Filled from the last step backwards: restore chronological order
        advantages.reverse()
        returns.reverse()
        return advantages, returns

    def train(self, states, advantages, critic_targets, old_log_probs, actions):
        """Run ``self.epochs`` epochs of clipped-PPO updates on the collected data.

        All arguments are indexed by sample along their first dimension;
        ``advantages``, ``critic_targets`` and ``old_log_probs`` may be given
        either as ``(N,)`` or as ``(N, 1)`` tensors.

        Args:
            states: observations.
            advantages: advantages (normalised here before being used).
            critic_targets: regression targets for the critic.
            old_log_probs: log-probabilities of ``actions`` under the policy
                that collected the data.
            actions: actions taken.

        Returns:
            The tuple ``(mean actor loss, mean critic loss)`` over all the
            minibatch updates.
        """
        actor_losses = []
        critic_losses = []
        #advantages normalization
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        # Use the real amount of data instead of assuming n_envs*batch_size samples
        tot_samples = states.shape[0]
        # max(1, ...) keeps np.arange valid when there are fewer samples than minibatches
        minibatch_size = max(1, tot_samples // self.n_minibatch)
        starts = np.arange(0, tot_samples, minibatch_size)
        for _ in range(self.epochs):
            #randomly shuffle the batch
            tot_ids = np.arange(0, tot_samples)
            np.random.shuffle(tot_ids)
            for start in starts:
                ids = tot_ids[start:start + minibatch_size]
                mb_states = states[ids]
                # Flatten per-sample quantities so that (N,) and (N, 1) inputs
                # cannot broadcast against each other into (N, N)
                mb_advantages = advantages[ids].reshape(-1)
                mb_old_log_probs = old_log_probs[ids].reshape(-1)
                mb_targets = critic_targets[ids].reshape(-1)

                #compute the values according to the updated critic
                new_values = self.critic(mb_states).reshape(-1)
                #compute the action distribution according to the updated actor
                dist = Categorical(self.actor(mb_states))
                new_log_probs = dist.log_prob(actions[ids])
                #the entropy bonus is used to improve exploration
                entropy = dist.entropy()

                ratios = torch.exp(new_log_probs - mb_old_log_probs)
                clipped = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon)
                surrogate = torch.min(ratios * mb_advantages, clipped * mb_advantages)
                # Scalar losses: clipped surrogate objective minus the entropy bonus
                actor_loss = -surrogate.mean() - self.entropy_coef * entropy.mean()
                critic_loss = self.critic_criterion(new_values, mb_targets)

                #update actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()
                #update critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

                actor_losses.append(actor_loss.item())
                critic_losses.append(critic_loss.item())
        return np.mean(actor_losses), np.mean(critic_losses)

    #save and load models checkpoints
    def save_checkpoint(self, step, game=GAME, seed=SEED):
        """Save the actor and critic weights under ``models_checkpoints/``.

        Args:
            step: training iteration, used in the file names.
            game: game name, used in the file names.
            seed: seed, used in the file names.
        """
        filename1 = 'actor_checkpoint'
        filename2 = 'critic_checkpoint'
        models_folder = "models_checkpoints"
        os.makedirs(models_folder, exist_ok=True)
        #actor saving
        path = f"{models_folder}/{game}_{filename1}_{step}_{seed}.pt"
        torch.save(self.actor.state_dict(), path)
        #critic saving
        path = f"{models_folder}/{game}_{filename2}_{step}_{seed}.pt"
        torch.save(self.critic.state_dict(), path)
        print(f"Checkpoint:{step} with seed:{seed} created!")

    def load_checkpoint(self, step, game=GAME, seed=SEED):
        """Load the actor and critic weights written by ``save_checkpoint``.

        Args:
            step: iteration of the checkpoint; must be a positive multiple of 250.
            game: game name used in the file names.
            seed: seed used in the file names.

        Raises:
            ValueError: if ``step`` is not a positive multiple of 250.
        """
        if step % 250 != 0 or step <= 0:
            raise ValueError("The step should be a multiple of 250 and greater than zero")
        models_folder = "models_checkpoints"
        filename1 = 'actor_checkpoint'
        filename2 = 'critic_checkpoint'
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine
        #actor loading
        path = f"{models_folder}/{game}_{filename1}_{step}_{seed}.pt"
        actor_dict = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(actor_dict)
        #critic loading
        path = f"{models_folder}/{game}_{filename2}_{step}_{seed}.pt"
        critic_dict = torch.load(path, map_location=self.device)
        self.critic.load_state_dict(critic_dict)
        print(f"Checkpoint:{step} have been loaded")


    

# Utils

## Save and load models

In [10]:
def save_model_weights(model_dict,seed=SEED,game=GAME,best=True):
    """Write a model state dict to ``best_models/`` or ``models/``.

    Args:
        model_dict: the object to serialise with ``torch.save``.
        seed: seed, used in the file name.
        game: game name, used in the file name.
        best: when true the file is ``best_models/{game}_best_actor_{seed}.pt``,
            otherwise ``models/{game}_model_{seed}.pt``.
    """
    # The two cases only differ in the target folder and in the file stem
    if best:
        models_folder, stem = "best_models", 'best_actor'
    else:
        models_folder, stem = "models", 'model'
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    torch.save(model_dict, f"{models_folder}/{game}_{stem}_{seed}.pt")
    label = "best" if best else ""
    print(f"{label} model weights saved")
        
def load_model_weights(seed=SEED,game=GAME,best=True,map_location=None):
    """Load a state dict previously written by ``save_model_weights``.

    The file name includes the game prefix, exactly as ``save_model_weights``
    writes it. If that file is missing but a file with the older un-prefixed
    name (``{stem}_{seed}.pt``) exists, the older file is loaded instead.

    Args:
        seed: seed used in the file name.
        game: game name used in the file name.
        best: when true read from ``best_models/`` (stem ``best_actor``),
            otherwise from ``models/`` (stem ``model``).
        map_location: forwarded to ``torch.load``; pass e.g. ``"cpu"`` to load
            weights saved on a GPU on a machine without one.

    Returns:
        The loaded state dict.
    """
    if best:
        models_folder, stem = "best_models", 'best_actor'
    else:
        models_folder, stem = "models", 'model'
    # Same naming scheme as save_model_weights (the game prefix was missing here)
    path = f"{models_folder}/{game}_{stem}_{seed}.pt"
    legacy_path = f"{models_folder}/{stem}_{seed}.pt"
    if not os.path.exists(path) and os.path.exists(legacy_path):
        path = legacy_path
    model_dict = torch.load(path, map_location=map_location)
    b = "best" if best else ""
    print(f"{b} model weights loaded")

    return model_dict
 


## Save and load data

In [11]:
def save_data(df,train=True,game=GAME,seed=SEED):
    """Write a scores DataFrame to ``results/{game}_{train|test}_scores_{seed}.csv``.

    Args:
        df: the DataFrame to save (written with ``DataFrame.to_csv``).
        train: selects the ``train_scores`` (true) or ``test_scores`` (false) file name.
        game: game name, used in the file name.
        seed: seed, used in the file name.
    """
    filename = 'train_scores' if train else 'test_scores'
    results_folder = "results"
    os.makedirs(results_folder, exist_ok=True)
    # Use the `seed` argument (the module-level SEED was used here by mistake)
    path = f"{results_folder}/{game}_{filename}_{seed}.csv"
    df.to_csv(path)
    print(f"file created in {path}")
          
def load_data(game,seed,train=True):
    """Read a scores CSV from ``results/{game}_{train|test}_scores_{seed}.csv``.

    Args:
        game: game name used in the file name.
        seed: seed used in the file name.
        train: selects the ``train_scores`` (true) or ``test_scores`` (false) file.

    Returns:
        The DataFrame returned by ``pandas.read_csv``.
    """
    filename = 'train_scores' if train else 'test_scores'
    results_folder = "results"
    # Use the `seed` argument (the module-level SEED was used here by mistake)
    path = f"{results_folder}/{game}_{filename}_{seed}.csv"
    df = pd.read_csv(path)
    print(f"file csv read from {path}")
    return df

# Evaluation

In [12]:
#evaluate the agent on a single (non-vectorised) environment
def evaluate(agent,test_env,test_size=10):
    """Play ``test_size`` episodes and return the average score and length.

    The actor is switched to evaluation mode while the episodes are played and
    is always switched back to training mode afterwards, even if the
    environment raises.

    Args:
        agent: object exposing ``actor`` (with ``eval()``/``train()``) and
            ``get_action(state)`` returning ``(prob, action)``.
        test_env: environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns
            ``(next_state, reward, done, truncated, info)``.
        test_size: number of episodes to play.

    Returns:
        The tuple ``(average_score, average_length)`` over the played episodes.
    """
    scores = []
    lengths = []
    #pass the model to evaluation mode, to deal with batch_norm and dropout layers
    agent.actor.eval()
    try:
        with torch.no_grad():
            for _ in range(test_size):
                episode_return = 0
                episode_length = 0
                done = False
                truncated = False
                state, _ = test_env.reset()
                while not (done or truncated):
                    _, action = agent.get_action(state)
                    state, reward, done, truncated, _ = test_env.step(int(action.item()))
                    episode_length += 1
                    episode_return += reward
                scores.append(episode_return)
                lengths.append(episode_length)
    finally:
        #return back to training mode
        agent.actor.train()
    return np.mean(scores), np.mean(lengths)


# Gym Environments

In [35]:
# Build the three Procgen environments used by the experiment:
#   env       - vectorised environment (n_envs copies) used to collect training data
#   test_env  - single environment used to evaluate generalisation
#   train_env - single environment restricted to the training levels
game = GAME
n_envs = 48
test_size = 10
#train on a fixed subset of levels in easy mode
env = gym.vector.make(f'procgen:procgen-{game}-v0',
               #render='human', 
               num_levels = 200, 
               distribution_mode = 'easy', 
               #use_backgrounds=False,
               apply_api_compatibility = True,
               start_level = SEED,
               rand_seed = SEED,
               num_envs=n_envs
               )
print(f"observation space shape:{env.observation_space.shape}")
print(f"action space size: {env.action_space[0]}")

# NOTE(review): shape[0] of the vectorised observation space is its leading
# dimension (48 in the printed output, i.e. the number of environments), not a state size
n_states = env.observation_space.shape[0]
# NOTE(review): this takes the first element of the vectorised action space
# (printed above as Discrete(15)); the Run cell later rebinds n_actions to 15
n_actions = np.array(env.action_space)[0]


#evaluate the agent on the full distribution of levels
# num_levels is left commented out here, so the library default applies -
# presumably the unrestricted level set; TODO confirm against the Procgen docs
test_env = gym.make(f'procgen:procgen-{game}-v0',
                   #render='human', 
                    distribution_mode = 'easy',
                    #num_levels = 0,
                    #use_backgrounds=False,
                    apply_api_compatibility = True,
                    start_level = SEED,
                    rand_seed = SEED)
#use this env to evaluate the agent on the training levels
# same num_levels / start_level as the vectorised training env above
train_env = gym.make(f'procgen:procgen-{game}-v0',
                   #render='human', 
                    distribution_mode = 'easy',
                    num_levels = 200,
                    #use_backgrounds=False,
                    apply_api_compatibility = True,
                    start_level = SEED,
                    rand_seed = SEED)

observation space shape:(48, 64, 64, 3)
action space size: Discrete(15)


# Execution

In [36]:
def exec(agent, 
         env, #training environment
         train_env, #evaluation environment for train levels
         test_env,#evaluation environment for test levels
         iterations, #total iterations
         batch_size, #PPO:batch_size 
         n_envs, #number of parallel environments
         results, #dictionary to memorize all the information
         device, 
         seed=SEED,
         lr_anneal_iters=1500): #iterations after which the linearly annealed learning rates reach zero
    """Train ``agent`` with PPO, evaluating and checkpointing along the way.

    Every iteration collects ``batch_size`` vectorised steps, computes GAE
    advantages/returns and calls ``agent.train``. Every 2 iterations the agent
    is evaluated on ``test_env`` and ``train_env``; every 100 iterations the
    collected results are pickled to ``results/{GAME}_all_res_{seed}.pkl``;
    every 250 iterations a checkpoint and the best actor weights are saved.

    NOTE(review): ``agent.play_step`` is called with the observations only, so
    the environment that actually gets stepped is chosen by that method, not
    by the ``env`` argument (which is only reset here) - confirm that both
    refer to the same object.

    Args:
        agent: the PPO agent.
        env: vectorised training environment.
        train_env: single environment used to score the agent on the training levels.
        test_env: single environment used to score the agent on the test levels.
        iterations: number of collect/update iterations.
        batch_size: number of vectorised steps collected per iteration.
        n_envs: number of parallel environments.
        results: dictionary updated in place with all the collected information.
        device: torch device used for the collected tensors.
        seed: seed used in the names of the files written to disk.
        lr_anneal_iters: horizon of the linear learning-rate decay; the scale
            is ``1 - (it - 1) / lr_anneal_iters``, clamped at zero.

    Returns:
        ``results``, the dictionary that was passed in.
    """
    eval_every, pickle_every, checkpoint_every = 2, 100, 250

    # state_dict() returns references to the live parameters: take a deep copy,
    # otherwise the "best" weights would silently follow the current ones
    best_model = {'model': copy.deepcopy(agent.actor.state_dict()),
                  'score': 0,
                  'iteration': 0}
    #save results at each checkpoint in pickle file
    results_folder = "results"
    filename = 'all_res'
    os.makedirs(results_folder, exist_ok=True)
    path = f"{results_folder}/{GAME}_{filename}_{seed}.pkl"

    #averages over the last 100 terminated training episodes
    train_scores = []
    train_steps = []
    train_lengths = []
    #buffers used to save scores of evaluation
    test_scores = []
    test_steps = []
    test_lengths = []
    ftr_scores = []
    ftr_lengths = []
    #rewards of all the terminated episodes, from all the environments
    terminated_ep_rewards = []
    actor_loss = []
    critic_loss = []

    # Every container below is only ever mutated in place, so publishing the
    # dictionary once keeps `results` up to date at every iteration
    final_res = {'train_scores':train_scores,'train_steps':train_steps,'train_lengths':train_lengths,
                 'test_scores':test_scores,'test_steps':test_steps,'test_lengths':test_lengths,
                 'tot_eps_rws':terminated_ep_rewards,'best_model':best_model,'actor_loss':actor_loss,
                 'critic_loss':critic_loss,'ftr_scores': ftr_scores,'ftr_lengths':ftr_lengths}
    results.update(final_res)

    #count the total (vectorised) steps
    tot_steps = 0
    #buffers used to compute the average over the last 100 terminated episodes
    temp_scores = []
    temp_lengths = []
    # Per-environment running return/length. They live outside the iteration
    # loop because episodes continue across iterations; resetting them at each
    # iteration would truncate the score of every episode spanning two iterations
    sum_rewards = np.zeros((n_envs,), dtype=float)
    sum_lengths = np.zeros((n_envs,), dtype=int)

    #reset the environments
    states, _ = env.reset()
    # torch.tensor copies, so stored observations never alias the env buffers
    states = torch.tensor(np.asarray(states), device=device)

    for it in tqdm(range(1, iterations + 1)):
        #linearly reduce the learning rates every iteration (never below zero)
        scale = max(0.0, 1 - ((it - 1) / lr_anneal_iters))
        agent.actor_optimizer.param_groups[0]["lr"] = agent.lr_a * scale
        agent.critic_optimizer.param_groups[0]["lr"] = agent.lr_c * scale

        #buffers used to collect information from the environments
        batch_states = []
        batch_values = []
        batch_rewards = []
        batch_dones = []
        batch_actions = []
        batch_log_probs = []

        for t in range(batch_size):
            #play a single step in the environments
            next_states, rewards, dones, log_probs, actions = agent.play_step(states)
            finished = np.asarray(dones, dtype=bool)
            #add the new rewards to the previous ones
            sum_rewards += rewards
            sum_lengths += 1
            for i in np.flatnonzero(finished):
                terminated_ep_rewards.append(sum_rewards[i])
                temp_scores.append(sum_rewards[i])
                temp_lengths.append(sum_lengths[i])
            #compute an average score of the last 100 terminated episodes
            if len(temp_scores) >= 100:
                train_scores.append(np.mean(temp_scores[-100:]))
                #save also the total num of steps when average is computed
                train_steps.append(tot_steps)
                train_lengths.append(np.mean(temp_lengths[-100:]))
                #reset the buffer for new collection
                temp_scores = []
                temp_lengths = []
            #reset return and length of the terminated episodes
            sum_rewards[finished] = 0
            sum_lengths[finished] = 0
            #compute state values using the critic (no gradient needed here)
            with torch.no_grad():
                values = agent.critic(states).squeeze(-1)
            #update the batches
            batch_values.append(values)
            batch_rewards.append(rewards)
            batch_states.append(states)
            batch_dones.append(dones)
            batch_actions.append(actions)
            batch_log_probs.append(log_probs)
            states = torch.tensor(np.asarray(next_states), device=device)
            #increase the tot_step
            tot_steps += 1

        #values of the last states, used to bootstrap the advantages
        with torch.no_grad():
            next_values = agent.critic(states).squeeze(-1)
        batch_values = torch.stack(batch_values).to(device)
        batch_dones = torch.tensor(np.asarray(batch_dones), device=device, dtype=torch.bool)
        batch_rewards = torch.tensor(np.asarray(batch_rewards), device=device)
        #compute GAE advantages and returns (used as critic targets) for each timestep
        advs, returns = agent.compute_advs_rts(batch_values, batch_rewards, batch_dones, next_values)

        #collapse the time and environment dimensions into a single sample dimension
        batch_states = torch.stack(batch_states).flatten(0, 1)
        advs = torch.stack(advs).reshape(-1, 1).to(device)
        returns = torch.stack(returns).reshape(-1, 1).to(device)
        batch_actions = torch.stack(batch_actions).reshape(-1).to(device).detach()
        batch_log_probs = torch.stack(batch_log_probs).reshape((-1, 1)).to(device).detach()

        #learning phase
        a_loss, c_loss = agent.train(batch_states, advs, returns, batch_log_probs, batch_actions)
        #save losses
        actor_loss.append(a_loss)
        critic_loss.append(c_loss)

        #evaluation phase
        if it % eval_every == 0:
            test_score, test_l = evaluate(agent, test_env)
            ftr_score, ftr_l = evaluate(agent, train_env)
            ftr_scores.append(ftr_score)
            ftr_lengths.append(ftr_l)
            test_scores.append(test_score)
            test_steps.append(tot_steps)
            test_lengths.append(test_l)
            # NOTE(review): the candidate is the mean of the last two test
            # scores, but the value stored as 'score' is the last score only
            if best_model['score'] <= np.mean(test_scores[-2:]):
                #snapshot (deep copy) the model parameters
                best_model.update({'model': copy.deepcopy(agent.actor.state_dict()),
                                   'score': test_score,
                                   'iteration': it})

            print(f"iteration:{it},'tot_steps': {tot_steps}, test_score:{test_score}, train_score:{ftr_score}")
        if it % checkpoint_every == 0:
            print(f"iteration:{it}, tot_steps:{tot_steps}")
            #save checkpoint
            agent.save_checkpoint(step=it, seed=seed)
            #save best_model
            save_model_weights(best_model['model'], seed=seed, best=True)
        if it % pickle_every == 0:
            #save results to pkl file
            with open(path, 'wb') as f:
                pickle.dump(final_res, f)

    return results

# Run

In [37]:
# Training budget: iterations * batch_size * n_envs = 1000 * 256 * 48 = 12 288 000 environment steps
# Hyper-parameters of this run
iterations = 1000
batch_size = 256
n_actions = 15
in_ch = 3
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = PPOAgent(in_ch, n_actions, n_envs=n_envs, batch_size=batch_size, device=device)

In [None]:
# Launch the training loop. `results` is updated in place by exec, which also
# returns the same dictionary
results = {}
final_results = exec(agent=agent,
                     env=env,
                     train_env=train_env,
                     test_env=test_env,
                     iterations=iterations,
                     batch_size=batch_size,
                     n_envs=n_envs,
                     results=results,
                     device=device,
                     seed=SEED)