In [None]:
!pip install procgen
#!pip install gym==0.26.2 
!pip install torchsummary
#!pip install pygame ufal.pybox2d

# Importing

In [None]:
import numpy as np
import torch
from torch import nn
import random
import pandas as pd
import pickle
from collections import deque
import copy
import os,json 
import gym
from tqdm import tqdm
from torch.distributions import Categorical

## Reproducibility

This notebook is executed with 3 different seeds for each game.\
seeds: 42, 1377, 47981\
games: coinrun, fruitbot\
plus one additional run with the squeeze-and-excitation module enabled.

In [None]:
# Fix the random state for reproducibility
SEED = 42
GAME = 'jumper'

def fix_seed(seed: int) -> None:
    """Fix the sources of randomness used by this notebook.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU generator plus every visible CUDA device), and asks cuDNN to pick
    deterministic kernels.

    Args:
        seed: the seed to use.
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every GPU, not only the current one; it is a
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning may select different (non-deterministic) kernels from
    # run to run; disable it so repeated runs with the same seed match.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

#For reproducibilty reasons we fix the seed 
fix_seed(SEED)


# Network Architecture

In [None]:
#64-3+2 / 1 + 1 64
#64-3+2 / 1 + 1 64
#64
#H and W reduction with maxpool
#64-3+2 / 2 = 32
#32-3+2 / 2 = 16
#16-3+2 / 2 = 8 
#modified impala, after each convolutional layer, add a batch normalization layer
class ResidualBlock(nn.Module):
    """Residual unit computing ``x + conv2(relu(conv1(relu(x))))``.

    Both convolutions are 3x3 with stride 1 and padding 1 and keep the channel
    count, so the output has the same shape as the input.
    """

    def __init__(self,in_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        self.conv1 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.conv2 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        # Registered on the module (so it appears in state_dict) but not
        # applied in forward().
        self.batch_norm = nn.BatchNorm2d(in_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two pre-activated convolutions and add the skip connection."""
        hidden = self.conv1(self.relu(x))
        residual = self.conv2(self.relu(hidden))
        return residual + x

class SqueezeExcitation(nn.Module):
    """Squeeze-and-excitation gate: rescales each channel by a learned weight in (0, 1).

    The per-channel spatial mean is passed through a two-layer bottleneck
    (``in_channels -> in_channels // ratio -> in_channels``) followed by a
    sigmoid, and the input is multiplied channel-wise by the result.

    Args:
        in_channels: number of channels of the input feature map.
        ratio: reduction ratio of the bottleneck.
    """

    def __init__(self,in_channels,ratio = 16):
        super().__init__()
        # Never let the bottleneck collapse to zero features: with
        # in_channels < ratio the integer division yields 0, which would make
        # the gate independent of the input.
        squeezed = max(1, in_channels // ratio)

        # Kept for state/summary compatibility; forward() uses a plain mean
        # over the spatial dimensions instead.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # squeeze the number of channels
        self.fc1 = nn.Linear(in_channels, squeezed)
        self.relu = nn.ReLU()
        # expand back to the original number of channels
        self.fc2 = nn.Linear(squeezed, in_channels)
        self.sigmoid = nn.Sigmoid()

    def forward(self,input):
        """Return ``input`` scaled channel-wise by the learned gate.

        ``input`` is indexed as a 4-D tensor whose dims 2 and 3 are averaged
        away (the spatial dimensions of an NCHW feature map).
        """
        # global average pooling over the two trailing dimensions
        gate = input.mean([2,3])
        gate = self.sigmoid(self.fc2(self.relu(self.fc1(gate))))
        # restore two singleton dims so the gate broadcasts over them
        gate = gate.unsqueeze(2).unsqueeze(3)
        return input*gate
    
class ImpalaBlock(nn.Module):
    """Conv + batch-norm + max-pool stage followed by residual / SE refinement.

    The 3x3/stride-2 max pooling reduces the spatial resolution; the same
    ``res`` and ``se`` sub-modules are then applied twice in a row, so both
    passes share their weights.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.batch_norm = nn.BatchNorm2d(out_channels)
        self.res = ResidualBlock(out_channels)
        self.se = SqueezeExcitation(out_channels)

    def forward(self, x):
        """Run the stage on ``x`` and return the downsampled feature map."""
        x = self.batch_norm(self.conv(x))
        # Stateless pooling: the functional call is equivalent to building a
        # fresh nn.MaxPool2d(kernel_size=3, stride=2, padding=1) on every pass.
        x = nn.functional.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        # two passes through the SAME residual + squeeze-excitation modules
        for _ in range(2):
            x = self.se(self.res(x))
        return x

class ActorImpalaModel(nn.Module):
    """Policy network: four ``ImpalaBlock`` stages, a 1x1 conv head and global average pooling.

    The 1x1 convolution maps the 64 feature channels to ``output_size``
    channels; averaging each channel over space and applying a softmax yields
    one probability per action.

    ``tanh``, ``fc`` and ``out`` are registered but not used in ``forward``.
    """

    def __init__(self,in_channels,output_size):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.block4 = ImpalaBlock(in_channels=32, out_channels=64)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=256)
        self.out = nn.Linear(in_features=256, out_features=output_size)
        self.conv = nn.Conv2d(in_channels=64, out_channels=output_size, kernel_size=1, stride=1)
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return action probabilities for a channels-last batch of observations."""
        # channels-last (n_batch, size1, size2, n_channels) -> channels-first
        x = self.std(x.movedim(-1, 1))
        for stage in (self.block1, self.block2, self.block3, self.block4):
            x = stage(x)
        # 1x1 convolution to one channel per action, then global average pooling
        scores = self.gap(self.conv(self.relu(x)))
        return self.softmax(torch.flatten(scores, 1))

    def std(self,x):
        """Scale the input by 1/255."""
        return x / 255.0

class CriticImpalaModel(nn.Module):
    """Value network: three ``ImpalaBlock`` stages followed by a two-layer MLP head.

    The flattened feature map is expected to hold ``32 * 8 * 8`` values (the
    input size of ``fc``). ``output_size`` is accepted but the head always
    emits a single value per sample. ``softmax`` is registered but not used
    in ``forward``.
    """

    def __init__(self,in_channels,output_size):
        super().__init__()
        self.block1 = ImpalaBlock(in_channels=in_channels, out_channels=16)
        self.block2 = ImpalaBlock(in_channels=16, out_channels=32)
        self.block3 = ImpalaBlock(in_channels=32, out_channels=32)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(in_features=32 * 8 * 8, out_features=128)
        self.out = nn.Linear(128, 1)
        self.softmax = nn.Softmax(dim=1)
        self.output_dim = 1

    def forward(self, x):
        """Return one state-value estimate per observation in the batch."""
        # move the last (channel) dimension to position 1 and rescale
        x = self.std(x.movedim(-1, 1))
        for stage in (self.block1, self.block2, self.block3):
            x = stage(x)
        # flatten the conv features for the fully connected head
        features = torch.flatten(self.relu(x), 1)
        return self.out(self.tanh(self.fc(features)))

    def std(self,x):
        """Scale the input by 1/255."""
        return x / 255.0




        
        
        

In [None]:
class CriticNatureModel(nn.Module):
    """Value network made of three convolutions and two linear layers.

    NOTE(review): a second ``CriticNatureModel`` is defined further down in
    this same cell; after the cell runs, that later definition replaces this
    one, so this class is effectively unused.
    """
# Spatial-size notes, (in - kernel + 2*padding) / stride + 1:
#64-8+2 / 3 + 1 = 20
#20 -4 +2 / 2 +1 = 10
#15-3+2 + 1  = 15
#84-8+2 / 4 +1 = 20.5
#20 -4 + 2 /2  + 1 = 10 
    def __init__(self,in_ch,output_size= 1):
        super(CriticNatureModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_ch, out_channels=32, kernel_size=8, stride=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        # fc expects a flattened 64-channel 10x10 feature map
        self.fc = nn.Linear(10*10*64,256)
        self.out = nn.Linear(256,output_size)
        #self.softmax = nn.Softmax(dim=1)
    
    def forward(self,x):
        # NOTE(review): the last dim is moved to position 2 here, whereas the
        # sibling actor moves it to position 1 — presumably this expects a
        # 5-D frame-stacked input; confirm against the caller.
        x = x.movedim(-1,2)

        #x = x.view(x.shape[0],-1,64,64)
        # collapse everything between the batch dim and the 64x64 plane into channels
        x = x.reshape((x.size()[0],-1,64,64))
        x = self.std(x)
        x = self.conv1(x)
        x = self.conv2(x)
        # only the last convolution is followed by a ReLU
        x = self.relu(self.conv3(x))
        x = torch.flatten(x,1)
        x = self.fc(x)
        value = self.out(x)
        return value
    
    def std(self,x):
        # scale the input by 1/255
        x = x / 255.0
        return x    

class ActorNatureModel(nn.Module):
    """Policy network made of three convolutions and two linear layers.

    NOTE(review): a second ``ActorNatureModel`` is defined further down in
    this same cell; after the cell runs, that later definition replaces this
    one, so this class is effectively unused.
    """
# Spatial-size notes, (in - kernel + 2*padding) / stride + 1:
#64-8+2 / 3 + 1 = 20
#20 -4 +2 / 2 +1 = 10
#15-3+2 + 1  = 15
#84-8+2 / 4 +1 = 20.5
#20 -4 + 2 /2  + 1 = 10 
    def __init__(self,in_ch,output_size):
        super(ActorNatureModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_ch, out_channels=32, kernel_size=8, stride=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        # fc expects a flattened 64-channel 10x10 feature map
        self.fc = nn.Linear(10*10*64,512)
        self.out = nn.Linear(512,output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x,frame=True):
        # move the last dimension to position 1
        x = x.movedim(-1,1)
        if frame:
            # collapse everything between the batch dim and the 64x64 plane into channels
            # NOTE(review): presumably meant for frame-stacked input — confirm
            x = x.reshape((x.size()[0],-1,64,64))
        x = self.std(x)
        x = self.conv1(x)
        x = self.conv2(x)
        # only the last convolution is followed by a ReLU
        x = self.relu(self.conv3(x))
        x = torch.flatten(x,1)
        x = self.fc(x)
        logits = self.out(x)
        # probabilities over the output_size actions
        return self.softmax(logits)

    def std(self,x):
        # scale the input by 1/255
        x = x / 255.0
        return x    
class CriticNatureModel(nn.Module):
    """Small convolutional value network.

    Layer sizes for a 64x64 input, using (in - kernel + 2*padding) // stride + 1:
    conv1 (k5, s2, p1) -> 31, conv2 (k3, s1, p1) -> 31, 2x2 max-pool -> 15,
    conv3 (k3, s1, p1) -> 15, conv4 (k1, s1, p1) -> 17, hence the
    ``17*17*32`` input size of ``fc``.
    """

    def __init__(self,in_ch,output_size= 1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=5, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
        self.maxpool = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        # 1x1 convolution reducing the channels; padding=1 grows the map by 2
        self.conv4 = nn.Conv2d(64, 32, kernel_size=1, stride=1, padding=1)
        self.fc = nn.Linear(17*17*32, 128)
        self.out = nn.Linear(128, output_size)

    def forward(self,input):
        """Return the value estimate for a channels-last batch of images."""
        features = self.std(input.movedim(-1, 1))
        features = self.conv2(self.conv1(features))
        features = self.relu(self.maxpool(features))
        features = self.relu(self.conv4(self.conv3(features)))
        flat = torch.flatten(features, 1)
        return self.out(self.fc(flat))

    def std(self,x):
        """Scale the input by 1/255."""
        return x / 255.0


class ActorNatureModel(nn.Module):
    """Small convolutional policy network.

    Layer sizes for a 64x64 input, using (in - kernel + 2*padding) // stride + 1:
    conv1 (k5, s2, p1) -> 31, conv2 (k3, s1, p1) -> 31, 2x2 max-pool -> 15,
    conv3 (k3, s1, p1) -> 15, conv4 (k1, s1, p1) -> 17, hence the
    ``17*17*32`` input size of ``fc``.
    """

    def __init__(self,in_ch,output_size):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, 32, kernel_size=5, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1)
        self.maxpool = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        # 1x1 convolution reducing the channels; padding=1 grows the map by 2
        self.conv4 = nn.Conv2d(64, 32, kernel_size=1, stride=1, padding=1)
        self.fc = nn.Linear(17*17*32, 128)
        self.out = nn.Linear(128, output_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,input):
        """Return action probabilities for a channels-last batch of images."""
        features = self.std(input.movedim(-1, 1))
        features = self.conv2(self.conv1(features))
        features = self.relu(self.maxpool(features))
        features = self.relu(self.conv4(self.conv3(features)))
        logits = self.out(self.fc(torch.flatten(features, 1)))
        return self.softmax(logits)

    def std(self,x):
        """Scale the input by 1/255."""
        return x / 255.0




## Model summary

In [None]:
# torch is already imported in the imports cell; this repeat is harmless.
import torch
#from torchvision import models
from torchsummary import summary
# use the GPU when one is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
# 3 input channels, 15 actions
model = ActorNatureModel(3,15).to(device)
critic = CriticNatureModel(3,1).to(device)
impala = ActorImpalaModel(3,15).to(device)
# only the Impala actor is summarised; the input size is given channels-last
# because the model moves the channel dimension itself
summary(impala, (64,64,3))


# PPO implementation

In [None]:
class PPOAgent():
    """PPO agent with separate actor and critic networks and optimizers.

    Args:
        in_ch: number of input channels of the observations.
        n_actions: size of the discrete action space.
        n_envs: number of parallel environments used to collect a rollout.
        batch_size: number of steps collected per environment and rollout.
        gamma: discount factor.
        lam: GAE lambda.
        epsilon: clipping range of the PPO probability ratio.
        lr_a: actor learning rate.
        lr_c: critic learning rate.
        epochs: number of passes over each rollout.
        n_minibatch: number of minibatches each pass is split into.
        device: device on which the networks live.
        critic_criterion: loss used to fit the critic to the returns.
    """

    def __init__(self,
                 in_ch,
                 n_actions,
                 n_envs = 32,
                 batch_size = 256,
                 gamma = 0.99,
                 lam = 0.95,
                 epsilon = 0.2,
                 lr_a = 5e-4,
                 lr_c = 5e-4,
                 epochs = 3,
                 n_minibatch = 6,
                 device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
                 critic_criterion=torch.nn.MSELoss(reduction='mean')):

        self.in_ch = in_ch
        self.n_actions = n_actions
        self.n_envs = n_envs
        self.batch_size = batch_size
        self.gamma = gamma
        self.lam = lam
        self.epsilon = epsilon
        self.lr_a = lr_a
        self.lr_c = lr_c
        self.epochs = epochs
        self.n_minibatch = n_minibatch
        self.device = device
        self.critic_criterion = critic_criterion
        self.actor, self.critic = self.get_networks()
        self.actor_optimizer = torch.optim.AdamW(self.actor.parameters(), lr=self.lr_a, weight_decay=0.01)
        self.critic_optimizer = torch.optim.AdamW(self.critic.parameters(), lr=self.lr_c, weight_decay=0.01)

    def get_networks(self):
        """Build the actor and critic networks on ``self.device``."""
        actor_net = ActorImpalaModel(self.in_ch, self.n_actions).to(self.device)
        critic_net = CriticImpalaModel(self.in_ch, 1).to(self.device)
        return actor_net, critic_net

    def get_action(self,state):
        """Sample an action for a single observation.

        Returns:
            ``(prob, action)``: the probability the actor assigned to the
            sampled action, and the sampled action as a tensor.
        """
        # unsqueeze adds a leading dimension of size 1 to simulate a batch
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        probabilities = self.actor(state).squeeze()
        action = Categorical(probabilities).sample()
        return probabilities[action], action

    def get_state_value(self,state):
        """Return the critic's value estimate for a single observation."""
        state = np.array(state)
        # unsqueeze adds a leading dimension of size 1 to simulate a batch
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        return self.critic(state)

    def play_step(self,obs):
        """Sample one action per environment and step the environments.

        NOTE: this steps the module-level vectorized ``env`` (not an
        attribute of the agent), which must exist when the method is called.

        Args:
            obs: batch of observations, one per environment
                (n_env x (64,64,3) according to the original author).

        Returns:
            ``(next_states, rewards, dones, log_probs, actions)`` where
            ``dones`` is the element-wise OR of the two end-of-episode flags
            returned by ``env.step``.
        """
        # Nothing collected here is back-propagated through (the log-probs
        # are only used as constants in the PPO ratio), so skip the graph.
        with torch.no_grad():
            dist = Categorical(self.actor(obs))
            actions = dist.sample()
            log_probs = dist.log_prob(actions)
        next_states, rewards, terminated, truncated, infos = env.step(actions.cpu().numpy())
        # an episode is over when either flag is set
        dones = np.logical_or(terminated, truncated)
        return next_states, rewards, dones, log_probs, actions

    #compute advantages and returns as GAE
    def compute_advs_rts(self,values,rewards,dones,next_values):
        """Compute GAE advantages and the matching returns.

        Args:
            values: value estimates, indexed by time step on dim 0.
            rewards: rewards, indexed by time step on dim 0.
            dones: boolean tensor (``~`` is applied to it) marking the steps
                that ended an episode.
            next_values: value estimates of the states following the last step.

        Returns:
            ``(advantages, returns)``: two lists of float32 tensors, one entry
            per time step, in chronological order.
        """
        A_t = 0
        advantages = []
        returns = []
        values = torch.cat([values, next_values.unsqueeze(0)])
        with torch.no_grad():
            for t in reversed(range(len(rewards))):
                not_done = ~dones[t]
                delta = rewards[t] + self.gamma*values[t+1]*not_done - values[t]
                A_t = delta + self.gamma * self.lam * A_t * not_done
                advantages.append(A_t.to(torch.float32))
                returns.append((A_t + values[t]).to(torch.float32))
        # built back-to-front with O(1) appends; restore chronological order
        advantages.reverse()
        returns.reverse()
        return advantages, returns

    def train(self,states,advantages,critic_targets,old_log_probs,actions):
        """Run ``self.epochs`` passes of clipped-PPO updates over one rollout.

        Args:
            states: flattened batch of observations.
            advantages: advantages, one row per sample (column vector).
            critic_targets: returns used as regression targets for the critic.
            old_log_probs: log-probabilities of ``actions`` under the policy
                that collected the rollout (column vector).
            actions: actions taken, one per sample.

        Returns:
            ``(mean_actor_loss, mean_critic_loss)`` over all minibatch updates.
        """
        actor_losses = []
        critic_losses = []
        # advantage normalization
        advantages = (advantages - advantages.mean()) / (advantages.std()+1e-8)
        # size the minibatches from the data actually provided
        tot_samples = states.shape[0]
        minibatch_size = max(1, tot_samples // self.n_minibatch)
        starts = np.arange(0, tot_samples, minibatch_size)

        for _ in range(self.epochs):
            # fresh random permutation of the samples for every epoch
            tot_ids = np.arange(0, tot_samples)
            np.random.shuffle(tot_ids)
            for start in starts:
                ids = tot_ids[start:start+minibatch_size]

                # values according to the current critic
                new_values = self.critic(states[ids])

                # action distribution according to the current actor
                dist = Categorical(self.actor(states[ids]))
                new_log_probs = dist.log_prob(actions[ids]).unsqueeze(1)

                ratios = torch.exp(new_log_probs - old_log_probs[ids])
                clip = torch.clamp(ratios, 1-self.epsilon, 1+self.epsilon)

                # entropy bonus (coefficient 0.01) to encourage exploration
                entropy_loss = 0.01 * dist.entropy().mean()
                surrogate = torch.min(ratios*advantages[ids], clip*advantages[ids])
                actor_loss = -surrogate.mean() - entropy_loss

                critic_loss = self.critic_criterion(new_values, critic_targets[ids])

                # update actor
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # update critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

                actor_losses.append(actor_loss.item())
                critic_losses.append(critic_loss.item())
        return np.mean(actor_losses), np.mean(critic_losses)


    #save and load models checkpoints
    def save_checkpoint(self,step,game=GAME,seed=SEED):
        """Save actor and critic weights under ``models_checkpoints/``."""
        filename1 = 'actor_checkpoint'
        filename2 = 'critic_checkpoint'
        models_folder = "models_checkpoints"
        os.makedirs(models_folder, exist_ok=True)
        # actor saving
        path = f"{models_folder}/{game}_{filename1}_{step}_{seed}.pt"
        torch.save(self.actor.state_dict(), path)
        # critic saving
        path = f"{models_folder}/{game}_{filename2}_{step}_{seed}.pt"
        torch.save(self.critic.state_dict(), path)
        print(f"Checkpoint:{step} with seed:{seed} created!")

    def load_checkpoint(self,step,game=GAME,seed=SEED):
        """Load actor and critic weights saved by ``save_checkpoint``.

        Raises:
            Exception: if ``step`` is zero or not a multiple of 250.
        """
        if step%250 != 0 or step==0:
            raise Exception("The step should be a multiple of 250 and greater than zero")
        models_folder = "models_checkpoints"
        filename1 = 'actor_checkpoint'
        filename2 = 'critic_checkpoint'
        # map_location lets a checkpoint written on a GPU be restored on a
        # CPU-only machine (and vice versa).
        # actor loading
        path = f"{models_folder}/{game}_{filename1}_{step}_{seed}.pt"
        actor_dict = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(actor_dict)
        # critic loading
        path = f"{models_folder}/{game}_{filename2}_{step}_{seed}.pt"
        critic_dict = torch.load(path, map_location=self.device)
        self.critic.load_state_dict(critic_dict)
        print(f"Checkpoint:{step} have been loaded")



    

# Utils

## Save and load models

In [None]:
def save_model_weights(model_dict,seed=SEED,game=GAME,best=True):
    """Write ``model_dict`` to disk with ``torch.save``.

    The file goes to ``best_models/{game}_best_actor_{seed}.pt`` when ``best``
    is true and to ``models/{game}_model_{seed}.pt`` otherwise; the folder is
    created if it does not exist.
    """
    if best:
        models_folder, filename1 = "best_models", 'best_actor'
    else:
        models_folder, filename1 = "models", 'model'
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    torch.save(model_dict, f"{models_folder}/{game}_{filename1}_{seed}.pt")
    b = "best" if best else ""
    print(f"{b} model weights saved")
        
def load_model_weights(seed=SEED,game=GAME,best=True,map_location=None):
    """Load weights previously written by ``save_model_weights``.

    The file name now includes the ``{game}_`` prefix that
    ``save_model_weights`` writes; the previous, un-prefixed name is still
    tried as a fallback so files saved under that name remain loadable.

    Args:
        seed: seed that is part of the file name.
        game: game name that is part of the file name.
        best: load from ``best_models/`` (best actor) instead of ``models/``.
        map_location: forwarded to ``torch.load`` (e.g. ``'cpu'`` to load a
            GPU checkpoint on a CPU-only machine).

    Returns:
        The object stored in the file (the saved state dict).
    """
    if best:
        models_folder, filename1 = "best_models", 'best_actor'
    else:
        models_folder, filename1 = "models", 'model'
    path = f"{models_folder}/{game}_{filename1}_{seed}.pt"
    legacy_path = f"{models_folder}/{filename1}_{seed}.pt"
    if not os.path.exists(path) and os.path.exists(legacy_path):
        path = legacy_path
    # Loading must not create folders: a missing file should surface as an error.
    model_dict = torch.load(path, map_location=map_location)
    b = "best" if best else ""
    print(f"{b} model weights loaded")

    return model_dict
 


## Save and load data

In [None]:
def save_data(df,train=True,game=GAME,seed=SEED):
    """Write ``df`` to ``results/{game}_{train|test}_scores_{seed}.csv``.

    Args:
        df: DataFrame to save.
        train: selects the ``train_scores`` (True) or ``test_scores`` file name.
        game: game name that is part of the file name.
        seed: seed that is part of the file name.
    """
    filename = 'train_scores' if train else 'test_scores'
    results_folder = "results"
    os.makedirs(results_folder, exist_ok=True)
    # use the ``seed`` argument (the global SEED was used here before, which
    # silently ignored the caller's value)
    path = f"{results_folder}/{game}_{filename}_{seed}.csv"
    df.to_csv(path)
    print(f"file created in {path}")
          
def load_data(game,seed,train=True):
    """Read ``results/{game}_{train|test}_scores_{seed}.csv`` into a DataFrame.

    Args:
        game: game name that is part of the file name.
        seed: seed that is part of the file name.
        train: selects the ``train_scores`` (True) or ``test_scores`` file name.

    Returns:
        The DataFrame read with ``pd.read_csv``.
    """
    filename = 'train_scores' if train else 'test_scores'
    results_folder = "results"
    # use the ``seed`` argument (the global SEED was used here before, which
    # silently ignored the caller's value)
    path = f"{results_folder}/{game}_{filename}_{seed}.csv"
    df = pd.read_csv(path)
    print(f"file csv read from {path}")
    return df

# Evaluation

In [None]:
test_seeds = list(np.random.randint(low = 2000,high=5000, size = 50))
#evaluate the agent on the full distribution of levels
def fixed_set_evaluation(agent,test_seeds):
    """Play one episode per seed and return the mean episode return.

    A fresh environment is created for every seed (used both as
    ``start_level`` and ``rand_seed``) and closed once its episode is over.
    The actor is put in evaluation mode for the duration of the call and is
    always switched back to training mode, even if an episode raises.

    NOTE: the game name is read from the module-level ``game`` variable.

    Args:
        agent: object exposing ``actor`` and ``get_action(state)``.
        test_seeds: iterable of level seeds.

    Returns:
        ``np.mean`` of the per-episode returns.
    """
    scores = []
    agent.actor.eval()
    try:
        with torch.no_grad():
            for seed in test_seeds:
                test_env = gym.make(f'procgen:procgen-{game}-v0',
                                    distribution_mode = 'easy',
                                    apply_api_compatibility = True,
                                    start_level = int(seed),
                                    rand_seed = int(seed)
                                    )
                try:
                    episode_return = 0
                    done = False
                    truncated = False
                    state, _ = test_env.reset()
                    while not (done or truncated):
                        prob, action = agent.get_action(state)
                        act = int(action.detach().cpu().numpy())
                        state, reward, done, truncated, info = test_env.step(act)
                        episode_return = episode_return + reward
                    scores.append(episode_return)
                finally:
                    # one environment per seed: release it before the next one
                    test_env.close()
    finally:
        agent.actor.train()
    average_score = np.mean(scores)
    return average_score


In [None]:
#evaluation on the full distribution
def evaluate(agent,test_env,test_size=10):
    """Play ``test_size`` episodes in ``test_env`` and average the results.

    The actor is switched to evaluation mode (relevant for batch-norm and
    dropout layers) while playing and is always switched back to training
    mode afterwards, even if an episode raises.

    Args:
        agent: object exposing ``actor`` and ``get_action(state)``.
        test_env: environment whose ``reset()`` returns ``(state, info)`` and
            whose ``step(action)`` returns
            ``(state, reward, done, truncated, info)``.
        test_size: number of episodes to play.

    Returns:
        ``(average_score, average_length)`` over the played episodes.
    """
    scores = []
    lengths = []
    agent.actor.eval()
    try:
        with torch.no_grad():
            for _ in range(test_size):
                episode_return = 0
                episode_length = 0
                done = False
                truncated = False
                state, info = test_env.reset()
                while not (done or truncated):
                    prob, action = agent.get_action(state)
                    act = int(action.detach().cpu().numpy())
                    state, reward, done, truncated, info = test_env.step(act)
                    episode_length = episode_length + 1
                    episode_return = episode_return + reward
                scores.append(episode_return)
                lengths.append(episode_length)
    finally:
        # return to training mode no matter how the rollout ended
        agent.actor.train()
    average_score = np.mean(scores)
    average_length = np.mean(lengths)
    return average_score,average_length


# Gym Environments

In [None]:
# `game` is read as a global by fixed_set_evaluation, so this cell must run
# before that function is called.
game = GAME
n_envs = 48
test_size = 10
#train on a fixed subset of levels in easy mode
# NOTE: PPOAgent.play_step steps this module-level `env` directly.
env = gym.vector.make(f'procgen:procgen-{game}-v0',
               #render='human', 
               num_levels = 200, 
               distribution_mode = 'easy', 
               #use_backgrounds=False,
               apply_api_compatibility = True,
               start_level = SEED,
               rand_seed = SEED,
               num_envs=n_envs
               )
print(f"observation space shape:{env.observation_space.shape}")
print(f"action space size: {env.action_space}")

# NOTE(review): for a vectorized env, shape[0] of the observation space is
# presumably the number of environments rather than a state size — confirm.
n_states = env.observation_space.shape[0]
# NOTE(review): this indexes an array built from the (batched) action space;
# the result is presumably a space object rather than an int. `n_actions` is
# reassigned to 15 in the parameters cell before it is used — confirm.
n_actions = np.array(env.action_space)[0]


#evaluate the agent on the full distribution of levels
test_env = gym.make(f'procgen:procgen-{game}-v0',
                   #render='human', 
                    distribution_mode = 'easy',
                    #num_levels = 0,
                    #use_backgrounds=False,
                    apply_api_compatibility = True,
                    start_level = SEED,
                    rand_seed = SEED)
#use this env to evaluate the agent on the trained levels
# NOTE: exec() reads this `train_env` as a global.
train_env = gym.make(f'procgen:procgen-{game}-v0',
                   #render='human', 
                    distribution_mode = 'easy',
                    num_levels = 200,
                    #use_backgrounds=False,
                    apply_api_compatibility = True,
                    start_level = SEED,
                    rand_seed = SEED)

# Execution

In [None]:
def exec(agent,
         env,
         test_env,
         iterations,
         batch_size,
         n_envs,
         results,
         device,
         seed=SEED):
    """Run the PPO training loop and collect training / evaluation statistics.

    Each iteration collects ``batch_size`` steps from the vectorized
    environments, computes GAE advantages, runs ``agent.train`` on the
    flattened rollout and, every second iteration, evaluates the agent.

    NOTE: besides its arguments this function relies on module-level names:
    ``train_env`` (evaluation on the training levels), ``GAME`` (file names)
    and, through ``agent.play_step``, the module-level ``env``. ``env`` passed
    here is only used for the initial ``reset()``.

    Args:
        agent: the ``PPOAgent`` to train.
        env: vectorized training environment.
        test_env: single environment used for evaluation.
        iterations: number of rollout + update iterations.
        batch_size: steps collected per environment in every iteration.
        n_envs: number of parallel environments in ``env``.
        results: dict updated in place with the collected statistics.
        device: device on which rollout tensors are created.
        seed: seed used in the names of the files written to disk.

    Returns:
        ``results``, updated with the collected statistics.
    """
    # state_dict() returns tensors that share storage with the live
    # parameters, so a deep copy is required to keep a real snapshot of the
    # best model instead of a view of the current one.
    best_model = {'model':copy.deepcopy(agent.actor.state_dict()),
                  'score':0,
                  'iteration':0}
    #save results at each checkpoint
    results_folder = "results"
    filename = 'all_res'
    os.makedirs(results_folder, exist_ok=True)
    path = f"{results_folder}/{GAME}_{filename}_{seed}.pkl"

    #every 100 terminated episodes an average is stored, then collection restarts
    train_scores = []
    train_steps = []
    train_lengths = []
    test_scores = []
    test_steps = []
    test_lengths = []
    ftr_scores = [] #fixed train env
    ftr_lengths = []
    #count the total number of (vectorized) steps
    tot_steps = 0
    #returns of all the terminated episodes, from all the environments
    terminated_ep_rewards = []
    #buffers used to average the last 100 terminated episodes
    temp_scores = []
    temp_lengths = []
    actor_loss = []
    critic_loss = []

    def collect_results():
        """Bundle the statistics gathered so far into one dict."""
        return {'train_scores':train_scores,'train_steps':train_steps,'train_lengths':train_lengths,
                'test_scores':test_scores,'test_steps':test_steps,'test_lengths':test_lengths,
                'tot_eps_rws':terminated_ep_rewards,'best_model':best_model,'actor_loss':actor_loss,
                'critic_loss':critic_loss,'ftr_scores': ftr_scores,'ftr_lengths':ftr_lengths}

    #reset the environments
    states,_ = env.reset()
    states = torch.stack([torch.tensor(o,device=device) for o in states])

    # Running return / length of the episode currently played in each
    # environment. They live outside the iteration loop because episodes
    # continue across rollout boundaries; resetting them every iteration
    # would truncate the logged return and length of those episodes.
    sum_rewards = np.zeros((n_envs,), dtype=float)
    sum_lengths = np.zeros((n_envs,), dtype=int)

    for it in tqdm(range(1,iterations+1)):
        #linearly anneal the learning rates at each iteration
        scale = 1 - ((it-1)/1500)
        agent.actor_optimizer.param_groups[0]["lr"] = agent.lr_a * scale
        agent.critic_optimizer.param_groups[0]["lr"] = agent.lr_c * scale

        #used to collect information from the environments
        batch_states = []
        batch_values = []
        batch_rewards =  []
        batch_dones = []
        batch_actions = []
        batch_log_probs = []

        for t in range(batch_size):
            #play a single step in the environments
            next_states, rewards,dones,log_probs,actions = agent.play_step(states)
            #add the new rewards to the previous ones
            sum_rewards = sum_rewards + rewards
            sum_lengths = sum_lengths + 1
            for i in range(len(dones)):
                if dones[i]:
                    terminated_ep_rewards.append(sum_rewards[i])
                    temp_scores.append(sum_rewards[i])
                    temp_lengths.append(sum_lengths[i])
            #compute an average score of the last 100 terminated episodes
            if len(temp_scores)>=100:
                train_scores.append(np.mean(temp_scores[-100:]))
                #save also the total num of steps when the average is computed
                train_steps.append(tot_steps)
                train_lengths.append(np.mean(temp_lengths[-100:]))
                #reset the buffers for a new collection
                temp_scores = []
                temp_lengths = []
            #reset return and length of the environments whose episode ended
            sum_rewards[dones] = 0
            sum_lengths[dones] = 0
            #state values from the critic; used only as constants for GAE
            with torch.no_grad():
                values = agent.critic(states).squeeze().to(device)
            #update the batches
            batch_values.append(values)
            batch_rewards.append(rewards)
            batch_states.append(states)
            batch_dones.append(dones)
            batch_actions.append(actions)
            batch_log_probs.append(log_probs)
            states = torch.stack([torch.tensor(o,device=device) for o in next_states])
            #increase the tot_step
            tot_steps += 1
        #values of the states following the last step, needed by GAE
        with torch.no_grad():
            next_values = agent.critic(states).squeeze().to(device)
        batch_values = torch.stack(batch_values).to(device)
        batch_dones = torch.stack([torch.tensor(d, device=device, dtype=torch.bool) for d in batch_dones]).to(device)
        batch_rewards = torch.stack([torch.tensor(r, device=device) for r in batch_rewards]).to(device)
        #compute GAE advantages and returns (used as critic targets) for each timestep
        advs,returns = agent.compute_advs_rts(batch_values,batch_rewards,batch_dones,next_values)

        #flatten the data collected from the different environments by
        #collapsing the time and environment dimensions
        batch_states = torch.stack(batch_states)
        obs_size = batch_states.size()[2:]
        batch_states = batch_states.reshape((tuple([n_envs*batch_size])+obs_size))
        advs = torch.stack(advs).reshape(-1,1).to(device)
        returns = torch.stack(returns).reshape(-1,1).to(device)
        batch_actions = torch.stack(batch_actions).reshape(-1).to(device).detach()
        batch_log_probs = torch.stack(batch_log_probs).reshape((-1,1)).to(device).detach()

        #learning phase
        a_loss, c_loss = agent.train(batch_states,advs,returns,batch_log_probs,batch_actions)
        actor_loss.append(a_loss)
        critic_loss.append(c_loss)
        print(f"mean_reward: {np.mean(terminated_ep_rewards[-20:])}, n°of steps: {tot_steps}, n° of terminated episodes: {len(terminated_ep_rewards)}")
        #evaluation phase, every second iteration
        if ((it%2) == 0):
            test_score,test_l = evaluate(agent,test_env)
            ftr_score,ftr_l = evaluate(agent,train_env)
            ftr_scores.append(ftr_score)
            ftr_lengths.append(ftr_l)
            test_scores.append(test_score)
            test_steps.append(tot_steps)
            test_lengths.append(test_l)
            if(best_model['score']<=(np.mean(test_scores[-2:]))):
                #snapshot (deep copy) the current parameters as the best model
                best_model.update({'model':copy.deepcopy(agent.actor.state_dict()),'score':test_score,'iteration':it})

            print(f"iteration:{it},'tot_steps': {tot_steps}, test_score:{test_score}, train_score:{ftr_score}")
        if (it%250 == 0):
            print(f"iteration:{it}, tot_steps:{tot_steps}")
            #save checkpoint
            agent.save_checkpoint(step=it)
            #save best_model
            save_model_weights(best_model['model'],seed=seed,best=True)
        if (it%10 == 0):
            #refresh the results first, so that the dump below is up to date
            final_res = collect_results()
            results.update(final_res)
            if (it%100 == 0):
                #save results to pkl file
                with open(path, 'wb') as f:
                    pickle.dump(final_res, f)

    #make sure the returned dict is populated even when `iterations` is not a
    #multiple of 10
    results.update(collect_results())
    return results

In [None]:
# Hyper-parameters for the run and construction of the PPO agent.
# NOTE(review): an earlier note here computed the total timesteps as
# 1000 * 256 * 48 = 12 288 000, but `iterations` below is 750 — confirm which
# value is intended.
in_ch = 3
n_actions = 15
batch_size = 256
iterations = 750
threshold = 30
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = PPOAgent(in_ch, n_actions, n_envs=n_envs, batch_size=batch_size, device=device)

In [None]:
#used for debugging
results = {}
final_results = exec(agent,
     env,
     test_env,
     iterations,
     batch_size,
     n_envs,
     results,
     device,
     SEED)

In [None]:
# Evaluate the current agent on `test_env`; the training loop unpacks the
# return value of evaluate() as a (score, length) pair.
# NOTE(review): the third argument (100) is presumably the number of
# evaluation episodes — confirm against evaluate()'s signature.
evaluate(agent, test_env,100)


In [None]:
# Reload the results dictionary from the pickle named after GAME and SEED.
# This rebinds `results`, replacing whatever the training cell left in it.
# NOTE: pickle.load can execute arbitrary code — only open files you created.
with open(f'results/{GAME}_all_res_{SEED}.pkl', 'rb') as f:
    results = pickle.load(f)


In [None]:
# Show which entries are stored in the results dictionary.
results.keys()

In [None]:
# Save the actor's current weights, flagged as not the best-scoring model.
# NOTE(review): the training loop calls save_model_weights(..., seed=seed, best=True);
# no seed is passed here, so whatever default the function has for `seed`
# applies — confirm it matches SEED.
save_model_weights(agent.actor.state_dict(),best=False)

In [None]:
# Collect the logged training series into one DataFrame.
sc = results['train_scores']
st = results['train_steps']
l = results['train_lengths']
data = {'scores': sc, 'steps': st, 'lengths': l}
# Scale the logged step counter by n_envs
# (presumably the number of parallel environments — confirm).
df_train = pd.DataFrame(data).assign(steps=lambda frame: frame['steps']*n_envs)
df_train

In [None]:
# Collect the periodic test-environment evaluations into a DataFrame.
test_scores = results['test_scores']
test_steps = results['test_steps']
data1 = {'scores': test_scores, 'steps': test_steps}
# Scale the logged step counter by n_envs, as done for the training frame.
df_test = pd.DataFrame(data1).assign(steps=lambda frame: frame['steps']*n_envs)
df_test

In [None]:
# Collect the 'ftr' evaluations (agent scored on the training environment).
# They reuse 'test_steps' because the training loop records both evaluations
# in the same branch, alongside a single step counter.
ftr_scores = results['ftr_scores']
ftr_steps = results['test_steps']
data2 = {'scores': ftr_scores, 'steps': ftr_steps}
df_ftr = pd.DataFrame(data2).assign(steps=lambda frame: frame['steps']*n_envs)
df_ftr

In [None]:
#agent.actor.load_state_dict(results['best_model']['model'])

In [None]:
#evaluate(agent,test_env,500)

In [None]:
# Smooth the score curves with rolling means: 10 rows for the training log,
# 20 rows for each of the two evaluation logs.
x = df_train['scores'].rolling(window=10).mean()
ftr = df_ftr['scores'].rolling(window=20).mean()
t = df_test['scores'].rolling(window=20).mean()

In [None]:
import matplotlib.pyplot as plt
# Smoothed evaluation scores on the training environment ('train') and on the
# test environment ('test'), saved to '<GAME>_<SEED>.png'.
plt.plot(df_ftr['steps']/1e6, ftr, label='train')
plt.plot(df_test['steps']/1e6, t, label='test')
# The x values are divided by 1e6, so the axis label states the unit.
plt.xlabel('steps (millions)')
plt.ylabel('mean rewards')
# plotting the legend
plt.legend(loc='lower right')
plt.savefig(f'{GAME}_{SEED}.png')
plt.show()

In [None]:
#df = pd.DataFrame(results['tot_eps_rws'])
#plt.plot(df.rolling(50).mean())
# Smoothed training score against steps, saved to 'avgscores_<GAME>_<SEED>.png'.
plt.plot(df_train['steps']/1e6, x, label='train')
# The x values are divided by 1e6, so the axis label states the unit.
plt.xlabel('steps (millions)')
plt.ylabel('mean rewards')
plt.savefig(f'avgscores_{GAME}_{SEED}.png')
plt.show()

In [None]:
# Quick look: 10-row rolling mean of the training scores (`x`) against
# df_train['steps'] (not rescaled to millions, no axis labels).
import matplotlib.pyplot as plt
plt.plot(df_train['steps'],x)
plt.show()

In [None]:
# Quick look: 20-row rolling mean of the test scores (`t`) against
# df_test['steps'] (not rescaled to millions, no axis labels).
import matplotlib.pyplot as plt
plt.plot(df_test['steps'],t)
plt.show()

In [None]:
import matplotlib.pyplot as plt
def plot_training(score,steps,threshold,window=300):
    """Plot a forward-looking moving average of ``score`` with a threshold line.

    Point ``i`` of the curve is the mean of ``score[i:i+window]``; the windows
    near the end of the sequence are shorter because the slice is truncated.

    NOTE: a later cell defines ``plot_training(score, threshold)``; once that
    cell has run it shadows this three-argument version.

    Args:
        score: sequence of rewards (must support ``len`` and slicing).
        steps: unused; kept so existing three-argument calls keep working.
        threshold: y-value at which a dashed red horizontal line is drawn.
        window: number of consecutive entries averaged per point. Defaults to
            300, the value that used to be hard-coded.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    # Iterate over the valid start indices 0..len(score)-1. The previous
    # range(1, len(score)+1) skipped the first entry and finished on an empty
    # slice, whose mean is NaN (and triggers a NumPy RuntimeWarning).
    avg_score = [np.mean(score[i:i+window]) for i in range(len(score))]
    label = 'Procgen_PPO'
    plt.plot(avg_score,'b', label = label)
    plt.axhline(y = threshold, color = 'r', linestyle = '--')
    plt.xlabel('steps')
    plt.ylabel('average rewards')
    # plotting the legend
    plt.legend(bbox_to_anchor = (1.1, 1.1), loc = 'upper right')
    plt.show()
#plot_training(terminated_ep_rewards,steps,10)

In [None]:
def plot_training(score,threshold,window=30):
    """Plot the mean of ``score`` over consecutive, non-overlapping windows.

    One point is drawn for every complete block of ``window`` entries: its
    x-value is the index of the block's end (``window``, ``2*window``, ...)
    and its y-value is the mean of that block. Trailing entries that do not
    fill a whole block are ignored.

    NOTE: this definition reuses the name of the three-argument
    ``plot_training`` defined in an earlier cell and shadows it once run.

    Args:
        score: sequence of rewards (must support ``len`` and slicing).
        threshold: y-value at which a dashed red horizontal line is drawn.
        window: block size used for averaging. Defaults to 30, the value that
            used to be hard-coded.

    Raises:
        ValueError: if ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")
    # End indices of every complete block: window, 2*window, ...
    eps = list(range(window, len(score)+1, window))
    avg_score = [np.mean(score[i-window:i]) for i in eps]
    # NOTE(review): the legend says 'n-stepA2C' although the agent built in
    # this notebook is a PPOAgent — confirm the intended label.
    label = 'n-stepA2C'
    plt.plot(eps,avg_score,'g^')
    plt.plot(eps,avg_score,'g', label = label)
    plt.axhline(y = threshold, color = 'r', linestyle = '--')
    plt.xlabel('episodes')
    # Derive the label from the window so it cannot drift from the code
    # (it used to say "20 eps" while the window was 30).
    plt.ylabel(f'average rewards each {window} eps')
    # plotting the legend
    plt.legend(bbox_to_anchor = (1.1, 1.1), loc = 'upper right')
    plt.show()

In [None]:
"""
#COINRUN SEED 42
#used for debugging
results = {}
final_results = exec(agent,
     env,
     test_env,
     iterations,
     batch_size,
     n_envs,
     results,
     device,
     SEED)
"""