In [None]:
import numpy as np
import gym
import torch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import os
import time
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
# Run on the first CUDA device when one is present; otherwise fall back to the
# CPU so that the notebook still executes on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Floating point type that observations are cast to before entering the network.
dtype = torch.float32

In [None]:
class PPO_Network(nn.Module):
    """Small MLP with a shared trunk that emits a policy and a state value.

    The final linear layer has ``num_actions + 1`` outputs: the first
    ``num_actions`` are turned into action probabilities by a softmax, the
    remaining one is returned unchanged as the value estimate.

    Args:
        in_channels: size of the last dimension of the input.
        num_actions: number of discrete actions.
    """

    def __init__(self, in_channels, num_actions):
        super().__init__()

        # Kept on the instance so that forward() does not depend on a
        # module-level variable that merely happens to share the name.
        self.num_actions = num_actions

        network = [
            nn.Linear(in_channels, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, num_actions + 1)
        ]

        self.network = nn.Sequential(*network)

    def forward(self, x):
        """Return ``(policy, value)`` for input ``x``.

        Args:
            x: tensor whose last dimension has size ``in_channels``.

        Returns:
            policy: softmax probabilities, last dimension of size ``num_actions``.
            value: raw network output, last dimension of size 1.
        """
        # Split/softmax over the last dimension: identical to dim=1 for the
        # usual (batch, features) input, and also valid for other ranks.
        policy, value = torch.split(self.network(x), (self.num_actions, 1), dim=-1)
        policy = F.softmax(policy, dim=-1)
        return policy, value
    
class PPO_Agent(nn.Module):
    """Agent that owns a ``PPO_Network`` and samples discrete actions from it."""

    def __init__(self, in_channels, num_actions):
        super().__init__()

        self.in_channels, self.num_actions = in_channels, num_actions
        self.network = PPO_Network(in_channels, num_actions)

    def forward(self, x):
        """Delegate to the wrapped network and hand back its two outputs."""
        action_probs, state_value = self.network(x)
        return action_probs, state_value

    def select_action(self, policy):
        """Draw one action index at random, weighted by ``policy``.

        ``policy`` is passed to ``np.random.choice`` as the probability
        vector over the ``num_actions`` candidate indices.
        """
        candidates = np.arange(self.num_actions)
        drawn = np.random.choice(candidates, size=1, p=policy)
        return drawn[0]

In [None]:
class Logger:
    """Minimal line-based logger writing to ``<filename>.csv``.

    Constructing a Logger creates the file, or empties it if it already
    exists. Every ``log`` call appends one line.
    """

    def __init__(self, filename):
        """
        Args:
            filename: target path without the ``.csv`` extension.
        """
        self.filename = filename
        # Opening in "w" mode creates/truncates the file; the context manager
        # guarantees the handle is released even if something goes wrong.
        with open(f"{self.filename}.csv", "w"):
            pass

    def log(self, msg):
        """Append ``msg`` followed by a newline to the log file."""
        with open(f"{self.filename}.csv", "a+") as f:
            f.write(f"{msg}\n")
        
# Number of environment steps taken so far, summed over all Env_Runner
# instances. It is only used to label the logged episode returns.
total_steps = 0
class Env_Runner:
    """Steps one gym environment with a given agent and records the rollout.

    The environment is not reset between ``run`` calls: the current
    observation and the running episode return are kept on the instance, so
    consecutive rollouts continue the same episode.

    The code uses the gym API in which ``reset()`` returns the observation
    and ``step()`` returns ``(observation, reward, done, info)``.
    """

    def __init__(self, env_name, agent, logger_folder):
        """
        Args:
            env_name: environment id handed to ``gym.make``.
            agent: callable returning ``(policy, value)`` for a batch of
                observations; must also offer ``select_action(policy)``.
            logger_folder: directory in which ``training_info.csv`` is
                written. The directory is not created here.
        """
        super().__init__()

        self.env = gym.make(env_name)
        self.agent = agent

        self.logger = Logger(f'{logger_folder}/training_info')
        self.logger.log("training_step, return")

        # Current observation and the return accumulated in the running episode.
        self.ob = self.env.reset()
        self.Return = 0

    def run(self, steps):
        """Advance the environment by ``steps`` steps.

        Args:
            steps: number of environment steps to take.

        Returns:
            ``[obs, actions, rewards, dones, values, action_prob]``, each a
            list with one entry per step: the observation tensor fed to the
            agent, the sampled action, the reward, the done flag, the agent's
            value output for that observation, and the probability the agent
            assigned to the sampled action. The tensors carry no gradient
            history.
        """
        global total_steps

        obs = []
        actions = []
        rewards = []
        dones = []
        values = []
        action_prob = []

        for step in range(steps):

            self.ob = torch.tensor(self.ob).to(device).to(dtype)
            # Rollout data is only ever used as constants during the update,
            # so skip building the autograd graph altogether.
            with torch.no_grad():
                policy, value = self.agent(self.ob.unsqueeze(0))
            action = self.agent.select_action(policy.cpu().numpy()[0])

            obs.append(self.ob)
            actions.append(action)
            values.append(value)
            action_prob.append(policy[0, action])

            self.ob, r, done, info = self.env.step(action)
            self.Return += r

            if done: # environment reset
                self.ob = self.env.reset()
                self.logger.log(f'{total_steps+step},{self.Return}')
                print("Return:",self.Return)
                self.Return = 0

            rewards.append(r)
            dones.append(done)

        total_steps += steps

        return [obs, actions, rewards, dones, values, action_prob]

In [None]:
# Discount factor applied to future rewards/values.
gamma = 0.99
# Extra decay applied (together with gamma) to the accumulated advantage.
lam = 0.95
def compute_advantage_and_value_targets(rewards, values, dones):
    """Compute per-step advantages and discounted value targets for a rollout.

    The three inputs are parallel sequences indexed by step. ``dones[t]`` is
    the flag returned by the environment step taken at index ``t``; when it is
    set, index ``t + 1`` already belongs to a fresh episode.

    Results are produced for steps ``0 .. len(rewards) - 2``. The last step
    only provides the bootstrap value ``values[-1]``.

    Args:
        rewards: sequence of scalar rewards.
        values: sequence of value tensors. Each is indexed with ``[0]`` to
            build the returned entries, so it needs at least one dimension.
        dones: sequence of truthy/falsy episode-end flags.

    Returns:
        ``(advantage_values, value_targets)``: two lists with
        ``len(rewards) - 1`` entries each (empty lists if the rollout has
        fewer than two steps).
    """
    advantage_values = []
    value_targets = []

    # Nothing can be computed without a successor step to bootstrap from.
    if len(rewards) < 2:
        return advantage_values, value_targets

    running_adv = 0.0
    running_target = values[-1]

    for t in reversed(range(len(rewards) - 1)):

        # If step t ended its episode, everything at index t+1 belongs to the
        # next episode and must not leak into step t. The flag to look at is
        # therefore dones[t], not dones[t+1].
        not_terminal = 0.0 if dones[t] else 1.0

        # ADV: TD residual plus the decayed advantage of the following step
        delta_t = rewards[t] + gamma * values[t + 1] * not_terminal - values[t]
        running_adv = delta_t + gamma * lam * not_terminal * running_adv
        advantage_values.append(running_adv[0])

        # VALUE TARGET: discounted return, bootstrapped from values[-1]
        running_target = rewards[t] + gamma * running_target * not_terminal
        value_targets.append(running_target[0])

    # Entries were produced back-to-front; restore chronological order.
    advantage_values.reverse()
    value_targets.reverse()

    return advantage_values, value_targets

In [None]:
class Batch_DataSet(torch.utils.data.Dataset):
    """Dataset view over five parallel, index-aligned rollout containers.

    Item ``i`` is the tuple of the ``i``-th entry of each container, in the
    order: observations, actions, advantages, value targets, old action
    probabilities.
    """

    def __init__(self, obs, actions, adv, v_t, old_action_prob):
        super().__init__()
        self.obs, self.actions = obs, actions
        self.adv, self.v_t = adv, v_t
        self.old_action_prob = old_action_prob

    def __len__(self):
        # The observation container defines how many samples there are.
        sample_count = self.obs.shape[0]
        return sample_count

    def __getitem__(self, i):
        columns = (self.obs, self.actions, self.adv, self.v_t, self.old_action_prob)
        return tuple(column[i] for column in columns)

In [None]:
# create folder to save networks, csv, hyperparameter
# The folder is named after the current UTC time (second resolution).
folder_name = time.asctime(time.gmtime()).replace(" ","_").replace(":","_")
# exist_ok: re-running this cell within the same second must not raise.
os.makedirs(folder_name, exist_ok=True)

# A throwaway environment instance is created only to read the sizes of the
# observation and action spaces.
env_name = "Acrobot-v1"#"CartPole-v0"
env = gym.make(env_name)
obs_dim = env.observation_space.shape[0]
num_actions = env.action_space.n

# --- hyperparameters ---
epochs = 4           # passes over the collected data per iteration
T = 65               # environment steps per runner per iteration; the last
                     # step of each rollout is dropped when the batch is
                     # assembled, leaving T - 1 training samples per runner
minibatch_size = 32  # samples per optimizer step
lr = 1e-3            # Adam learning rate
eps = 0.1            # probability ratio is clamped to [1 - eps, 1 + eps]
c1 = 0.1             # weight of the value loss in the total loss

agent = PPO_Agent(obs_dim, num_actions).to(device)
optimizer = optim.Adam(agent.parameters(), lr=lr)
# Every runner owns its own environment but shares the single agent.
actors = 4
env_runners = [Env_Runner(env_name, agent, folder_name) for i in range(actors)]

In [None]:
# Main PPO loop: every iteration collects a rollout from each runner, merges
# them into one dataset and performs `epochs` passes of minibatch updates.
iterations = 1000
for i in range(iterations):

    # get data: one list entry per runner, concatenated once afterwards
    obs_parts, action_parts, adv_parts, v_t_parts, old_prob_parts = [], [], [], [], []

    for env_runner in env_runners:
        obs, actions, rewards, dones, values, old_action_prob = env_runner.run(T)
        adv, v_t = compute_advantage_and_value_targets(rewards, values, dones)

        # The advantage routine returns one entry fewer than the rollout has
        # steps, so the last step of the other sequences is dropped to match.
        obs_parts.append(torch.stack(obs[:-1]))
        action_parts.append(np.stack(actions[:-1]))
        adv_parts.append(torch.stack(adv))
        v_t_parts.append(torch.stack(v_t))
        old_prob_parts.append(torch.stack(old_action_prob[:-1]))

    # assemble data from the different runners
    batch_obs = torch.cat(obs_parts)
    batch_actions = np.concatenate(action_parts)
    batch_adv = torch.cat(adv_parts)
    batch_v_t = torch.cat(v_t_parts)
    batch_old_action_prob = torch.cat(old_prob_parts)

    # load into dataset/loader
    dataset = Batch_DataSet(batch_obs,batch_actions,batch_adv,batch_v_t,batch_old_action_prob)
    dataloader = DataLoader(dataset, batch_size=minibatch_size, num_workers=0, shuffle=True)

    # update
    for epoch in range(epochs):

        # sample minibatches (no index variable: it would shadow the
        # iteration counter `i` of the outer loop)
        for batch in dataloader:
            optimizer.zero_grad()

            # get data
            obs, actions, adv, v_target, old_action_prob = batch

            adv = adv.squeeze(1)
            # advantage normalization is currently disabled:
            #adv = ( adv - torch.mean(adv) ) / ( torch.std(adv) + 1e-8)

            # get policy actions probs for prob ratio & value prediction
            pi, v = agent(obs)
            # Probability of the action that was actually taken. Index by the
            # real size of this minibatch: the last one of an epoch is shorter
            # whenever the dataset size is not a multiple of minibatch_size.
            pi = pi[range(pi.shape[0]), actions.long()]

            # probability ratio r_t(theta)
            probability_ratio = pi / old_action_prob

            # compute CPI
            CPI = probability_ratio * adv
            # compute clip*A_t
            clip = torch.clamp(probability_ratio,1-eps,1+eps) * adv

            # policy loss | take minimum
            L_CLIP = torch.mean(torch.min(CPI, clip))

            # value loss | mse
            L_VF = torch.mean(torch.pow(v - v_target,2))

            # maximize the clipped objective, minimize the value error
            loss = - L_CLIP + c1 * L_VF
            loss.backward()
            optimizer.step()