In [None]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
import cv2 as cv
import matplotlib.pyplot as plt

In [None]:
# ---- Experiment identity -------------------------------------------------
seed = 1
gymID = "LunarLander-v2"

# ---- Rollout collection --------------------------------------------------
nEnvs = 8                               # environments stepped in parallel
nStepsPerRollout = 64                   # steps taken in every environment per update
totalNSteps = 4_000_000                 # total environment steps across all environments
batchSize = nStepsPerRollout * nEnvs    # transitions collected per update
nMinibatches = 2
minibatchSize = batchSize // nMinibatches

# ---- Optimisation --------------------------------------------------------
LearningRate = 3e-3
annealLR = True                         # linearly decay the learning rate over the updates
nUpdateEpochs = 4                       # passes over each collected batch
maxGradNorm = 0.5                       # global gradient-norm clipping threshold

# ---- PPO objective -------------------------------------------------------
gamma = 0.999                           # reward discount factor
gaeLambda = 0.99                        # lambda of Generalized Advantage Estimation
clipCoefficient = 0.2                   # clip range for the policy ratio and the value update
entropyLossCoefficient = 0.02           # weight of the entropy bonus in the total loss
valueLossCoefficient = 0.5              # weight of the value loss in the total loss
normAdvantages = True                   # standardise advantages inside each minibatch

torch.set_printoptions(linewidth=120, precision=2, sci_mode=False, profile="short")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
def saveVideo(images, outputDirectory, fps):
    """Encode a sequence of RGB frames into an H.264 video file.

    Args:
        images: Non-empty, indexable sequence of frames. Each frame must expose
            a three-element ``.shape`` (height, width, channels) and is
            converted from RGB to BGR before being written.
        outputDirectory: Despite the name, this is the path of the video
            *file* to create; it is passed straight to ``cv.VideoWriter``.
        fps: Frame rate of the resulting video.

    Raises:
        ValueError: If ``images`` is empty, since the frame size cannot be
            determined without at least one frame.
    """
    import warnings

    if len(images) == 0:
        raise ValueError("saveVideo needs at least one frame to determine the video size")

    height, width, _ = images[0].shape
    writer = cv.VideoWriter(outputDirectory, cv.VideoWriter_fourcc(*'H264'), fps, (width, height))
    try:
        if not writer.isOpened():
            # An unopened writer drops every frame silently. Warn instead of
            # raising so a missing codec cannot abort a long training run.
            warnings.warn(f"Could not open a video writer for '{outputDirectory}'; no video was saved")
            return
        for image in images:
            writer.write(cv.cvtColor(image, cv.COLOR_RGB2BGR))
    finally:
        # Always release the writer, even if a frame conversion fails.
        writer.release()

@torch.no_grad()
def recordVideoOfAgent(agent, environmentName, filename, fps=30):
    """Play one episode with the agent and save the rendered frames as a video.

    A fresh environment is created with ``render_mode="rgb_array"``, the agent
    acts by sampling from ``agent.get_action_and_value`` until the episode is
    terminated or truncated, and the frames are handed to ``saveVideo``. The
    output file is named ``"{filename}-({totalReward:.0f}).mp4"``.

    Args:
        agent: Object exposing ``get_action_and_value(observation)`` whose
            first return value is the action tensor.
        environmentName: Environment id passed to ``gym.make``.
        filename: Output path prefix (without extension).
        fps: Frame rate of the saved video.
    """
    images = []
    totalReward = 0
    environment = gym.make(environmentName, render_mode="rgb_array")
    try:
        state, _ = environment.reset()
        while True:
            # `state` is always the raw environment observation, so it is
            # wrapped in a tensor exactly once per step.
            action, _, _, _ = agent.get_action_and_value(torch.tensor(state).to(device))
            state, reward, terminated, truncated, _ = environment.step(action.cpu().numpy())
            totalReward += reward
            images.append(environment.render())

            if terminated or truncated:
                print(f"Final reward: {totalReward:>8.2f}")
                break
    finally:
        # Close the environment even when the rollout fails part-way.
        environment.close()
    saveVideo(images, f"{filename}-({totalReward:.0f}).mp4", fps)

def splitNumber(number, nSplits):
    """Return roughly evenly spaced indices from 0 up to ``number - 1``.

    The first index is always 0 and, for ``nSplits >= 2``, the last is always
    ``number - 1``. Interior indices are ``round(i * number / (nSplits - 1))``.

    Args:
        number: Size of the range being split.
        nSplits: How many indices to return.

    Returns:
        A list of ``nSplits`` integer indices when ``nSplits >= 1``.
    """
    if nSplits == 1:
        # No spacing to compute for a single point; the general formula
        # below would divide by zero.
        return [0]
    step = number / (nSplits - 1)
    interior = [round(step * i) for i in range(1, nSplits - 1)]
    return [0, *interior, number - 1]

def linearInitialize(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place and hand it back for inline use.

    The weight is filled with an orthogonal matrix scaled by ``std`` and
    every bias entry is set to ``bias_const``.

    Args:
        layer: Module with ``weight`` and ``bias`` tensors.
        std: Gain applied to the orthogonal weight matrix.
        bias_const: Constant written to the bias.

    Returns:
        The same ``layer`` object.
    """
    nn.init.orthogonal_(layer.weight, gain=std)
    nn.init.constant_(layer.bias, val=bias_const)
    return layer


class Agent(nn.Module):
    """Actor-critic model with two separate, identically shaped MLPs.

    Each network feeds the observation through Tanh-activated hidden layers of
    256, 128 and 64 units. The critic ends in a single scalar output; the
    actor ends in one logit per discrete action.
    """

    def __init__(self, envs):
        """Build the critic and the actor from the environment's spaces.

        Args:
            envs: Object exposing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        observationSize = np.array(envs.single_observation_space.shape).prod()
        # The critic is built before the actor, and layers in order, so the
        # random initialisation draws happen in the same sequence as before.
        self.critic = self._buildNetwork(observationSize, 1, finalStd=1.0)
        self.actor = self._buildNetwork(observationSize, envs.single_action_space.n, finalStd=0.01)

    @staticmethod
    def _buildNetwork(inputSize, outputSize, finalStd):
        """Stack Linear/Tanh pairs for the hidden sizes, then a final Linear head."""
        modules = []
        previousSize = inputSize
        for hiddenSize in (256, 128, 64):
            modules.append(linearInitialize(nn.Linear(previousSize, hiddenSize)))
            modules.append(nn.Tanh())
            previousSize = hiddenSize
        modules.append(linearInitialize(nn.Linear(previousSize, outputSize), std=finalStd))
        return nn.Sequential(*modules)

    def get_value(self, x):
        """Return the critic's output for observation(s) ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on observation(s) ``x``.

        Args:
            x: Observation tensor fed to both networks.
            action: Action(s) to evaluate. When ``None`` an action is sampled
                from the categorical distribution defined by the actor logits.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``: the action, its
            log-probability, the distribution's entropy, and the critic output.
        """
        distribution = Categorical(logits=self.actor(x))
        if action is None:
            action = distribution.sample()
        logProbability = distribution.log_prob(action)
        entropy = distribution.entropy()
        return action, logProbability, entropy, self.critic(x)

# Seed the Python, NumPy and PyTorch generators with the same value, and ask
# cuDNN to use deterministic algorithms.
for seedGenerator in (random.seed, np.random.seed, torch.manual_seed):
    seedGenerator(seed)
del seedGenerator  # do not leave the loop variable in the notebook namespace
torch.backends.cudnn.deterministic = True

In [None]:

# Vectorised environment, the agent acting in it, and its optimiser.
envs = gym.vector.make(gymID, num_envs=nEnvs)

agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=LearningRate, eps=1e-5)

# Rollout storage, indexed as [step, environment, ...], allocated on `device`.
obs = torch.zeros((nStepsPerRollout, nEnvs) + envs.single_observation_space.shape, device=device)
actions = torch.zeros((nStepsPerRollout, nEnvs) + envs.single_action_space.shape, device=device)
logprobs = torch.zeros((nStepsPerRollout, nEnvs), device=device)
rewards = torch.zeros((nStepsPerRollout, nEnvs), device=device)
dones = torch.zeros((nStepsPerRollout, nEnvs), device=device)
values = torch.zeros((nStepsPerRollout, nEnvs), device=device)

# Counters and the initial state that the training loop continues from.
globalStep = 0
startTime = time.time()
nextObs, _ = envs.reset(seed=seed)
nextObs = torch.tensor(nextObs, device=device)
nextDone = torch.zeros(nEnvs, device=device)
nUpdates = totalNSteps // batchSize

# Running sum of rewards of the episode in progress in each environment.
totalEpisodicRewards = torch.zeros(nEnvs, device=device)

In [None]:
# PPO training loop. Each update (1) collects nStepsPerRollout steps from every
# environment, (2) computes GAE advantages and returns, and (3) runs
# nUpdateEpochs passes of minibatch gradient steps on the clipped objective.
# Final scores of finished episodes are appended to allScores.
allScores = []
for update in range(1, nUpdates + 1):
    # Linear learning-rate decay: the full LearningRate on the first update,
    # LearningRate / nUpdates on the last one.
    if annealLR:
        frac = 1.0 - (update - 1.0) / nUpdates
        lrnow = frac * LearningRate
        optimizer.param_groups[0]["lr"] = lrnow

    # ---- Rollout collection ----
    for step in range(0, nStepsPerRollout):
        # One step is taken in each of the nEnvs environments.
        globalStep += 1 * nEnvs
        obs[step] = nextObs
        # dones[step] is the done flag produced by the *previous* env step,
        # i.e. it marks whether obs[step] follows the end of an episode.
        dones[step] = nextDone

        # Acting needs no gradients; log-probs and values are stored as
        # constants for the later update.
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(nextObs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob
        
        nextObs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
        # Termination and truncation are both treated as "episode over".
        done = np.logical_or.reduce([terminated, truncated])
        rewards[step] = torch.tensor(reward).to(device)
        # NOTE(review): presumably the vector env auto-resets finished
        # sub-environments, so nextObs already belongs to the next episode;
        # confirm against the gym version in use.
        nextObs, nextDone = torch.Tensor(nextObs).to(device), torch.Tensor(done).to(device)
            
        # Accumulate per-environment episode scores; print and record the
        # score of every environment that just finished, then reset it.
        totalEpisodicRewards += torch.tensor(reward).to(device)
        for finalScore in totalEpisodicRewards[done]:
            print(f"Score: {finalScore:>8.2f}")
            allScores.append(finalScore.item())
        totalEpisodicRewards[done] = 0

    # ---- Generalized Advantage Estimation, computed backwards in time ----
    with torch.no_grad():
        # Value of the observation after the last stored step, used to bootstrap.
        nextValue = agent.get_value(nextObs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        for t in reversed(range(nStepsPerRollout)):
            # nextnonterminal is 0 where the step after t started a new
            # episode, which removes both the bootstrap value and the
            # carried-over advantage for that environment.
            if t == nStepsPerRollout - 1:
                nextnonterminal = 1.0 - nextDone
                nextvalues = nextValue
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            # One-step TD error, then the discounted recursive accumulation.
            delta = rewards[t] + gamma * nextvalues * nextnonterminal - values[t]
            advantages[t] = lastgaelam = delta + gamma * gaeLambda * nextnonterminal * lastgaelam
        # Regression targets for the critic.
        returns = advantages + values

    # Flatten the [step, env] dimensions into a single batch dimension.
    batchObs = obs.reshape((-1,) + envs.single_observation_space.shape)
    batchLogProbabilities = logprobs.reshape(-1)
    batchActions = actions.reshape((-1,) + envs.single_action_space.shape)
    batchAdvantages = advantages.reshape(-1)
    batchReturns = returns.reshape(-1)
    batchValues = values.reshape(-1)

    # ---- Optimise the policy and value networks ----
    batchIndices = np.arange(batchSize)
    for epoch in range(nUpdateEpochs):
        # A fresh random split into minibatches every epoch.
        np.random.shuffle(batchIndices)
        for start in range(0, batchSize, minibatchSize):
            end = start + minibatchSize
            miniBatchIndices = batchIndices[start:end]

            # Re-evaluate the stored actions under the current parameters.
            # Actions were stored in a float buffer, hence the cast to long.
            _, newlogprob, entropy, newvalue = agent.get_action_and_value(batchObs[miniBatchIndices], batchActions.long()[miniBatchIndices])
            # Probability ratio between the current and the rollout-time policy.
            logratio = newlogprob - batchLogProbabilities[miniBatchIndices]
            ratio = logratio.exp()

            minibatchAdvantages = batchAdvantages[miniBatchIndices]
            if normAdvantages:
                # Standardise per minibatch; 1e-8 guards against a zero std.
                minibatchAdvantages = (minibatchAdvantages - minibatchAdvantages.mean()) / (minibatchAdvantages.std() + 1e-8)

            # Policy loss: the element-wise max of the two negated surrogates
            # keeps the more pessimistic of the unclipped and clipped terms.
            policyLoss = -minibatchAdvantages * ratio
            policyLossClipped = -minibatchAdvantages * torch.clamp(ratio, 1 - clipCoefficient, 1 + clipCoefficient)
            policyLossFinal = torch.max(policyLoss, policyLossClipped).mean()

            # Value loss: the new prediction may move at most clipCoefficient
            # away from the rollout-time value; the larger of the clipped and
            # unclipped squared errors is used.
            newvalue = newvalue.view(-1)
            valueLossUnclipped = (newvalue - batchReturns[miniBatchIndices]) ** 2
            valueLossClipped = batchValues[miniBatchIndices] + torch.clamp(newvalue - batchValues[miniBatchIndices], -clipCoefficient, clipCoefficient)
            valueLossClipped = (valueLossClipped - batchReturns[miniBatchIndices]) ** 2
            valueLoss = torch.max(valueLossUnclipped, valueLossClipped)
            valueLoss = 0.5 * valueLoss.mean() # the 0.5 cancels the factor 2 that comes from differentiating the square

            # Entropy enters the loss with a minus sign, so higher entropy
            # lowers the loss.
            entropyLoss = entropy.mean()
            loss = policyLossFinal + valueLossCoefficient*valueLoss - entropyLossCoefficient*entropyLoss
            
            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm before stepping.
            nn.utils.clip_grad_norm_(agent.parameters(), maxGradNorm)
            optimizer.step()
    
    # Disabled: periodically record a video of the current agent.
    # if update in splitNumber(nUpdates, 20) or update == nUpdates or update == 1:
    #     recordVideoOfAgent(agent, gymID, f"videos/{gymID}-{globalStep/1000:.0f}k")

In [None]:

def movingAveragePlot(values, averagingValue):
    """Plot raw scores together with their block averages.

    The scores are split into consecutive, non-overlapping blocks of
    ``averagingValue`` entries. When the length is not a multiple of the block
    size, the *leading* remainder is left out so the last block ends on the
    final score. Each block mean is drawn at the centre index of its block, so
    the averaged curve lines up with the raw one.

    Args:
        values: Sliceable sequence of numeric scores (e.g. a list).
        averagingValue: Number of consecutive scores averaged per block.
    """
    nValues = len(values)
    nBlocks = nValues // averagingValue
    offset = nValues % averagingValue  # leading scores not covered by any block

    plt.plot(range(nValues), values, label="scores")
    if nBlocks > 0:
        # An explicit row count (instead of -1) and a float dtype keep this
        # valid for integer scores; the nBlocks guard covers inputs shorter
        # than one block, where the reshape would otherwise fail.
        blockMeans = torch.tensor(values[offset:], dtype=torch.float32).view(nBlocks, averagingValue).mean(1)
        blockCentres = [offset + i * averagingValue + (averagingValue - 1) / 2 for i in range(nBlocks)]
        plt.plot(blockCentres, blockMeans, label=f"averaged {averagingValue}")
    # The labels above are only shown once a legend is drawn.
    plt.legend()

# Plot every episode score recorded during training with its 20-episode average.
movingAveragePlot(values=allScores, averagingValue=20)

