In [1]:
import torch
from procgen import ProcgenGym3Env
from torchinfo import summary
import core

# Prefer the first CUDA device when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Notebook-wide handles. They start out empty here and are bound by later
# cells; loadAll/saveAll below read them as module-level globals.
model = player = ppo = env = None
envKW = {}

# Path prefix (trailing slash included) that checkpoint folder names are appended to.
modelPath = "models/"
def loadAll(fname, loadEnv=True):
    """Restore a checkpoint previously written under ``modelPath + fname``.

    The state dicts in ``model.pth``, ``player.pth`` and ``ppo.pth`` are loaded
    into the existing global ``model``, ``player`` and ``ppo`` objects.

    Args:
        fname: Name of the checkpoint folder, appended to ``modelPath``.
        loadEnv: When true, rebuild the global ``env`` from the saved
            ``envKW.pth`` and restore its state from ``env_states.pth``.
            When false, the current environment is kept and ``player.reset()``
            is called instead.
    """
    # Without this declaration the assignments below only create function
    # locals, so the notebook's `env` / `envKW` would silently keep pointing
    # at the old environment (and saveAll would later snapshot the wrong one).
    global env, envKW

    ckptDir = modelPath + fname
    model.load_state_dict(torch.load(ckptDir + "/model.pth"))
    player.load_state_dict(torch.load(ckptDir + "/player.pth"))
    ppo.load_state_dict(torch.load(ckptDir + "/ppo.pth"))
    if loadEnv:
        envKW = torch.load(ckptDir + "/envKW.pth")
        env = ProcgenGym3Env(**envKW)
        env.callmethod("set_state", torch.load(ckptDir + "/env_states.pth"))
        # NOTE(review): `player` and `ppo` were constructed with the previous
        # env object; whether they must be re-pointed at the new `env` is not
        # visible from this cell -- confirm against ProcgenPlayer / PPO.
    else:
        player.reset()

def saveAll(fname):
    """Write a full checkpoint into the folder ``modelPath + fname``.

    The folder is created if needed. Files are written in this order:
    ``model.pth``, ``player.pth``, ``ppo.pth`` (state dicts of the global
    objects), ``envKW.pth`` (the global ``envKW``), ``env_states.pth`` (the
    result of ``env.callmethod("get_state")``) and ``stats.pth``
    (``ppo.all_stats``).

    Args:
        fname: Name of the checkpoint folder, appended to ``modelPath``.
    """
    import os

    outDir = modelPath + fname
    os.makedirs(outDir, exist_ok=True)

    def _write(payload, leaf):
        # Every artifact lives directly inside the checkpoint folder.
        torch.save(payload, outDir + "/" + leaf)

    _write(model.state_dict(), "model.pth")
    _write(player.state_dict(), "player.pth")
    _write(ppo.state_dict(), "ppo.pth")
    _write(envKW, "envKW.pth")
    _write(env.callmethod("get_state"), "env_states.pth")
    _write(ppo.all_stats, "stats.pth")


cuda:0


In [2]:
# Number of parallel environments; reused when the player and PPO trainer are built.
num_agents = 16

# Keyword arguments for the procgen environment, kept in `envKW` so that
# saveAll can persist them alongside the model.
envKW = core.getKW(
    num=num_agents,
    env_name="coinrun",
    distribution_mode="easy",
    paint_vel_info=True,
    use_backgrounds=False,
    restrict_themes=True,
)
env = ProcgenGym3Env(**envKW)

# Show the observation space and the action space, one per line.
print(env.ob_space, env.ac_space, sep="\n")

Dict(rgb=D256[64,64,3])
D15[]


In [3]:
from CVModels import CNNAgent, ViTValue
# Active architecture: ViTValue with 4 blocks and embedding width 32.
model = ViTValue(
    depth=4,
    num_heads=4,
    embed_dim=32,
    mlp_ratio=4,
    valueHeadLayers=1,
)
model = model.to(device)

# Alternative architectures, kept for reference:
# model = ViTValue(depth=3, num_heads=4, embed_dim=16, mlp_ratio=4, valueHeadLayers=1).to(device)
# model = CNNAgent([64, 64, 3], 15, channels=16, layers=[1,1,1,1], scale=[1,1,1,1], vheadLayers=1).to(device)

model.train()
# Last expression of the cell, so the layer/parameter table is displayed.
summary(model, input_size=(2, 3, 64, 64))

Layer (type:depth-idx)                   Output Shape              Param #
ViTValue                                 [2, 15]                   --
├─VisionTransformer: 1-1                 --                        8,256
│    └─PatchEmbed: 2-1                   [2, 256, 32]              --
│    │    └─Conv2d: 3-1                  [2, 32, 16, 16]           1,568
│    │    └─Identity: 3-2                [2, 256, 32]              --
│    └─Dropout: 2-2                      [2, 257, 32]              --
│    └─Identity: 2-3                     [2, 257, 32]              --
│    └─Sequential: 2-4                   [2, 257, 32]              --
│    │    └─Block: 3-3                   [2, 257, 32]              12,704
│    │    └─Block: 3-4                   [2, 257, 32]              12,704
│    │    └─Block: 3-5                   [2, 257, 32]              12,704
│    │    └─Block: 3-6                   [2, 257, 32]              12,704
│    └─LayerNorm: 2-5                    [2, 257, 32]          

In [4]:
# model.load_state_dict(torch.load(modelPath + "vitNegT8BigFin" + "/model.pth"))

In [5]:
from PPO import PPO
import ProcgenPlayer

# --- Discounting and reward shaping ---
gamma = 0.99
# gamma = 0.999
rewardScale = 8
terminateReward = 1 - 10.0 / rewardScale
livingReward = 0
# livingReward = -1e-4

# --- Optimisation ---
lr = 1e-3
# lr = 5e-4
ent_coef = 0
# ent_coef = 1e-2

# Sanity check: if terminateReward > discountedSumLiving, the agent will
# prefer to run into obstacles (ending the episode beats staying alive).
print(f"terminateReward {terminateReward} livingReward {livingReward} discountedSumLiving {livingReward / (1 - gamma)}")

player = ProcgenPlayer.Player(
    env,
    num_agents=num_agents,
    epsilon=0.01,
    epsilon_decay=0.99,
    rewardScale=rewardScale,
    livingReward=livingReward,
    terminateReward=terminateReward,
    finishedOnly=True,
    maxStaleSteps=1000,
)
ppo = PPO(
    model,
    env,
    num_agents=num_agents,
    player=player,
    lr=lr,
    gamma=gamma,
    weight_decay=0.0,
    ent_coef=ent_coef,
    warmup_steps=10,
    train_steps=1000,
    batch_size=1,
)

terminateReward -0.25 livingReward 0 discountedSumLiving 0.0




In [6]:
# ppo.runGame()
# loss = ppo.train(debug=True)
# print(loss)
# import torchviz
# torchviz.make_dot(loss, params=dict(model.named_parameters()))

In [7]:
# loadAll("vitNegT8BigFin")
# Dump the trainer, player and environment settings, one per line.
print(ppo.params, player.params, envKW, sep="\n")

{'alg_name': 'ppo', 'lr': 0.001, 'gamma': 0.99, 'lam': 0.95, 'whiten': True, 'cliprange': 0.2, 'cliprange_value': 0.2, 'vf_coef': 0.5, 'epoch_steps': 256, 'epochs_per_game': 1, 'ent_coef': 0, 'weight_decay': 0.0, 'warmup_steps': 10, 'train_steps': 1000, 'batch_size': 1}
{'alg_name': 'ppo', 'epsilon': 0.01, 'epsilon_decay': 0.99, 'rewardScale': 8, 'livingReward': 0, 'terminateReward': -0.25, 'finishedOnly': True, 'maxStaleSteps': 1000}
{'num': 16, 'env_name': 'coinrun', 'distribution_mode': 'easy', 'paint_vel_info': True, 'use_backgrounds': False, 'restrict_themes': True}


In [8]:
for i in range(200):
    # Collect one round of gameplay, then run the PPO update(s) on it.
    ppo.runGame()
    for _ in range(1):
        ppo.train()

    # Build the status line once from the most recent stats entry.
    stats = ppo.all_stats[-1]
    report = (
        f"episodeLength {stats['game/episodeLength']:.4f} episodeReward {stats['game/episodeReward']:.4f} "
        f"epoch {stats['epoch']} steps {stats['steps']} "
        f"loss {stats['ppo/loss/total'].item():.4f} policy {stats['ppo/loss/policy'].item():.4f} "
        f"value {stats['ppo/loss/value'].item():.4f} entropy {stats['ppo/policy/entropy'].item():.4f} "
        f"stale {stats['game/staleSteps']}              "
    )

    # Every 10th iteration the line is terminated with a newline so it stays
    # in the log; all other iterations end with "\r" so the next print
    # overwrites them in place.
    print(report, end="\n" if i % 10 == 0 else "\r")

    # Full stats dump, useful when debugging:
    # if i % 100 == 0:
    #     for k, v in ppo.all_stats[-1].items():
    #         print(k, v)

episodeLength 689.4211 episodeReward -0.1842 epoch 0 steps 4096 loss 0.0005 policy 0.0000 value 0.0005 entropy 2.7026 stale 0              
{'epoch': 0, 'steps': 4096, 'objective/vf_coef': 0.5, 'objective/ent_coef': 0, 'objective/lr': [0.0], 'game/episodeLength': 689.421052631579, 'game/nonZeroReward': 0.05263157894736842, 'game/episodeReward': -0.18421052631578946, 'game/advantageMean_PreWhiten': -0.04250799167831403, 'game/advantageStd_PreWhiten': 0.013483489077771077, 'game/staleSteps': 0, 'ppo/reward': tensor(0.), 'ppo/loss/policy': tensor(4.8654e-17, dtype=torch.float64), 'ppo/loss/value': tensor(0.0005, dtype=torch.float64), 'ppo/loss/ent': tensor(-0.), 'ppo/loss/total': tensor(0.0005, dtype=torch.float64), 'ppo/policy/entropy': tensor(2.7026), 'ppo/policy/approxkl': tensor(0.), 'ppo/policy/policykl': tensor(0.), 'ppo/policy/clipfrac': tensor(0., dtype=torch.float64), 'ppo/policy/advantages_mean': tensor(-4.8654e-17, dtype=torch.float64), 'ppo/policy/ratio_mean': tensor(1.), 'ppo

KeyboardInterrupt: 

In [None]:
# saveAll("vitNegT8BigFin400")

Experiment Notes

1. coin run hard

Higher reward per episode is better, and shorter episodes are better. Not sure if reward per timestep consolidates these.
The model can learn and actively pursue the reward, but get a lower reward because it now dies more. Though this is clearly better than headslamming a wall for several hundred timesteps before randomly jumping into a coin. Shorter episodes also mean a higher sample size for reward per episode.

1.1 VIT 15k

LR 1e-3 warmup and cosine decay

Weight decay 0.01, living reward -0.001
Doesn't like to jump, and doesn't seem to avoid enemies.

Weight decay 0.01
Doesn't like to jump

No wd or livrew
Gets stuck going right

Removed Background and theme variation
got up to 87% rewards, though most epochs are ~70%
Very jumpy, spends almost no time on ground
Does not seem to avoid obstacles, always jump appears to be a passive strategy

Negative terminate reward. +1 for coin, -1 for enemy or timeout.
never got better than -1 reward.

Negative terminate reward. +1 for coin, -0.25 for enemy or timeout.
Improves for 200 epochs, gets worse around 350. Gets to around 0.5 which is 75% of coins.

1.2 VIT 60k
2 Layer value head degrades performance, is about 2k extra params. Vf loss gets larger when training.

Negative terminate reward. +1 for coin, -0.25 for enemy or timeout.
Did well until an over 1k step episode, spiked vf error.

Keep running environment on extra steps until every episode finishes.
Stale steps started very high around 90, but by epoch 70 it was down to 140. 256 steps per epoch, so a value higher than 256 is an entirely stale epoch.
Was considering capping the amount of stale steps, but it seems to work without this after a bit.
Reward is steadily increasing.
Still very jumpy but not as much as other models. Stays on ground for a few frames. Does not seem to actively avoid obstacles, though appears to be favoring landing on boxes over the ground which are always safe.
400 epochs, doesn't favor boxes as much, might be a bit better at jumping over gaps and avoiding obstacles.

Add living reward, train from previous model.
The discounted sum of infinite living rewards is less than half the terminate reward, so the agent should not prefer to hit an obstacle and end the episode.
Reduced high episode length faster by epoch 50
1000 step episodes at epoch 140 and derailed

For all above there was an issue with compute advantages where the first advantage of an episode would be set to zero. Probably had little impact.

3/31
Multiple tests with -0.001 living reward failed to learn. Vf loss went to 0, but episode length stayed at 1000 and reward at -1.25.
Removing living reward appears to work better at first and then back to failing. Not sure what changed since last living reward run.

Increase warmup to 40 epochs. At 50 epochs, reward was back to -0.25

Hyperparams from openai train-procgen
lr 5e-4, ent 0.01, gamma 0.999
Did better at first, then fell back to zero

1e-4 never gets above 0.

3 ppo epochs per game loop. Worked until epoch 242 and then all zeros.
-1e-4 living reward did not help, -0.25 term reward did not help.

maxStaleSteps 64, same
1 agent, same