In [1]:
import torch
from procgen import ProcgenGym3Env
from torchinfo import summary
import core

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Module-level placeholders. Later cells rebind these names, and
# loadAll / saveAll read them as globals.
model = player = ppo = env = None
envKW = {}

# Root directory under which saveAll / loadAll keep one sub-directory per checkpoint.
modelPath = "models/"
def loadAll(fname, loadEnv=True):
    """Restore a checkpoint from the directory ``modelPath + fname``.

    The module-level ``model``, ``player`` and ``ppo`` objects must already
    exist; their ``load_state_dict`` methods are called with the contents of
    ``model.pth``, ``player.pth`` and ``ppo.pth`` respectively.

    Args:
        fname: Name of the checkpoint sub-directory under ``modelPath``.
        loadEnv: When true, rebuild the Procgen environment from the saved
            ``envKW.pth`` keyword arguments, restore its state from
            ``env_states.pth``, and rebind the module-level ``env`` and
            ``envKW``. When false, the environment is left alone and
            ``player.reset()`` is called instead.
    """
    # Without this declaration the assignments below would only create
    # function locals, and the rebuilt environment would be thrown away on
    # return while the notebook kept using the old `env` / `envKW`.
    global env, envKW

    ckptDir = modelPath + fname
    model.load_state_dict(torch.load(ckptDir + "/model.pth"))
    player.load_state_dict(torch.load(ckptDir + "/player.pth"))
    ppo.load_state_dict(torch.load(ckptDir + "/ppo.pth"))
    if loadEnv:
        envKW = torch.load(ckptDir + "/envKW.pth")
        env = ProcgenGym3Env(**envKW)
        env.callmethod("set_state", torch.load(ckptDir + "/env_states.pth"))
        # NOTE(review): `player` and `ppo` were constructed with the previous
        # env object; whether they must be re-pointed at the new `env` is not
        # visible from this cell -- confirm against VectorPlayer / VectorPPO.
    else:
        player.reset()

def saveAll(fname):
    """Write a checkpoint into the directory ``modelPath + fname``.

    Creates the directory if needed, then saves, in this order: the state
    dicts of the module-level ``model``, ``player`` and ``ppo``; the ``envKW``
    dict; the result of ``env.callmethod("get_state")``; and ``ppo.all_stats``.

    Args:
        fname: Name of the checkpoint sub-directory under ``modelPath``.
    """
    import os

    outDir = modelPath + fname
    os.makedirs(outDir, exist_ok=True)

    def _dump(obj, leaf):
        # Every artefact is written as <outDir>/<leaf> via torch.save.
        torch.save(obj, outDir + "/" + leaf)

    _dump(model.state_dict(), "model.pth")
    _dump(player.state_dict(), "player.pth")
    _dump(ppo.state_dict(), "ppo.pth")
    _dump(envKW, "envKW.pth")
    _dump(env.callmethod("get_state"), "env_states.pth")
    _dump(ppo.all_stats, "stats.pth")


cuda:0


In [2]:
# Population layout: the environment is created with
# num_models * num_agents as its `num` argument.
num_models = 2
num_agents = 16

envKW = core.getKW(
    num=num_models * num_agents,
    env_name="coinrun",
    distribution_mode="easy",
    paint_vel_info=True,
    use_backgrounds=False,
    restrict_themes=True,
)
env = ProcgenGym3Env(**envKW)

# Show the observation space and then the action space, one per line.
print(env.ob_space, env.ac_space, sep="\n")

Dict(rgb=D256[64,64,3])
D15[]


In [7]:
from CVModels import CNNAgent, ViTValue, VectorModelValue, avgSync, sumSync

# Alternative configurations kept for reference:
# model = ViTValue(depth=4, num_heads=4, embed_dim=32, mlp_ratio=4, valueHeadLayers=1).to(device)
# model = CNNAgent([64, 64, 3], 15, channels=16, layers=[1,1,1,1], scale=[1,1,1,1], vheadLayers=1).to(device)

# Wrap a small ViT in VectorModelValue with n=num_models and avgSync as the
# sync function, then move the wrapper to the selected device.
model = VectorModelValue(
    ViTValue(depth=3, num_heads=4, embed_dim=16, mlp_ratio=4, valueHeadLayers=1),
    n=num_models,
    syncFunc=avgSync,
).to(device)
model.train()

# Last expression of the cell, so the torchinfo table is displayed as output.
summary(model, input_size=(2, 2, 3, 64, 64))

Layer (type:depth-idx)                             Output Shape              Param #
VectorModelValue                                   [2, 2, 15]                --
├─ModuleList: 1-1                                  --                        --
│    └─ViTValue: 2-1                               [2, 15]                   --
│    │    └─VisionTransformer: 3-1                 --                        15,039
│    │    └─ValueHead: 3-2                         [2, 1]                    17
│    └─ViTValue: 2-2                               [2, 15]                   --
│    │    └─VisionTransformer: 3-3                 --                        15,039
│    │    └─ValueHead: 3-4                         [2, 1]                    17
Total params: 30,112
Trainable params: 30,112
Non-trainable params: 0
Total mult-adds (M): 0.84
Input size (MB): 0.20
Forward/backward pass size (MB): 4.61
Params size (MB): 0.09
Estimated Total Size (MB): 4.89

In [8]:
# model.load_state_dict(torch.load(modelPath + "vitNegT8BigFin" + "/model.pth"))

In [9]:
from PPO import VectorPPO
from ProcgenPlayer import VectorPlayer

# Reward-shaping and discount settings. Each value is named once and passed
# by name below, so the diagnostic print always describes the values that the
# player and the PPO trainer actually receive.
rewardScale = 8.0
terminateReward = 1 - 10.0 / rewardScale
livingReward = 0
gamma = 0.99

# If terminateReward > discountedSumLiving the agent will prefer to run into obstacles.
print("terminateReward", terminateReward, "livingReward", livingReward, "discountedSumLiving", livingReward / (1 - gamma))

player = VectorPlayer(
    env,
    num_agents=num_agents,
    num_models=num_models,
    epsilon=0.01,
    epsilon_decay=0.99,
    rewardScale=rewardScale,
    livingReward=livingReward,  # previously a literal 0 that could drift from the printed value
    terminateReward=terminateReward,
)
ppo = VectorPPO(
    model,
    env,
    num_agents=num_agents,
    num_models=num_models,
    player=player,
    gamma=gamma,  # same discount as used in the discountedSumLiving diagnostic
    weight_decay=0.0,
    warmup_steps=10,
    train_steps=1000,
    sync_epochs=1,
)

terminateReward -0.25 livingReward 0 discountedSumLiving 0.0


In [16]:
from CVModels import printParams
# printParams(model.modelList)

# Print the first parameter tensor of sub-models 0 and 1, call model.sync(),
# then print the same two tensors again for comparison.
print(*[list(model.modelList[k].parameters())[0].data for k in (0, 1)])
print("before sync")
model.sync()
print("after sync")
# printParams(model.modelList)
print(*[list(model.modelList[k].parameters())[0].data for k in (0, 1)])

# paramList = list(zip(*[list(model.parameters()) for model in model.modelList]))
# print(len(paramList)) # 42 num layers
# print(len(paramList[0])) # 2 num models
# print(paramList[0][0]) # param

tensor([[[-3.4612e-03, -1.5888e-03, -1.2570e-05, -1.5367e-03, -2.2071e-02,
           6.6957e-02, -2.3656e-03, -9.2508e-04, -4.2245e-04,  2.5672e-03,
           1.3760e-03, -5.8061e-03,  3.1964e-04,  2.7148e-03,  1.5907e-03,
          -5.2266e-03]]], device='cuda:0') tensor([[[-0.0008,  0.0001,  0.0214, -0.0150, -0.0291,  0.0114,  0.0010,
           0.0094, -0.0027,  0.0033,  0.0024,  0.0031,  0.0012,  0.0051,
           0.0026, -0.0113]]], device='cuda:0')
before aggregate
after aggregate
tensor([[[-0.0043, -0.0015,  0.0214, -0.0165, -0.0511,  0.0783, -0.0013,
           0.0085, -0.0031,  0.0058,  0.0038, -0.0027,  0.0015,  0.0078,
           0.0042, -0.0165]]], device='cuda:0') tensor([[[-0.0043, -0.0015,  0.0214, -0.0165, -0.0511,  0.0783, -0.0013,
           0.0085, -0.0031,  0.0058,  0.0038, -0.0027,  0.0015,  0.0078,
           0.0042, -0.0165]]], device='cuda:0')


In [13]:
# ppo.runGame()
# loss = ppo.train(debug=True)
# print(loss)
# import torchviz
# torchviz.make_dot(torch.sum(loss), params=dict(model.named_parameters()))

In [14]:
# loadAll("vitNegT8BigFin")

In [15]:
# Each iteration calls ppo.runGame() followed by ppo.train(); on every 10th
# iteration the newest entry of ppo.all_stats is reported.
for i in range(50):
    ppo.runGame()
    ppo.train()
    if i % 10 == 0:
        latest = ppo.all_stats[-1]
        print(
            f"episodeLength {latest['game/episodeLength']} episodeReward {latest['game/episodeReward']} "
            f"\nepoch {latest['epoch']} steps {latest['steps']} "
            f"\nloss {latest['ppo/loss/total']} policy {latest['ppo/loss/policy']} "
            f"\nvalue {latest['ppo/loss/value']} entropy {latest['ppo/policy/entropy']} "
            f"\nstale {latest['game/staleSteps']}              "
        )
    # if i % 100 == 0:
    #     stats = ppo.all_stats[-1]
    #     for k, v in stats.items():
    #         # if "time" in k:
    #         print(k, v)

episodeLength [349.85714285714283, 305.42857142857144] episodeReward [-0.03571428571428571, -0.09523809523809523] 
epoch 0 steps 4096 
loss tensor([0.0157, 0.0170], dtype=torch.float64) policy tensor([-1.7234e-09, -2.7109e-09], dtype=torch.float64) 
value tensor([0.0157, 0.0170], dtype=torch.float64) entropy tensor([2.7052, 2.7051]) 
stale 0              
episodeLength [249.9375, 362.0] episodeReward [-0.09375, 0.078125] 
epoch 10 steps 45056 
loss tensor([0.0032, 0.0004], dtype=torch.float64) policy tensor([ 0.0017, -0.0019], dtype=torch.float64) 
value tensor([0.0015, 0.0023], dtype=torch.float64) entropy tensor([2.6921, 2.6817]) 
stale 952              
episodeLength [234.5, 377.3333333333333] episodeReward [-0.08333333333333333, -0.16666666666666666] 
epoch 20 steps 86016 
loss tensor([0.0029, 0.0030], dtype=torch.float64) policy tensor([0.0024, 0.0022], dtype=torch.float64) 
value tensor([0.0006, 0.0008], dtype=torch.float64) entropy tensor([2.6947, 2.6930]) 
stale 991            

In [None]:
# saveAll("vitNegT8BigFin400")