In [1]:
import torch
from procgen import ProcgenGym3Env
from torchinfo import summary
import core

# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Placeholders for the objects that later cells create and that loadAll/saveAll
# read as module-level globals.
model = player = ppo = env = None
envKW = dict()

# Root directory under which every checkpoint folder is written / read.
modelPath = "models/"
def loadAll(fname, loadEnv=True):
    """Restore model, player and PPO state from the checkpoint folder ``modelPath + fname``.

    Args:
        fname: Name of the checkpoint folder, appended to the module-level ``modelPath``.
        loadEnv: When true, also rebuild the Procgen environment from the saved
            ``envKW.pth`` keyword arguments and push the saved ``env_states.pth``
            into it via ``set_state``; the module-level ``env`` and ``envKW`` are
            rebound to the restored values. When false, the environment is left
            alone and ``player.reset()`` is called instead.
    """
    # Without this declaration the assignments below would create function locals,
    # so the rebuilt environment (and its kwargs) would be thrown away on return
    # and the notebook would keep using the old module-level env.
    global env, envKW

    ckptDir = modelPath + fname
    model.load_state_dict(torch.load(ckptDir + "/model.pth"))
    player.load_state_dict(torch.load(ckptDir + "/player.pth"))
    ppo.load_state_dict(torch.load(ckptDir + "/ppo.pth"))
    if loadEnv:
        envKW = torch.load(ckptDir + "/envKW.pth")
        env = ProcgenGym3Env(**envKW)
        env.callmethod("set_state", torch.load(ckptDir + "/env_states.pth"))
        # NOTE(review): player and ppo were handed the previous env object when they
        # were constructed; confirm whether they must be rebuilt (or re-pointed) to
        # actually step the restored environment.
    else:
        player.reset()

def saveAll(fname):
    """Write a checkpoint of the current run into the folder ``modelPath + fname``.

    The folder is created if needed. Saved files: model / player / ppo state dicts,
    the environment kwargs, the environment state returned by ``get_state``, the
    accumulated PPO stats and the player's training-episode stats.

    Args:
        fname: Name of the checkpoint folder, appended to the module-level ``modelPath``.
    """
    import os

    outDir = modelPath + fname
    os.makedirs(outDir, exist_ok=True)

    def _dump(obj, suffix):
        # Serialize one object next to the others in the checkpoint folder.
        torch.save(obj, outDir + suffix)

    _dump(model.state_dict(), "/model.pth")
    _dump(player.state_dict(), "/player.pth")
    _dump(ppo.state_dict(), "/ppo.pth")
    _dump(envKW, "/envKW.pth")
    _dump(env.callmethod("get_state"), "/env_states.pth")
    _dump(ppo.all_stats, "/stats.pth")
    _dump(player.trainEpisodeStats, "/trainEpisodeStats.pth")


cuda:0


In [2]:
# Population layout: the environment is created with num_models * num_agents slots.
num_models = 2
num_agents = 64

# Keyword arguments for the Procgen environment; kept in envKW so saveAll can persist them.
envKW = core.getKW(
    num=num_models*num_agents,
    env_name="coinrun",
    distribution_mode="easy",
    paint_vel_info=True,
    use_backgrounds=False,
    restrict_themes=True,
)
env = ProcgenGym3Env(**envKW)

# Show the observation space followed by the action space, one per line.
print(env.ob_space, env.ac_space, sep="\n")

Dict(rgb=D256[64,64,3])
D15[]


In [3]:
from CVModels import CNNAgent, ViTValue, ImpalaValue, VectorModelValue
from CVModels import avgSync, sumSync
# Name of the backbone to train; it is also embedded in the checkpoint names.
modelName = "Impala"

# Build one backbone for the selected architecture.
if modelName == "Impala":
    model = ImpalaValue().to(device)
elif modelName == "ViT60":
    model = ViTValue(depth=4, num_heads=4, embed_dim=32, mlp_ratio=4, valueHeadLayers=1).to(device)
elif modelName == "ViT15":
    model = ViTValue(depth=3, num_heads=4, embed_dim=16, mlp_ratio=4, valueHeadLayers=1).to(device)
elif modelName == "resnet":
    model = CNNAgent([64, 64, 3], 15, channels=16, layers=[1,1,1,1], scale=[1,1,1,1], vheadLayers=1).to(device)
else:
    # Fail fast: otherwise `model` would keep whatever value it had before this cell
    # ran (None on a fresh kernel, or an already wrapped model on a re-run) and
    # would be silently wrapped below.
    raise ValueError(f"Unknown modelName {modelName!r}; expected 'Impala', 'ViT60', 'ViT15' or 'resnet'")

# Wrap the backbone for num_models parallel learners, synchronised with avgSync.
model = VectorModelValue(model, n=num_models, syncFunc=avgSync).to(device)
model.train()
# NOTE(review): input_size is hard-coded; presumably (num_models, batch, C, H, W) — confirm
# before changing num_models.
summary(model, input_size=(2, 2, 3, 64, 64))

Layer (type:depth-idx)                        Output Shape              Param #
VectorModelValue                              [2, 2, 15]                --
├─ModuleList: 1-1                             --                        --
│    └─ImpalaValue: 2-1                       [2, 15]                   --
│    │    └─ImpalaModel: 3-1                  [2, 256]                  622,144
│    │    └─Linear: 3-2                       [2, 15]                   3,855
│    │    └─ValueHead: 3-3                    [2, 1]                    257
│    └─ImpalaValue: 2-2                       [2, 15]                   --
│    │    └─ImpalaModel: 3-4                  [2, 256]                  622,144
│    │    └─Linear: 3-5                       [2, 15]                   3,855
│    │    └─ValueHead: 3-6                    [2, 1]                    257
Total params: 1,252,512
Trainable params: 1,252,512
Non-trainable params: 0
Total mult-adds (M): 125.39
Input size (MB): 0.20
Forward/backward pass size

In [4]:
# model.load_state_dict(torch.load(modelPath + "vitNegT8BigFin" + "/model.pth"))
from PPO import VectorPPO
from ProcgenPlayer import VectorPlayer

# --- Optimisation settings -------------------------------------------------
lr = 2.5e-4          # alternative tried: 1e-3
ent_coef = 1e-2      # alternative tried: 0

# --- Discounting and reward shaping ----------------------------------------
gamma = 0.99         # alternative tried: 0.999
rewardScale = 10
terminateReward = 1 - 10.0 / rewardScale
livingReward = -1e-3 # alternative tried: 0

# livingReward / (1 - gamma) is the discounted sum of the living reward over an
# endless episode. If terminateReward > discountedSumLiving the agent will prefer
# to run into obstacles.
print(
    "terminateReward", terminateReward,
    "livingReward", livingReward,
    "discountedSumLiving", livingReward / (1 - gamma),
)

terminateReward 0.0 livingReward -0.001 discountedSumLiving -0.09999999999999991


In [5]:
# Rollout side: the player is bound to the shared environment and receives the
# reward-shaping values chosen in the hyperparameter cell.
player = VectorPlayer(
    env,
    num_agents=num_agents,
    num_models=num_models,
    epsilon=0.01,
    epsilon_decay=1,
    rewardScale=rewardScale,
    livingReward=livingReward,
    terminateReward=terminateReward,
    finishedOnly=True,
    maxStaleSteps=64,
)

# Training side: the PPO trainer gets the vectorised model, the same environment
# and the player created above.
ppo = VectorPPO(
    model,
    env,
    num_agents=num_agents,
    num_models=num_models,
    player=player,
    lr=lr,
    gamma=gamma,
    ent_coef=ent_coef,
    weight_decay=0.0,
    warmup_steps=10,
    train_steps=1000,
    sync_epochs=6,
    batch_size=1,
    epochs_per_game=3,
)



In [6]:
# from CVModels import printParams
# # printParams(model.modelList)

# print(list(model.modelList[0].parameters())[0].data, list(model.modelList[1].parameters())[0].data)
# print("before sync")
# model.sync()
# print("after sync")
# # printParams(model.modelList)
# print(list(model.modelList[0].parameters())[0].data, list(model.modelList[1].parameters())[0].data)

# paramList = list(zip(*[list(model.parameters()) for model in model.modelList]))
# print(len(paramList)) # 42 num layers
# print(len(paramList[0])) # 2 num models
# print(paramList[0][0]) # param

In [7]:
# ppo.runGame()
# loss = ppo.train(debug=True)
# print(loss)
# import torchviz
# torchviz.make_dot(torch.sum(loss), params=dict(model.named_parameters()))

In [8]:
# loadAll("vitNegT8BigFin")
# Record the effective configuration of the trainer, the player and the environment, one per line.
print(ppo.params, player.params, envKW, sep="\n")

{'alg_name': 'ppo', 'lr': 0.00025, 'gamma': 0.99, 'lam': 0.95, 'whiten': True, 'cliprange': 0.2, 'cliprange_value': 0.2, 'vf_coef': 0.5, 'epoch_steps': 256, 'epochs_per_game': 3, 'batch_size': 1, 'ent_coef': 0.01, 'weight_decay': 0.0, 'warmup_steps': 10, 'train_steps': 1000, 'sync_epochs': 6, 'sync_steps': -1}
{'alg_name': 'ppo', 'epsilon': 0.01, 'epsilon_decay': 1, 'rewardScale': 10, 'livingReward': -0.001, 'terminateReward': 0.0, 'finishedOnly': True, 'maxStaleSteps': 64}
{'num': 128, 'env_name': 'coinrun', 'distribution_mode': 'easy', 'paint_vel_info': True, 'use_backgrounds': False, 'restrict_themes': True}


In [9]:
def _checkpointName(stats):
    """Return the checkpoint folder name for the given stats entry and the current run settings."""
    return f"vector{modelName}{stats['epoch']}RS{rewardScale}G{gamma}Lv{livingReward!=0}_4-9"


epochsPerGame = ppo.params['epochs_per_game']
# Save roughly every 50 epochs. Clamp to at least 1: with epochs_per_game > 50 the
# floor division yields 0 and `i % 0` would raise ZeroDivisionError.
saveEvery = max(1, 50 // epochsPerGame)

# Each iteration plays one game and then trains on it; 300 epochs in total.
for i in range(300 // epochsPerGame):
    ppo.runGame()
    ppo.train()
    stats = ppo.all_stats[-1]  # latest stats entry, read after this iteration's training
    if i % 10 == 0:
        # Full multi-line report every tenth iteration.
        print(f"episodeLength {stats['game/episodeLength']} episodeReward {stats['game/episodeReward']} " +
              f"\nepoch {stats['epoch']} steps {stats['steps']} " +
              f"\nloss {stats['ppo/loss/total']} policy {stats['ppo/loss/policy']} " +
              f"\nvalue {stats['ppo/loss/value']} entropy {stats['ppo/policy/entropy']} " +
              f"\ncomms {stats['sync/comms']} data {stats['sync/data']} " +
              f"\nstale {stats['game/staleSteps']}              ")
    else:
        # Compact status line; the carriage return makes the next print overwrite it.
        print(f"episodeLength {stats['game/episodeLength']} episodeReward {stats['game/episodeReward']}               ", end="\r")

    if i % saveEvery == 0:
        saveAll(_checkpointName(stats))
# Final checkpoint once the loop has completed.
saveAll(_checkpointName(ppo.all_stats[-1]))

episodeLength [174.57142857142858, 162.57142857142858] episodeReward [-0.1745714368631265, -0.01971429158999984] 
epoch 2 steps 32768 
loss tensor([-0.0300, -0.0289], dtype=torch.float64) policy tensor([-0.0037, -0.0026], dtype=torch.float64) 
value tensor([0.0004, 0.0004], dtype=torch.float64) entropy tensor([2.6759, 2.6728]) 
comms 2 data 9.555908203125 
stale 0              
episodeLength [255.57142857142858, 522.7878787878788] episodeReward [0.19897401970871664, -0.15915153528366124] 
epoch 32 steps 360448 
loss tensor([-0.0300, -0.0303], dtype=torch.float64) policy tensor([-0.0039, -0.0036], dtype=torch.float64) 
value tensor([0.0005, 0.0002], dtype=torch.float64) entropy tensor([2.6597, 2.6926]) 
comms 12 data 57.33544921875 
stale 64              
episodeLength [141.0096153846154, 160.74774774774775] episodeReward [0.3109134606806597, 0.3257387373900578] 
epoch 62 steps 688128 
loss tensor([-0.0331, -0.0323], dtype=torch.float64) policy tensor([-0.0083, -0.0071], dtype=torch.flo

KeyboardInterrupt: 