In [1]:
%cd RL_study/dreamer/

/mnt/c/Users/mingu/OneDrive/바탕 화면/성균관대/리서치인턴/공부/RL_study/dreamer


In [2]:
import gymnasium as gym
import random
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal, kl_divergence

from tqdm import tqdm
from models import *
from logger import Logger


# Seed every random number generator the notebook draws from so that a
# Restart-and-Run-All produces comparable runs.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make('CarRacing-v2')
# Gymnasium environments are seeded through reset(); later env.reset() calls
# without a seed keep drawing from this seeded generator.
env.reset(seed=SEED)
env.action_space.seed(SEED)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


2024-10-04 09:28:52.825363: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-04 09:28:52.838239: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-04 09:28:52.846089: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-04 09:28:52.861292: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-04 09:28:52.864940: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attemptin

In [3]:
# Cache the environment's interface sizes as module-level values; the model
# construction cell further down reads both of them.
obs_shape = env.observation_space.shape
action_dim = env.action_space.shape[0]
print("action space: ", action_dim, ", obs shape: ", obs_shape, sep='')

action space: 3, obs shape: (96, 96, 3)


In [4]:
def collect_data(env,state_dim, transition_representation, agent,replay_buffer, num_episode, device, training=True):
    """Roll the agent out in the environment and return the mean episode score.

    During each episode the latent state is tracked with
    ``transition_representation.posterior`` and actions are produced by
    ``agent`` and squashed with ``tanh``.

    Args:
        env: Gymnasium-style environment; ``reset()`` must return
            ``(obs, info)`` and ``step()`` the 5-tuple ending in
            ``terminated, truncated, info``. Observations are expected in
            (H, W, C) layout because they are permuted to channels-first.
        state_dim: Width of the sampled latent state vector.
        transition_representation: Model exposing ``init_hidden(batch)`` and
            ``posterior(prev_state, prev_action, prev_deter, obs)`` returning
            ``(mean, std, deter)``.
        agent: Policy called as ``agent(state, deter)`` and returning
            ``(action_mu, action_std)``.
        replay_buffer: Object with ``push(transition)``; written to only when
            ``training`` is true.
        num_episode: Number of episodes to run (must be non-zero, the score is
            divided by it).
        device: Torch device the model inputs are moved to.
        training: When true, Gaussian noise scaled by ``action_std`` is added
            before the tanh and every transition is pushed to the buffer.
            When false, the mean action is used and nothing is stored.

    Returns:
        Sum of all rewards over all episodes divided by ``num_episode``.
    """
    print("collecting data...")
    # Take the action size from the environment itself rather than depending
    # on a notebook-level `action_dim` global having been defined earlier.
    action_dim = env.action_space.shape[0]
    score = 0
    for _ in tqdm(range(num_episode)):
        obs, info = env.reset()
        done = False
        experience = []
        # Every episode starts from a zero latent state, the model's initial
        # hidden state and a zero "previous action".
        prev_state = torch.zeros(1, state_dim).to(device)
        prev_deter = transition_representation.init_hidden(1).to(device)
        prev_action = torch.zeros(1, action_dim).to(device)
        with torch.no_grad():
            while not done:
                # (H, W, C) frame -> (1, C, H, W) float tensor scaled by 1/255
                obs = torch.tensor(obs, dtype=torch.float32).permute(2,0,1).unsqueeze(0).to(device)/255
                # Filter: previous state/action + current observation -> current state
                posterior_mean, posterior_std, prev_deter = transition_representation.posterior(prev_state, prev_action, prev_deter,obs)
                cur_state = posterior_mean + posterior_std*torch.normal(0, 1, posterior_mean.size()).to(device)

                action_mu, action_std = agent(cur_state, prev_deter)
                eps = torch.normal(0, 1, (1,action_dim)).to(device)
                if training:
                    # Explore: sample around the mean before squashing.
                    cur_action = torch.tanh(action_mu + action_std*eps)
                else:
                    # Evaluate: act on the mean only.
                    cur_action = torch.tanh(action_mu)
                next_obs, reward, terminated, truncated, info  = env.step(cur_action[0].cpu().numpy())
                done = terminated or truncated

                # Stored transition: (normalised channels-first obs, action, reward, done)
                experience.append((np.array(obs.squeeze(0).cpu()), np.array(cur_action.squeeze(0).detach().cpu()), reward, done))

                obs = next_obs
                prev_state = cur_state
                prev_action = cur_action
                score+=reward
        if training:
            for exp in experience:
                replay_buffer.push(exp)
    return score/num_episode

In [5]:
def lambda_return(rewards, values, gamma, lambda_):
    """Blend n-step bootstrapped returns into a lambda-weighted target.

    Args:
        rewards: Tensor whose first axis is time (horizon + 1 entries).
        values: Tensor of the same shape as ``rewards``.
        gamma: Discount factor applied per time step.
        lambda_: Mixing weight between successive n-step returns.

    Returns:
        Tensor shaped like ``rewards``. For n = 1..H-1 the n-step return is
        weighted by ``(1 - lambda_) * lambda_**(n-1)``; the final n = H return
        is weighted by ``lambda_**(H-1)``. Rows closer than n steps to the end
        keep the longest n-step return that was available for them. With a
        single time step the loop never runs and zeros are returned.
    """
    horizon = rewards.shape[0] - 1
    blended = torch.zeros_like(rewards, device=rewards.device)

    # Running buffer of n-step returns; the last row can only bootstrap.
    n_step = torch.zeros_like(rewards, device=rewards.device)
    n_step[horizon] = values[horizon]

    for n in range(1, horizon + 1):
        # Bootstrap term: discounted value n steps ahead.
        n_step[:-n] = (gamma ** n) * values[n:]
        # Add the n discounted rewards leading up to that value.
        for k in range(1, n + 1):
            # For k == n the slice runs to the end of the tensor.
            stop = None if k == n else k - n
            n_step[:-n] += (gamma ** (k - 1)) * rewards[k:stop]

        # The final term absorbs the remaining lambda mass.
        if n == horizon:
            weight = lambda_ ** (horizon - 1)
        else:
            weight = (1 - lambda_) * (lambda_ ** (n - 1))
        blended += weight * n_step

    return blended

In [6]:
def train(batch,state_dim,deterministic_dim, device, transition_representation, reward_model, observation, actor, value, model_optimizer, actor_optimizer, critic_optimizer,
          beta=0.1, imagine_horizon=15, gamma=0.99, lambda_=0.95):
    """Run one world-model update followed by one actor and one critic update.

    Args:
        batch: Iterable of sequences; each sequence is an iterable of
            ``(obs, action, reward, done)`` tuples (``done`` is not used).
        state_dim: Width of the sampled latent state.
        deterministic_dim: Width of the deterministic state returned by the
            transition model.
        device: Torch device for all tensors created here.
        transition_representation: Called as ``model(state, action, deter)``
            for the prior and ``model.posterior(state, action, deter, obs)``
            for the posterior; both return ``(mean, std, deter)``.
        reward_model: Called as ``reward_model(state, deter)``.
        observation: Decoder called as ``observation(state, deter)``.
        actor: Called as ``actor(state, deter)`` returning ``(mu, std)``.
        value: Critic called as ``value(state, deter)``.
        model_optimizer: Optimizer for the world-model parameters.
        actor_optimizer: Optimizer for ``actor``.
        critic_optimizer: Optimizer for ``value``.
        beta: Weight of the KL term in the world-model loss.
        imagine_horizon: Number of imagined steps rolled out from each state.
        gamma: Discount factor passed to ``lambda_return``.
        lambda_: Lambda mixing weight passed to ``lambda_return``.

    Returns:
        ``(kl, reconstruction, reward, actor_loss, critic_loss)`` as Python
        floats; the first three are averaged over the ``seq_len - 1``
        world-model steps.
    """
    # ---- Unpack the sampled sequences into (batch, seq, ...) tensors ----
    obs_seq = torch.tensor(np.array([[step[0] for step in seq] for seq in batch]), dtype=torch.float32).to(device)
    action_seq = torch.tensor(np.array([[step[1] for step in seq] for seq in batch]), dtype=torch.float32).to(device)
    reward_seq = torch.tensor(np.array([[step[2] for step in seq] for seq in batch]), dtype=torch.float32).to(device)
    batch_size, seq_len = obs_seq.shape[:2]

    prev_deter = transition_representation.init_hidden(batch_size).to(device)
    prev_state = torch.zeros(batch_size, state_dim).to(device)

    # Posterior states per time step. Index 0 is never written (the loop
    # starts at t=1), so it stays all-zero and is still used below as an
    # imagination start point.
    # NOTE(review): confirm that starting imagination from the zero row is intended.
    states = torch.zeros(seq_len,batch_size, state_dim).to(device)
    deters = torch.zeros(seq_len,batch_size, deterministic_dim).to(device)

    total_kl_loss = 0
    total_reconstruction_loss = 0
    total_reward_loss = 0

    # ---- World model: reconstruction + reward + beta * KL over the sequence ----
    action_prev = action_seq[:, 0]
    total_loss=torch.zeros(1).to(device)
    for t in range(1,seq_len):
        obs = obs_seq[:, t]
        reward = reward_seq[:, t]
        prior_mean, prior_std, _ = transition_representation(prev_state, action_prev, prev_deter)
        posterior_mean, posterior_std, cur_deter = transition_representation.posterior(prev_state, action_prev, prev_deter,obs)

        # Reparameterised sample from the posterior.
        state = posterior_mean + posterior_std*torch.normal(0, 1, posterior_mean.size()).to(device)
        obs_pred = observation(state, cur_deter)
        reconstruction_loss = nn.functional.mse_loss(obs_pred, obs)

        # Drop the trailing singleton axis so prediction and target shapes match.
        reward_pred = reward_model(state, cur_deter).squeeze(1)
        reward_loss = nn.functional.mse_loss(reward_pred, reward)

        prior = Normal(prior_mean, prior_std)
        posterior = Normal(posterior_mean, posterior_std)
        kl_loss = kl_divergence(posterior, prior).mean()

        total_loss += reconstruction_loss + reward_loss + beta*kl_loss

        action_prev = action_seq[:, t]
        prev_state = state
        prev_deter = cur_deter

        states[t] = state
        deters[t] = cur_deter

        total_kl_loss += kl_loss.item()
        total_reconstruction_loss += reconstruction_loss.item()
        total_reward_loss += reward_loss.item()
    model_optimizer.zero_grad()
    total_loss.backward()
    model_optimizer.step()

    # ---- Actor / critic: imagine forward from every posterior state ----
    print("training actor, critic...")
    # (seq, batch, dim) -> (seq*batch, dim); detached so the behaviour update
    # does not backpropagate into the world-model pass above.
    states = states.view(-1, state_dim).detach()
    deters = deters.view(-1, deterministic_dim).detach()

    imagined_states = [states]
    imagined_deters = [deters]

    # squeeze(-1) removes only the trailing axis, so a single-row input keeps
    # its batch axis (a bare squeeze() would collapse it to a scalar).
    rewards = [reward_model(states, deters).squeeze(-1)]
    values = [value(states, deters).squeeze(-1)]

    for t in range(1,imagine_horizon+1):
        action_mu, action_std = actor(imagined_states[t-1], imagined_deters[t-1])
        eps = torch.normal(0, 1, (action_mu.size())).to(device)
        action = torch.tanh(action_mu + action_std*eps)

        # No observation is available while imagining, so sample from the prior.
        prior_mean, prior_std, deter = transition_representation(imagined_states[t-1], action, imagined_deters[t-1])
        state = prior_mean + prior_std*torch.normal(0, 1, prior_mean.size()).to(device)

        imagined_states.append(state)
        imagined_deters.append(deter)

        rewards.append(reward_model(state, deter).squeeze(-1))
        values.append(value(state, deter).squeeze(-1))

    values = torch.stack(values, dim=0)
    rewards = torch.stack(rewards, dim=0)

    # Use the configured discount / lambda instead of hard-coded literals.
    returns = lambda_return(rewards, values, gamma, lambda_)

    critic_loss = nn.functional.mse_loss(values[1:],returns[1:].detach())
    actor_loss = -returns.mean()

    # Both losses share one graph. Compute both gradients before either
    # optimizer modifies parameters in place, and restrict each backward pass
    # to its own network so neither loss leaks gradients into the other
    # network or into the world model.
    critic_params = [p for p in value.parameters() if p.requires_grad]
    actor_params = [p for p in actor.parameters() if p.requires_grad]
    critic_optimizer.zero_grad()
    actor_optimizer.zero_grad()
    critic_loss.backward(retain_graph=True, inputs=critic_params)
    actor_loss.backward(inputs=actor_params)

    torch.nn.utils.clip_grad_norm_(value.parameters(), max_norm=100)
    torch.nn.utils.clip_grad_norm_(actor.parameters(), max_norm=100)
    critic_optimizer.step()
    actor_optimizer.step()

    print("actor loss: ",actor_loss.item(),", critic loss: ",critic_loss.item(),sep='')

    return total_kl_loss/(seq_len-1), total_reconstruction_loss/(seq_len-1), total_reward_loss/(seq_len-1), actor_loss.item(), critic_loss.item()
    
    

In [7]:
# Latent sizes and learning rates shared by the cells below.
state_dim, deterministic_dim = 64, 256
model_lr = 1e-4
actor_critc_lr = 1e-4

# World model: transition/representation model, observation decoder, reward head.
transition_representation = TransitionRepresentationModel(state_dim, action_dim).to(device)
observation = ObservationModel(state_dim, deterministic_dim, obs_shape[2]).to(device)
reward = RewardModel(state_dim, deterministic_dim).to(device)

# Behaviour: policy and value network.
agent = Agent(state_dim, deterministic_dim, action_dim).to(device)
value = ValueModel(state_dim, deterministic_dim).to(device)

# One optimizer over all world-model parameters, one each for actor and critic.
model_params = [
    param
    for module in (transition_representation, observation, reward)
    for param in module.parameters()
]
model_optimizer = optim.Adam(model_params, lr=model_lr)
actor_optimizer = optim.Adam(agent.parameters(), lr=actor_critc_lr)
critic_optimizer = optim.Adam(value.parameters(), lr=actor_critc_lr)

# Replay buffer constructed with 100000 as its argument; the training cell
# fills it via collect_data and draws fixed-length sequences with sample_seq.
replay_buffer = ReplayBufferSeq(100000)
logger = Logger('./logs')

: 

In [8]:
# ---- Run configuration ----
num_epochs = 10000
batch_size = 64
seq_len = 50

world_episodes = 3   # episodes collected (and evaluated) per epoch
update_step = 20     # gradient updates per epoch

seed_episodes = 5    # episodes collected before the first epoch
test_interval = 3    # evaluate every N epochs
save_interval = 20   # checkpoint every N epochs


def save_checkpoint():
    """Write the current weights of every model to its .pth file in the working directory."""
    torch.save(transition_representation.state_dict(), 'transition_representation.pth')
    torch.save(observation.state_dict(), 'observation.pth')
    torch.save(reward.state_dict(), 'reward.pth')
    torch.save(agent.state_dict(), 'agent.pth')
    torch.save(value.state_dict(), 'value.pth')


print("collecting seed data...")
collect_data(env,state_dim, transition_representation, agent, replay_buffer, seed_episodes, device)

for epoch in range(num_epochs):
    train_score=collect_data(env,state_dim, transition_representation, agent, replay_buffer, world_episodes, device)
    logger.log(epoch*update_step,train_score=train_score)

    # Skip the rest of the epoch (updates, evaluation and checkpoint) until
    # the buffer holds at least batch_size*seq_len transitions.
    if len(replay_buffer) < batch_size*seq_len:
        continue
    print(len(replay_buffer))

    # Train the world model, actor and critic.
    # The loop index is named explicitly: it is used in the logged step, and
    # `_` is IPython's last-output alias and should not be overwritten.
    for update in range(update_step):
        batch = replay_buffer.sample_seq(batch_size, seq_len)
        kl_loss,reconst_loss, reward_loss, actor_loss, critic_loss=train(batch,state_dim,deterministic_dim, device, transition_representation, reward, observation, agent, value, model_optimizer, actor_optimizer, critic_optimizer)
        logger.log(epoch*update_step+update,epoch=epoch, kl_loss=kl_loss, reconst_loss=reconst_loss, reward_loss=reward_loss, actor_loss=actor_loss, critic_loss=critic_loss)

    if epoch % test_interval == 0:
        # training=False: mean actions, nothing pushed to the replay buffer.
        test_score=collect_data(env,state_dim, transition_representation, agent, replay_buffer, world_episodes, device,training=False)
        logger.log(epoch*update_step,test_score=test_score)
    if epoch % save_interval == 0:
        save_checkpoint()

# Final checkpoint once all epochs have finished.
save_checkpoint()

collecting seed data...
collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:03<00:00, 12.69s/it]


collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.98s/it]

2024-10-04 09:30:36,637 global_step: 0,train_score: -45.385780615726496, 





8000


  obs_seq = torch.tensor(obs_seq, dtype=torch.float32).to(device)
  reward_loss = nn.functional.mse_loss(reward_pred, reward)


training actor, critic...
actor loss: 0.5180847644805908, critic loss: 0.28133466839790344
2024-10-04 09:30:49,071 global_step: 0,epoch: 0, kl_loss: 0.0465136615046281, reconst_loss: 0.06141179153809742, reward_loss: 0.14672660635195064, actor_loss: 0.5180847644805908, critic_loss: 0.28133466839790344, 
training actor, critic...
actor loss: 0.43933549523353577, critic loss: 0.182479128241539
2024-10-04 09:30:59,367 global_step: 1,epoch: 0, kl_loss: 0.0455660763842871, reconst_loss: 0.06215268267052514, reward_loss: 0.2354550215235094, actor_loss: 0.43933549523353577, critic_loss: 0.182479128241539, 
training actor, critic...
actor loss: 0.37368133664131165, critic loss: 0.11487583070993423
2024-10-04 09:31:09,355 global_step: 2,epoch: 0, kl_loss: 0.04516511601490938, reconst_loss: 0.06195729986137273, reward_loss: 0.17653874023247282, actor_loss: 0.37368133664131165, critic_loss: 0.11487583070993423, 
training actor, critic...
actor loss: 0.32428258657455444, critic loss: 0.07298120856

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.61s/it]

2024-10-04 09:34:27,992 global_step: 0,test_score: 18.13186319161436, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:44<00:00, 14.67s/it]

2024-10-04 09:35:12,369 global_step: 20,train_score: -40.57974824680562, 





11000
training actor, critic...
actor loss: 0.6808317303657532, critic loss: 0.10232876241207123
2024-10-04 09:35:21,754 global_step: 20,epoch: 1, kl_loss: 0.031500272015679856, reconst_loss: 0.05828988574901406, reward_loss: 0.20949900846415181, actor_loss: 0.6808317303657532, critic_loss: 0.10232876241207123, 
training actor, critic...
actor loss: 0.6832895278930664, critic loss: 0.09022171050310135
2024-10-04 09:35:30,923 global_step: 21,epoch: 1, kl_loss: 0.030890718843711883, reconst_loss: 0.05832663033993877, reward_loss: 0.20402368033133753, actor_loss: 0.6832895278930664, critic_loss: 0.09022171050310135, 
training actor, critic...
actor loss: 0.6849983334541321, critic loss: 0.07827065885066986
2024-10-04 09:35:40,040 global_step: 22,epoch: 1, kl_loss: 0.030243416925017932, reconst_loss: 0.058521484766079455, reward_loss: 0.2298392050956585, actor_loss: 0.6849983334541321, critic_loss: 0.07827065885066986, 
training actor, critic...
actor loss: 0.69305020570755, critic loss: 0

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:34<00:00, 11.53s/it]

2024-10-04 09:38:52,109 global_step: 40,train_score: -24.205909348286735, 





14000
training actor, critic...
actor loss: 1.1982152462005615, critic loss: 0.09734126925468445
2024-10-04 09:39:01,163 global_step: 40,epoch: 2, kl_loss: 0.02147642217519484, reconst_loss: 0.04977826203922836, reward_loss: 0.2176303232411797, actor_loss: 1.1982152462005615, critic_loss: 0.09734126925468445, 
training actor, critic...
actor loss: 1.225172996520996, critic loss: 0.09397320449352264
2024-10-04 09:39:10,395 global_step: 41,epoch: 2, kl_loss: 0.0212439403261001, reconst_loss: 0.04958156078141563, reward_loss: 0.23272727196560983, actor_loss: 1.225172996520996, critic_loss: 0.09397320449352264, 
training actor, critic...
actor loss: 1.2415794134140015, critic loss: 0.08373169600963593
2024-10-04 09:39:19,469 global_step: 42,epoch: 2, kl_loss: 0.020703630289062858, reconst_loss: 0.05006854076470647, reward_loss: 0.23619782843398957, actor_loss: 1.2415794134140015, critic_loss: 0.08373169600963593, 
training actor, critic...
actor loss: 1.2491090297698975, critic loss: 0.070

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:39<00:00, 13.27s/it]

2024-10-04 09:42:33,833 global_step: 60,train_score: -49.960653792068975, 





17000
training actor, critic...
actor loss: 2.023080348968506, critic loss: 0.08578900247812271
2024-10-04 09:42:43,003 global_step: 60,epoch: 3, kl_loss: 0.015682090938623463, reconst_loss: 0.045122177169031025, reward_loss: 0.1584618837306542, actor_loss: 2.023080348968506, critic_loss: 0.08578900247812271, 
training actor, critic...
actor loss: 2.1225953102111816, critic loss: 0.08995955437421799
2024-10-04 09:42:52,271 global_step: 61,epoch: 3, kl_loss: 0.015488251345232129, reconst_loss: 0.0453438046483361, reward_loss: 0.2128380358561265, actor_loss: 2.1225953102111816, critic_loss: 0.08995955437421799, 
training actor, critic...
actor loss: 2.2252440452575684, critic loss: 0.08662063628435135
2024-10-04 09:43:02,472 global_step: 62,epoch: 3, kl_loss: 0.015369460423838119, reconst_loss: 0.04524308807995855, reward_loss: 0.222147020554094, actor_loss: 2.2252440452575684, critic_loss: 0.08662063628435135, 
training actor, critic...
actor loss: 2.3502230644226074, critic loss: 0.087

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:38<00:00, 12.83s/it]

2024-10-04 09:46:18,080 global_step: 60,test_score: -93.10868201068031, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.41s/it]

2024-10-04 09:46:58,323 global_step: 80,train_score: -93.13928868819839, 





20000
training actor, critic...
actor loss: 7.959059715270996, critic loss: 0.47426536679267883
2024-10-04 09:47:07,375 global_step: 80,epoch: 4, kl_loss: 0.008591430670372685, reconst_loss: 0.035733500001381854, reward_loss: 0.2601017670430319, actor_loss: 7.959059715270996, critic_loss: 0.47426536679267883, 
training actor, critic...
actor loss: 8.092012405395508, critic loss: 0.46709170937538147
2024-10-04 09:47:16,415 global_step: 81,epoch: 4, kl_loss: 0.008297206729422418, reconst_loss: 0.035344552400769016, reward_loss: 0.2505299281804081, actor_loss: 8.092012405395508, critic_loss: 0.46709170937538147, 
training actor, critic...
actor loss: 8.143653869628906, critic loss: 0.4743824303150177
2024-10-04 09:47:25,461 global_step: 82,epoch: 4, kl_loss: 0.008187980229529192, reconst_loss: 0.03381947146690622, reward_loss: 0.16865815760149638, actor_loss: 8.143653869628906, critic_loss: 0.4743824303150177, 
training actor, critic...
actor loss: 8.14395523071289, critic loss: 0.4429574

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:34<00:00, 11.50s/it]

2024-10-04 09:50:34,323 global_step: 100,train_score: -93.12822851680771, 





23000
training actor, critic...
actor loss: 5.077959060668945, critic loss: 0.11478555202484131
2024-10-04 09:50:44,258 global_step: 100,epoch: 5, kl_loss: 0.005406432015327167, reconst_loss: 0.020356987660028497, reward_loss: 0.23172599372539518, actor_loss: 5.077959060668945, critic_loss: 0.11478555202484131, 
training actor, critic...
actor loss: 5.0248894691467285, critic loss: 0.10869759321212769
2024-10-04 09:50:53,646 global_step: 101,epoch: 5, kl_loss: 0.005256889117121392, reconst_loss: 0.019909548744255184, reward_loss: 0.20257097774669908, actor_loss: 5.0248894691467285, critic_loss: 0.10869759321212769, 
training actor, critic...
actor loss: 4.991273880004883, critic loss: 0.09975573420524597
2024-10-04 09:51:02,777 global_step: 102,epoch: 5, kl_loss: 0.005148085546964894, reconst_loss: 0.0196505890284874, reward_loss: 0.20070641073493325, actor_loss: 4.991273880004883, critic_loss: 0.09975573420524597, 
training actor, critic...
actor loss: 4.9755682945251465, critic loss:

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:39<00:00, 13.29s/it]

2024-10-04 09:54:18,077 global_step: 120,train_score: -93.47998800130453, 





26000
training actor, critic...
actor loss: 6.44756555557251, critic loss: 0.1339159458875656
2024-10-04 09:54:32,309 global_step: 120,epoch: 6, kl_loss: 0.003792242085257051, reconst_loss: 0.01373792000647102, reward_loss: 0.11011664845923684, actor_loss: 6.44756555557251, critic_loss: 0.1339159458875656, 
training actor, critic...
actor loss: 6.379302024841309, critic loss: 0.13147017359733582
2024-10-04 09:54:41,730 global_step: 121,epoch: 6, kl_loss: 0.0037494328072560684, reconst_loss: 0.01451866418047219, reward_loss: 0.21663206130769863, actor_loss: 6.379302024841309, critic_loss: 0.13147017359733582, 
training actor, critic...
actor loss: 6.33648681640625, critic loss: 0.13331502676010132
2024-10-04 09:54:51,217 global_step: 122,epoch: 6, kl_loss: 0.003667163916825488, reconst_loss: 0.013946238563072925, reward_loss: 0.14883846606897685, actor_loss: 6.33648681640625, critic_loss: 0.13331502676010132, 
training actor, critic...
actor loss: 6.333540916442871, critic loss: 0.13383

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.73s/it]

2024-10-04 09:58:07,595 global_step: 120,test_score: -93.44236185523967, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:43<00:00, 14.55s/it]

2024-10-04 09:58:51,257 global_step: 140,train_score: -92.21939763280393, 





29000
training actor, critic...
actor loss: 6.083479881286621, critic loss: 0.10632853209972382
2024-10-04 09:59:02,853 global_step: 140,epoch: 7, kl_loss: 0.0033180087267858337, reconst_loss: 0.01145254514579262, reward_loss: 0.11418336793918124, actor_loss: 6.083479881286621, critic_loss: 0.10632853209972382, 
training actor, critic...
actor loss: 6.22578239440918, critic loss: 0.10489938408136368
2024-10-04 09:59:12,396 global_step: 141,epoch: 7, kl_loss: 0.0033264091494968353, reconst_loss: 0.011483695757176195, reward_loss: 0.1385133317873661, actor_loss: 6.22578239440918, critic_loss: 0.10489938408136368, 
training actor, critic...
actor loss: 6.37975549697876, critic loss: 0.10548047721385956
2024-10-04 09:59:21,531 global_step: 142,epoch: 7, kl_loss: 0.0032554058267791966, reconst_loss: 0.011331113644552474, reward_loss: 0.19484436707880007, actor_loss: 6.37975549697876, critic_loss: 0.10548047721385956, 
training actor, critic...
actor loss: 6.535158634185791, critic loss: 0.1

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.58s/it]

2024-10-04 10:02:36,296 global_step: 160,train_score: -93.35956441219416, 





32000
training actor, critic...
actor loss: 6.645365238189697, critic loss: 0.10383959114551544
2024-10-04 10:02:49,193 global_step: 160,epoch: 8, kl_loss: 0.0027135148577924284, reconst_loss: 0.009425457319890966, reward_loss: 0.10923001277070417, actor_loss: 6.645365238189697, critic_loss: 0.10383959114551544, 
training actor, critic...
actor loss: 6.645007610321045, critic loss: 0.10300573706626892
2024-10-04 10:02:58,322 global_step: 161,epoch: 8, kl_loss: 0.002754152826109559, reconst_loss: 0.009591666050255299, reward_loss: 0.10827057148871601, actor_loss: 6.645007610321045, critic_loss: 0.10300573706626892, 
training actor, critic...
actor loss: 6.639404773712158, critic loss: 0.10034840553998947
2024-10-04 10:03:07,521 global_step: 162,epoch: 8, kl_loss: 0.0026556682269259983, reconst_loss: 0.010345236645365248, reward_loss: 0.14170679977942943, actor_loss: 6.639404773712158, critic_loss: 0.10034840553998947, 
training actor, critic...
actor loss: 6.63866662979126, critic loss:

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.86s/it]

2024-10-04 10:06:17,089 global_step: 180,train_score: -92.81039781728327, 





35000
training actor, critic...
actor loss: 6.888499736785889, critic loss: 0.09816811233758926
2024-10-04 10:06:28,200 global_step: 180,epoch: 9, kl_loss: 0.0023894357810994343, reconst_loss: 0.008736148353057856, reward_loss: 0.1419854277022639, actor_loss: 6.888499736785889, critic_loss: 0.09816811233758926, 
training actor, critic...
actor loss: 6.67781400680542, critic loss: 0.09136965870857239
2024-10-04 10:06:37,756 global_step: 181,epoch: 9, kl_loss: 0.0025060954147341605, reconst_loss: 0.010032261227618675, reward_loss: 0.16391828292933275, actor_loss: 6.67781400680542, critic_loss: 0.09136965870857239, 
training actor, critic...
actor loss: 6.503688812255859, critic loss: 0.08771155029535294
2024-10-04 10:06:47,066 global_step: 182,epoch: 9, kl_loss: 0.0023204438371716867, reconst_loss: 0.00818319029497857, reward_loss: 0.12009153401298563, actor_loss: 6.503688812255859, critic_loss: 0.08771155029535294, 
training actor, critic...
actor loss: 6.3719611167907715, critic loss: 

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.76s/it]

2024-10-04 10:10:04,725 global_step: 180,test_score: -93.3842752200306, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:43<00:00, 14.53s/it]

2024-10-04 10:10:48,332 global_step: 200,train_score: -92.99388408127042, 





38000
training actor, critic...
actor loss: 7.78507661819458, critic loss: 0.12300669401884079
2024-10-04 10:10:58,828 global_step: 200,epoch: 10, kl_loss: 0.0022102444176086964, reconst_loss: 0.008645693914090492, reward_loss: 0.0977066202820944, actor_loss: 7.78507661819458, critic_loss: 0.12300669401884079, 
training actor, critic...
actor loss: 7.818826198577881, critic loss: 0.1187894269824028
2024-10-04 10:11:08,299 global_step: 201,epoch: 10, kl_loss: 0.002081270733544109, reconst_loss: 0.007297686519747486, reward_loss: 0.11293629872997539, actor_loss: 7.818826198577881, critic_loss: 0.1187894269824028, 
training actor, critic...
actor loss: 7.824929237365723, critic loss: 0.12169671803712845
2024-10-04 10:11:17,534 global_step: 202,epoch: 10, kl_loss: 0.0021272615579489085, reconst_loss: 0.008140126756411426, reward_loss: 0.1355578833086207, actor_loss: 7.824929237365723, critic_loss: 0.12169671803712845, 
training actor, critic...
actor loss: 7.809103012084961, critic loss: 0

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.78s/it]

2024-10-04 10:14:27,626 global_step: 220,train_score: -92.88652091002761, 





41000
training actor, critic...
actor loss: 7.25114631652832, critic loss: 0.09766591340303421
2024-10-04 10:14:37,739 global_step: 220,epoch: 11, kl_loss: 0.0019883304865251543, reconst_loss: 0.008403650190377114, reward_loss: 0.15122321035982614, actor_loss: 7.25114631652832, critic_loss: 0.09766591340303421, 
training actor, critic...
actor loss: 7.515231609344482, critic loss: 0.0994890108704567
2024-10-04 10:14:46,858 global_step: 221,epoch: 11, kl_loss: 0.0019293130216264756, reconst_loss: 0.008885237093710778, reward_loss: 0.14483394553856355, actor_loss: 7.515231609344482, critic_loss: 0.0994890108704567, 
training actor, critic...
actor loss: 7.760721206665039, critic loss: 0.110483817756176
2024-10-04 10:14:55,994 global_step: 222,epoch: 11, kl_loss: 0.0019734513866287483, reconst_loss: 0.008418116547471407, reward_loss: 0.12470754224222572, actor_loss: 7.760721206665039, critic_loss: 0.110483817756176, 
training actor, critic...
actor loss: 7.959385871887207, critic loss: 0.

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:34<00:00, 11.40s/it]

2024-10-04 10:18:05,842 global_step: 240,train_score: -92.42330528993232, 





44000
training actor, critic...
actor loss: 7.144186973571777, critic loss: 0.08179407566785812
2024-10-04 10:18:17,320 global_step: 240,epoch: 12, kl_loss: 0.001796928675113512, reconst_loss: 0.006320697844636684, reward_loss: 0.07598491078860374, actor_loss: 7.144186973571777, critic_loss: 0.08179407566785812, 
training actor, critic...
actor loss: 7.119193077087402, critic loss: 0.08077140897512436
2024-10-04 10:18:26,670 global_step: 241,epoch: 12, kl_loss: 0.0017596960307707135, reconst_loss: 0.00722624824325345, reward_loss: 0.12023428733322808, actor_loss: 7.119193077087402, critic_loss: 0.08077140897512436, 
training actor, critic...
actor loss: 7.102990627288818, critic loss: 0.07881191372871399
2024-10-04 10:18:35,888 global_step: 242,epoch: 12, kl_loss: 0.0017590092427372857, reconst_loss: 0.007766656235468631, reward_loss: 0.10823400408903859, actor_loss: 7.102990627288818, critic_loss: 0.07881191372871399, 
training actor, critic...
actor loss: 7.109726428985596, critic lo

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.06s/it]

2024-10-04 10:21:56,337 global_step: 240,test_score: -93.19098107785844, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.05s/it]

2024-10-04 10:22:38,481 global_step: 260,train_score: -92.79304921184935, 





47000
training actor, critic...
actor loss: 7.639471054077148, critic loss: 0.09071583300828934
2024-10-04 10:22:48,385 global_step: 260,epoch: 13, kl_loss: 0.001578702841414975, reconst_loss: 0.006859782665055625, reward_loss: 0.09840520209756357, actor_loss: 7.639471054077148, critic_loss: 0.09071583300828934, 
training actor, critic...
actor loss: 7.35458517074585, critic loss: 0.0791340097784996
2024-10-04 10:22:57,578 global_step: 261,epoch: 13, kl_loss: 0.0015081772735171324, reconst_loss: 0.006295794972731751, reward_loss: 0.07412525951121078, actor_loss: 7.35458517074585, critic_loss: 0.0791340097784996, 
training actor, critic...
actor loss: 7.087250709533691, critic loss: 0.07351567596197128
2024-10-04 10:23:06,834 global_step: 262,epoch: 13, kl_loss: 0.0015424423374720297, reconst_loss: 0.00690465794914231, reward_loss: 0.15628560003824532, actor_loss: 7.087250709533691, critic_loss: 0.07351567596197128, 
training actor, critic...
actor loss: 6.858292102813721, critic loss: 

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.99s/it]

2024-10-04 10:26:20,099 global_step: 280,train_score: -93.04741937602125, 





50000
training actor, critic...
actor loss: 8.200356483459473, critic loss: 0.10307971388101578
2024-10-04 10:26:30,965 global_step: 280,epoch: 14, kl_loss: 0.0013950298527227144, reconst_loss: 0.005791434704573179, reward_loss: 0.09717648277981017, actor_loss: 8.200356483459473, critic_loss: 0.10307971388101578, 
training actor, critic...
actor loss: 8.242693901062012, critic loss: 0.1031142994761467
2024-10-04 10:26:39,933 global_step: 281,epoch: 14, kl_loss: 0.0014950590741545037, reconst_loss: 0.006276183885199075, reward_loss: 0.08475098198835682, actor_loss: 8.242693901062012, critic_loss: 0.1031142994761467, 
training actor, critic...
actor loss: 8.247540473937988, critic loss: 0.10287461429834366
2024-10-04 10:26:48,977 global_step: 282,epoch: 14, kl_loss: 0.001482019520469238, reconst_loss: 0.007792228291153299, reward_loss: 0.13198339791103667, actor_loss: 8.247540473937988, critic_loss: 0.10287461429834366, 
training actor, critic...
actor loss: 8.219576835632324, critic los

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:42<00:00, 14.20s/it]

2024-10-04 10:30:06,647 global_step: 300,train_score: -93.0860847994772, 





53000
training actor, critic...
actor loss: 7.559849739074707, critic loss: 0.0726112574338913
2024-10-04 10:30:19,992 global_step: 300,epoch: 15, kl_loss: 0.0013714549734556516, reconst_loss: 0.005722367964989069, reward_loss: 0.09105832335700718, actor_loss: 7.559849739074707, critic_loss: 0.0726112574338913, 
training actor, critic...
actor loss: 7.591652870178223, critic loss: 0.07611655443906784
2024-10-04 10:30:29,428 global_step: 301,epoch: 15, kl_loss: 0.0014227870884422707, reconst_loss: 0.006111837362832561, reward_loss: 0.11016790799520985, actor_loss: 7.591652870178223, critic_loss: 0.07611655443906784, 
training actor, critic...
actor loss: 7.630435466766357, critic loss: 0.07591193914413452
2024-10-04 10:30:38,803 global_step: 302,epoch: 15, kl_loss: 0.0013845335429044897, reconst_loss: 0.005399550168718003, reward_loss: 0.12008366489198477, actor_loss: 7.630435466766357, critic_loss: 0.07591193914413452, 
training actor, critic...
actor loss: 7.675459861755371, critic lo

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:40<00:00, 13.46s/it]

2024-10-04 10:33:58,898 global_step: 300,test_score: -93.30420405970888, 





collecting data...


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:41<00:00, 13.87s/it]

2024-10-04 10:34:40,526 global_step: 320,train_score: -92.97004692073263, 





56000
training actor, critic...
actor loss: 7.761111736297607, critic loss: 0.07217442989349365
2024-10-04 10:34:50,501 global_step: 320,epoch: 16, kl_loss: 0.0012375323205226461, reconst_loss: 0.005663793477020702, reward_loss: 0.17234579206216244, actor_loss: 7.761111736297607, critic_loss: 0.07217442989349365, 
training actor, critic...
actor loss: 7.763628959655762, critic loss: 0.07441224902868271
2024-10-04 10:34:59,829 global_step: 321,epoch: 16, kl_loss: 0.0012707863028555615, reconst_loss: 0.006341393324261417, reward_loss: 0.1358437740905638, actor_loss: 7.763628959655762, critic_loss: 0.07441224902868271, 
training actor, critic...
actor loss: 7.784878253936768, critic loss: 0.07166476547718048
2024-10-04 10:35:08,973 global_step: 322,epoch: 16, kl_loss: 0.001187611624303901, reconst_loss: 0.005353522135362942, reward_loss: 0.06049851672569461, actor_loss: 7.784878253936768, critic_loss: 0.07166476547718048, 
training actor, critic...
actor loss: 7.820186614990234, critic lo

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:35<00:00, 11.93s/it]

2024-10-04 10:38:22,248 global_step: 340,train_score: -92.85945259883488, 


: 

: 