In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Attention Mechanism
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.W1 = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
        self.W2 = nn.Linear(self.hidden_dim, self.hidden_dim, bias=False)
        self.V = nn.Linear(self.hidden_dim, 1, bias=False)
        self.tanh = nn.Tanh()

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight.data)

    def forward(self, enc_outputs, dec_output, mask):
        w1_e = self.W1(enc_outputs)
        w2_d = self.W2(dec_output)
        tanh_output = self.tanh(w1_e + w2_d)
        v_dot_tanh = self.V(tanh_output).squeeze(2)
        v_dot_tanh += mask
        attention_weights = F.softmax(v_dot_tanh, dim=1)
        return attention_weights

# Encoder Network
class Encoder(nn.Module):
    def __init__(self, hidden_dim, input_dim=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.cell = nn.GRU(self.input_dim, self.hidden_dim, 1, batch_first=True)

    def forward(self, input):
        enc_output, enc_hidden_state = self.cell(input)
        return enc_output, enc_hidden_state

# Decoder Network
class Decoder(nn.Module):
    def __init__(self, hidden_dim, input_dim=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.cell = nn.GRU(self.input_dim, self.hidden_dim, 1, batch_first=True)
        self.attention_layer = Attention(self.hidden_dim)

    def forward(self, input, enc_output, hidden_state, pointer, mask):
        idx = pointer.repeat(1, 2).unsqueeze(1)
        dec_output, dec_hidden = self.cell(input.gather(1, idx), hidden_state)
        attention_weights = self.attention_layer(enc_output, dec_output, mask)
        return attention_weights, dec_hidden

ModuleNotFoundError: No module named 'torch'

In [None]:
class Critic(nn.Module):
    def __init__(self, hidden_dim, input_dim=2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim

        self.encoder = Encoder(self.hidden_dim)
        self.decoder_1 = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.decoder_2 = nn.Linear(self.hidden_dim, 1)
        self.relu = nn.ReLU()

        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight.data)

    def forward(self, input):
        enc_output, enc_hidden_state = self.encoder(input)
        dec_hidden_state = self.decoder_1(enc_hidden_state)
        dec_hidden_state = self.relu(dec_hidden_state)
        dec_output = self.decoder_2(dec_hidden_state)
        return dec_output.squeeze(0)


In [None]:
class PCTSPEnvironment:
    def __init__(self, num_cities, max_reward, max_distance):
        self.num_cities = num_cities
        self.max_reward = max_reward
        self.max_distance = max_distance
        self.reset()

    def reset(self):
        self.city_coordinates = torch.rand((self.num_cities, 2))
        self.city_rewards = torch.randint(1, self.max_reward, (self.num_cities,))
        self.distance_travelled = 0
        self.current_city = 0
        self.visited_cities = set([0])
        print("Random Data for Current Episode:")
        print("City Coordinates:\n", self.city_coordinates)
        print("City Rewards:\n", self.city_rewards)
        print("distance_travelled:\n", self.distance_travelled)

    def get_state(self):
        return self.city_coordinates

    def step(self, action):
        next_city = action.item()
        reward = 0
        done = False

        distance_to_next_city = torch.norm(self.city_coordinates[self.current_city] - self.city_coordinates[next_city])

        if next_city not in self.visited_cities and self.distance_travelled + distance_to_next_city <= self.max_distance:
            reward += self.city_rewards[next_city]
            self.distance_travelled += distance_to_next_city
            self.visited_cities.add(next_city)
            self.current_city = next_city
        else:
            done = True

        return self.get_state(), reward, done


In [None]:
# PtrNet Network
class PtrNet(nn.Module):
    def __init__(self, hidden_dim, input_dim=2, deterministic=False):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.deterministic = deterministic
        self.encoder = Encoder(self.hidden_dim)
        self.decoder = Decoder(self.hidden_dim)
    
    def get_length(self, input, path_indices):
        """
        Calculate the total length of the path.
        Args:
        - input: Tensor of city coordinates.
        - path_indices: Indices of the cities in the path.
        Returns:
        - Total length of the path.
        """
        batch_size, _, _ = input.size()
        path_length = path_indices.size(1)  # 실제 경로 길이
        length = 0

        path_indices = path_indices.long()

        for i in range(path_length - 1):  # 실제 경로 길이를 기준으로 반복
            current_city = path_indices[:, i]
            next_city = path_indices[:, i + 1]
            distance = torch.norm(input[torch.arange(batch_size), current_city] - input[torch.arange(batch_size), next_city], dim=1)
            length += distance

        # 마지막 도시에서 첫 번째 도시로 돌아가는 거리 계산
        last_city = path_indices[:, -1]
        first_city = path_indices[:, 0]
        distance = torch.norm(input[torch.arange(batch_size), last_city] - input[torch.arange(batch_size), first_city], dim=1)
        length += distance
        print("step length: ",length)

        return length
    
    def forward(self, input, city_rewards, max_distance):
        batch_size = input.size(0)
        seq_len = input.size(1)

        # 초기화
        probs = torch.zeros(batch_size, 1, device=input.device)
        pointers = torch.zeros(batch_size, 0, dtype=torch.long, device=input.device)
        mask = torch.zeros(batch_size, seq_len, device=input.device)
        distance_travelled = torch.zeros(batch_size, 1, device=input.device)
        total_reward = torch.zeros(batch_size, 1, device=input.device, requires_grad=True)
        prev_pointer = torch.zeros(batch_size, 1, dtype=torch.long, device=input.device)
        pointer = torch.zeros(batch_size, 1, dtype=torch.long, device=input.device)  # 추가된 초기화

        enc_output, enc_hidden_state = self.encoder(input)
        mask = self.update_mask(mask, pointer)  # 이제 pointer가 정의됨
        dec_hidden_state = enc_hidden_state

        for i in range(seq_len - 1):
            attention_weights, dec_hidden_state = self.decoder(input, enc_output, dec_hidden_state, pointer, mask)
            # ... (나머지 코드)

            if self.deterministic:
                prob, pointer = torch.max(attention_weights, dim=1)
                pointer = pointer.unsqueeze(1)  # 포인터 차원 변경
            else:
                pointer = attention_weights.multinomial(1, replacement=True)
                prob = torch.gather(attention_weights, 1, pointer)

            probs += torch.log(prob)
            pointers = torch.cat([pointers, pointer], dim=1)
            mask = self.update_mask(mask, pointer)

            # Calculate distance and reward
            distance = torch.norm(input[torch.arange(batch_size), prev_pointer.squeeze(-1)] - input[torch.arange(batch_size), pointer.squeeze(-1)], dim=1).unsqueeze(1)
            distance_travelled += distance
            
            # Assuming city_rewards is of shape [batch_size, num_cities]
            
            pointer_for_reward = pointer.squeeze(-1)  # [batch_size, 1] -> [batch_size]
            pointer_for_reward = pointer_for_reward.unsqueeze(1)  # Reshape to [batch_size, 1] for gather

            if city_rewards.dim() == 1:
                # If city_rewards is a 1D tensor, unsqueeze it to match dimensions for gather
                reward = torch.gather(city_rewards.unsqueeze(0), 1, pointer_for_reward)
            elif city_rewards.dim() == 2:
                # If city_rewards is a 2D tensor
                reward = torch.gather(city_rewards, 1, pointer_for_reward)
            else:
                raise ValueError("Unsupported shape for city_rewards")

            total_reward = total_reward + reward.squeeze(1)

            prev_pointer = pointer

            # Check if distance exceeds max distance
            if torch.any(distance_travelled > max_distance):
                break

        return total_reward, pointers

    def update_mask(self, mask, pointer):
        for batch, i in enumerate(pointer):
            mask[batch, i] = float('-inf')
        return mask
    

In [None]:
# Training Loop
num_cities = 100
max_reward = 100
max_distance = 10
hidden_dim = 256
lr = 0.0001

env = PCTSPEnvironment(num_cities, max_reward, max_distance)
dataset = TensorDataset(env.get_state().unsqueeze(0))
dataloader = DataLoader(dataset, batch_size=64)

model = PtrNet(hidden_dim=hidden_dim)
critic = Critic(hidden_dim=hidden_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer_critic = torch.optim.Adam(critic.parameters(), lr=lr)

# Training Loop
# ...

for epoch in range(1000):
    for i, s_i in tqdm(enumerate(dataloader)):
        s_i = s_i[0]
        city_rewards = env.city_rewards

        # 모델을 통한 예측
        total_reward, pi = model(s_i, city_rewards, max_distance)
        L = model.get_length(s_i, pi)
        b = critic(s_i)

        # 첫 번째 손실 계산
        loss = -total_reward.mean()  # 보상 최대화
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()

        # 두 번째 손실 계산
        loss_critic = F.mse_loss(L.squeeze(), b.squeeze())  # 차원 일치를 위해 squeeze 사용
        optimizer_critic.zero_grad()
        loss_critic.backward()  # 여기서는 retain_graph=True 필요 없음
        optimizer_critic.step()

        if (i + 1) % 1 == 0:
            print(f"Epoch: {epoch + 1}, Step: {i + 1}, Reward: {total_reward.mean().item()}, Loss: {loss.item()}")

print("Training completed.")



Random Data for Current Episode:
City Coordinates:
 tensor([[0.3624, 0.7666],
        [0.1314, 0.2474],
        [0.4037, 0.3024],
        [0.8362, 0.7349],
        [0.0964, 0.0315],
        [0.6602, 0.2739],
        [0.0802, 0.7884],
        [0.5368, 0.1451],
        [0.4808, 0.4080],
        [0.4113, 0.6874],
        [0.0349, 0.8853],
        [0.1420, 0.8162],
        [0.3445, 0.5759],
        [0.2930, 0.1449],
        [0.4886, 0.9847],
        [0.0458, 0.0929],
        [0.5563, 0.5625],
        [0.8495, 0.8150],
        [0.9506, 0.1575],
        [0.9018, 0.7592],
        [0.0554, 0.1295],
        [0.2693, 0.1970],
        [0.3492, 0.2665],
        [0.7836, 0.8150],
        [0.9080, 0.0599],
        [0.0060, 0.6412],
        [0.5082, 0.0822],
        [0.2194, 0.9191],
        [0.9887, 0.0479],
        [0.2015, 0.6008],
        [0.2692, 0.5937],
        [0.8079, 0.7084],
        [0.0437, 0.2660],
        [0.0924, 0.2377],
        [0.1883, 0.7958],
        [0.1012, 0.6955],
        [0.1

1it [00:00,  8.42it/s]


step length:  tensor([10.6025])
Epoch: 1, Step: 1, Reward: 872.0, Loss: -872.0


1it [00:00, 18.80it/s]


step length:  tensor([9.8811])
Epoch: 2, Step: 1, Reward: 1037.0, Loss: -1037.0


1it [00:00, 22.68it/s]


step length:  tensor([10.6728])
Epoch: 3, Step: 1, Reward: 1001.0, Loss: -1001.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.88it/s]


 tensor([10.0817])
Epoch: 4, Step: 1, Reward: 1079.0, Loss: -1079.0


1it [00:00, 19.73it/s]


step length:  tensor([10.1849])
Epoch: 5, Step: 1, Reward: 1097.0, Loss: -1097.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.96it/s]


 tensor([10.6861])
Epoch: 6, Step: 1, Reward: 868.0, Loss: -868.0


0it [00:00, ?it/s]

step length:  tensor([10.0325])
Epoch: 7, Step: 1, Reward: 1154.0, Loss: -1154.0


1it [00:00, 19.75it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9756])


1it [00:00, 19.54it/s]


Epoch: 8, Step: 1, Reward: 1076.0, Loss: -1076.0


0it [00:00, ?it/s]

step length:  tensor([9.7542])
Epoch: 9, Step: 1, Reward: 1024.0, Loss: -1024.0


1it [00:00, 21.20it/s]
1it [00:00, 19.51it/s]


step length:  tensor([10.5893])
Epoch: 10, Step: 1, Reward: 905.0, Loss: -905.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.21it/s]


 tensor([9.8924])
Epoch: 11, Step: 1, Reward: 880.0, Loss: -880.0


0it [00:00, ?it/s]

step length:  tensor([10.3193])
Epoch: 12, Step: 1, Reward: 1064.0, Loss: -1064.0


1it [00:00, 15.53it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3213])
Epoch: 13, Step: 1, Reward: 626.0, Loss: -626.0


1it [00:00, 20.58it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3833])
Epoch: 14, Step: 1, Reward: 962.0, Loss: -962.0


1it [00:00, 28.30it/s]
1it [00:00, 27.68it/s]


step length:  tensor([9.9452])
Epoch: 15, Step: 1, Reward: 995.0, Loss: -995.0


1it [00:00, 19.02it/s]


step length:  tensor([10.6838])
Epoch: 16, Step: 1, Reward: 1242.0, Loss: -1242.0


0it [00:00, ?it/s]

step length:  tensor([10.5166])


1it [00:00, 15.31it/s]


Epoch: 17, Step: 1, Reward: 1024.0, Loss: -1024.0


0it [00:00, ?it/s]

step length:  tensor([10.5353])


1it [00:00, 21.07it/s]


Epoch: 18, Step: 1, Reward: 1066.0, Loss: -1066.0


1it [00:00, 29.84it/s]

step length:  tensor([10.8401])
Epoch: 19, Step: 1, Reward: 840.0, Loss: -840.0



1it [00:00, 28.55it/s]


step length:  tensor([10.0504])
Epoch: 20, Step: 1, Reward: 772.0, Loss: -772.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.72it/s]


 tensor([9.8597])
Epoch: 21, Step: 1, Reward: 862.0, Loss: -862.0


0it [00:00, ?it/s]

step length:  tensor([9.5770])
Epoch: 22, Step: 1, Reward: 990.0, Loss: -990.0


1it [00:00, 20.32it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 19.89it/s]


 tensor([10.0096])
Epoch: 23, Step: 1, Reward: 809.0, Loss: -809.0


0it [00:00, ?it/s]

step length:  tensor([9.7808])
Epoch: 24, Step: 1, Reward: 916.0, Loss: -916.0


1it [00:00, 17.18it/s]
1it [00:00, 27.77it/s]


step length:  tensor([10.6051])
Epoch: 25, Step: 1, Reward: 1029.0, Loss: -1029.0


0it [00:00, ?it/s]

step length:  tensor([11.0724])
Epoch: 26, Step: 1, Reward: 943.0, Loss: -943.0


1it [00:00, 15.66it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8862])
Epoch: 27, Step: 1, Reward: 757.0, Loss: -757.0


1it [00:00, 28.08it/s]
1it [00:00, 20.08it/s]


step length:  tensor([10.7418])
Epoch: 28, Step: 1, Reward: 994.0, Loss: -994.0


1it [00:00, 20.34it/s]


step length:  tensor([10.5699])
Epoch: 29, Step: 1, Reward: 918.0, Loss: -918.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.45it/s]


tensor([10.7719])
Epoch: 30, Step: 1, Reward: 1007.0, Loss: -1007.0


0it [00:00, ?it/s]

step length:  tensor([10.5381])
Epoch: 31, Step: 1, Reward: 1022.0, Loss: -1022.0


1it [00:00, 20.40it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 29.76it/s]


 tensor([10.2726])
Epoch: 32, Step: 1, Reward: 1016.0, Loss: -1016.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.71it/s]


 tensor([10.3626])
Epoch: 33, Step: 1, Reward: 1090.0, Loss: -1090.0


1it [00:00, 26.78it/s]


step length:  tensor([10.7213])
Epoch: 34, Step: 1, Reward: 1016.0, Loss: -1016.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 15.23it/s]


 tensor([10.7823])
Epoch: 35, Step: 1, Reward: 701.0, Loss: -701.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.91it/s]


tensor([10.4698])
Epoch: 36, Step: 1, Reward: 1019.0, Loss: -1019.0


0it [00:00, ?it/s]

step length:  tensor([10.8770])
Epoch: 37, Step: 1, Reward: 1019.0, Loss: -1019.0


1it [00:00, 19.52it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3291])


1it [00:00, 20.22it/s]


Epoch: 38, Step: 1, Reward: 921.0, Loss: -921.0


1it [00:00, 18.93it/s]


step length:  tensor([10.4760])
Epoch: 39, Step: 1, Reward: 1034.0, Loss: -1034.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.65it/s]


tensor([9.9900])
Epoch: 40, Step: 1, Reward: 1041.0, Loss: -1041.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.91it/s]


 tensor([10.2819])
Epoch: 41, Step: 1, Reward: 1149.0, Loss: -1149.0


0it [00:00, ?it/s]

step length:  tensor([10.7123])
Epoch: 42, Step: 1, Reward: 931.0, Loss: -931.0


1it [00:00, 19.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3705])
Epoch: 43, Step: 1, Reward: 839.0, Loss: -839.0


1it [00:00, 20.22it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2054])
Epoch: 44, Step: 1, Reward: 741.0, Loss: -741.0


1it [00:00, 19.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1657])
Epoch: 45, Step: 1, Reward: 1062.0, Loss: -1062.0


1it [00:00, 20.70it/s]
1it [00:00, 17.04it/s]


step length:  tensor([10.4190])
Epoch: 46, Step: 1, Reward: 853.0, Loss: -853.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 24.47it/s]


 tensor([9.7231])
Epoch: 47, Step: 1, Reward: 884.0, Loss: -884.0


0it [00:00, ?it/s]

step length:  tensor([10.5664])
Epoch: 48, Step: 1, Reward: 914.0, Loss: -914.0


1it [00:00, 20.06it/s]
1it [00:00, 28.06it/s]


step length:  tensor([10.3716])
Epoch: 49, Step: 1, Reward: 1078.0, Loss: -1078.0


1it [00:00, 20.40it/s]


step length:  tensor([10.4677])
Epoch: 50, Step: 1, Reward: 986.0, Loss: -986.0


1it [00:00, 20.03it/s]


step length:  tensor([10.4878])
Epoch: 51, Step: 1, Reward: 789.0, Loss: -789.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.87it/s]


 tensor([10.1269])
Epoch: 52, Step: 1, Reward: 890.0, Loss: -890.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.89it/s]


 tensor([10.5314])
Epoch: 53, Step: 1, Reward: 965.0, Loss: -965.0


1it [00:00, 19.80it/s]


step length:  tensor([10.1521])
Epoch: 54, Step: 1, Reward: 1196.0, Loss: -1196.0


1it [00:00, 20.35it/s]


step length:  tensor([10.4959])
Epoch: 55, Step: 1, Reward: 925.0, Loss: -925.0


1it [00:00, 19.57it/s]


step length:  tensor([10.5437])
Epoch: 56, Step: 1, Reward: 1098.0, Loss: -1098.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 28.62it/s]


tensor([10.0653])
Epoch: 57, Step: 1, Reward: 852.0, Loss: -852.0


1it [00:00, 19.92it/s]


step length:  tensor([9.6410])
Epoch: 58, Step: 1, Reward: 854.0, Loss: -854.0


1it [00:00, 20.24it/s]


step length:  tensor([10.9309])
Epoch: 59, Step: 1, Reward: 1114.0, Loss: -1114.0


1it [00:00, 14.83it/s]


step length:  tensor([10.5703])
Epoch: 60, Step: 1, Reward: 787.0, Loss: -787.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.97it/s]


tensor([10.3350])
Epoch: 61, Step: 1, Reward: 902.0, Loss: -902.0


0it [00:00, ?it/s]

step length:  tensor([10.3062])
Epoch: 62, Step: 1, Reward: 1030.0, Loss: -1030.0


1it [00:00, 19.71it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4688])
Epoch: 63, Step: 1, Reward: 1054.0, Loss: -1054.0


1it [00:00, 20.21it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5591])


1it [00:00, 20.23it/s]


Epoch: 64, Step: 1, Reward: 1063.0, Loss: -1063.0


1it [00:00, 19.89it/s]


step length:  tensor([10.0219])
Epoch: 65, Step: 1, Reward: 1037.0, Loss: -1037.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.14it/s]


tensor([10.6847])
Epoch: 66, Step: 1, Reward: 1329.0, Loss: -1329.0


0it [00:00, ?it/s]

step length:  tensor([9.9980])


1it [00:00, 18.68it/s]


Epoch: 67, Step: 1, Reward: 1038.0, Loss: -1038.0


0it [00:00, ?it/s]

step length:  tensor([10.2393])
Epoch: 68, Step: 1, Reward: 1098.0, Loss: -1098.0


1it [00:00, 20.13it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3181])


1it [00:00, 20.60it/s]


Epoch: 69, Step: 1, Reward: 810.0, Loss: -810.0


1it [00:00, 20.97it/s]


step length:  tensor([10.2304])
Epoch: 70, Step: 1, Reward: 1157.0, Loss: -1157.0


0it [00:00, ?it/s]

step length:  tensor([11.4703])
Epoch: 71, Step: 1, Reward: 1466.0, Loss: -1466.0


1it [00:00, 19.08it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8969])
Epoch: 72, Step: 1, Reward: 1191.0, Loss: -1191.0


1it [00:00, 15.70it/s]
0it [00:00, ?it/s]

step length:  tensor([11.3099])


1it [00:00, 21.02it/s]


Epoch: 73, Step: 1, Reward: 1267.0, Loss: -1267.0


0it [00:00, ?it/s]

step length:  tensor([10.1061])


1it [00:00, 19.64it/s]


Epoch: 74, Step: 1, Reward: 930.0, Loss: -930.0


1it [00:00, 19.80it/s]


step length:  tensor([9.9760])
Epoch: 75, Step: 1, Reward: 1048.0, Loss: -1048.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.04it/s]


 tensor([10.1129])
Epoch: 76, Step: 1, Reward: 1183.0, Loss: -1183.0


0it [00:00, ?it/s]

step length:  tensor([10.5803])


1it [00:00, 16.67it/s]


Epoch: 77, Step: 1, Reward: 922.0, Loss: -922.0


0it [00:00, ?it/s]

step length:  tensor([10.7747])
Epoch: 78, Step: 1, Reward: 963.0, Loss: -963.0


1it [00:00, 23.93it/s]
1it [00:00, 20.14it/s]


step length:  tensor([9.6496])
Epoch: 79, Step: 1, Reward: 730.0, Loss: -730.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.03it/s]

tensor([10.7470])
Epoch: 80, Step: 1, Reward: 901.0, Loss: -901.0



0it [00:00, ?it/s]

step length: 

1it [00:00, 20.00it/s]

 tensor([10.7523])
Epoch: 81, Step: 1, Reward: 880.0, Loss: -880.0



0it [00:00, ?it/s]

step length:  tensor([10.6044])
Epoch: 82, Step: 1, Reward: 1096.0, Loss: -1096.0


1it [00:00, 19.42it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6603])
Epoch: 83, Step: 1, Reward: 803.0, Loss: -803.0


1it [00:00, 20.08it/s]
1it [00:00, 21.03it/s]


step length:  tensor([10.2799])
Epoch: 84, Step: 1, Reward: 919.0, Loss: -919.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.77it/s]


tensor([9.8950])
Epoch: 85, Step: 1, Reward: 1003.0, Loss: -1003.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.42it/s]


 tensor([10.9255])
Epoch: 86, Step: 1, Reward: 1132.0, Loss: -1132.0


0it [00:00, ?it/s]

step length:  tensor([10.2843])


1it [00:00, 19.77it/s]


Epoch: 87, Step: 1, Reward: 880.0, Loss: -880.0


0it [00:00, ?it/s]

step length:  tensor([10.2319])
Epoch: 88, Step: 1, Reward: 1206.0, Loss: -1206.0


1it [00:00, 19.99it/s]
1it [00:00, 19.94it/s]


step length:  tensor([10.4375])
Epoch: 89, Step: 1, Reward: 1175.0, Loss: -1175.0


0it [00:00, ?it/s]

step length:  tensor([10.0748])
Epoch: 90, Step: 1, Reward: 1041.0, Loss: -1041.0


1it [00:00, 20.18it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5843])
Epoch: 91, Step: 1, Reward: 882.0, Loss: -882.0


1it [00:00, 18.97it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 21.04it/s]


 tensor([10.9576])
Epoch: 92, Step: 1, Reward: 1176.0, Loss: -1176.0


0it [00:00, ?it/s]

step length:  tensor([10.2423])
Epoch: 93, Step: 1, Reward: 982.0, Loss: -982.0


1it [00:00, 15.35it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3189])


1it [00:00, 19.75it/s]


Epoch: 94, Step: 1, Reward: 875.0, Loss: -875.0


0it [00:00, ?it/s]

step length:  tensor([10.5655])
Epoch: 95, Step: 1, Reward: 1022.0, Loss: -1022.0


1it [00:00, 19.85it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1989])
Epoch: 96, Step: 1, Reward: 1015.0, Loss: -1015.0


1it [00:00, 19.83it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7206])


1it [00:00, 20.10it/s]


Epoch: 97, Step: 1, Reward: 1185.0, Loss: -1185.0


0it [00:00, ?it/s]

step length:  tensor([9.7518])


1it [00:00, 20.31it/s]


Epoch: 98, Step: 1, Reward: 1024.0, Loss: -1024.0


1it [00:00, 19.93it/s]


step length:  tensor([10.6579])
Epoch: 99, Step: 1, Reward: 1075.0, Loss: -1075.0


0it [00:00, ?it/s]

step length:  tensor([10.2752])
Epoch: 100, Step: 1, Reward: 959.0, Loss: -959.0


1it [00:00, 16.10it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9975])
Epoch: 101, Step: 1, Reward: 991.0, Loss: -991.0


1it [00:00, 19.99it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 28.79it/s]


 tensor([10.5114])
Epoch: 102, Step: 1, Reward: 895.0, Loss: -895.0


0it [00:00, ?it/s]

step length:  tensor([10.2530])
Epoch: 103, Step: 1, Reward: 1271.0, Loss: -1271.0


1it [00:00, 18.94it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2431])
Epoch: 104, Step: 1, Reward: 947.0, Loss: -947.0


1it [00:00, 19.48it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6067])


1it [00:00, 19.49it/s]


Epoch: 105, Step: 1, Reward: 1174.0, Loss: -1174.0


0it [00:00, ?it/s]

step length:  tensor([10.2489])
Epoch: 106, Step: 1, Reward: 1099.0, Loss: -1099.0


1it [00:00, 20.73it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1418])
Epoch: 107, Step: 1, Reward: 1211.0, Loss: -1211.0


1it [00:00, 15.95it/s]
1it [00:00, 29.81it/s]

step length:  tensor([11.0425])
Epoch: 108, Step: 1, Reward: 1383.0, Loss: -1383.0



0it [00:00, ?it/s]

step length:  tensor([10.8093])
Epoch: 109, Step: 1, Reward: 942.0, Loss: -942.0


1it [00:00, 20.43it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1471])
Epoch: 110, Step: 1, Reward: 1056.0, Loss: -1056.0


1it [00:00, 19.21it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9132])


1it [00:00, 19.84it/s]


Epoch: 111, Step: 1, Reward: 919.0, Loss: -919.0


0it [00:00, ?it/s]

step length:  tensor([10.5978])


1it [00:00, 20.47it/s]


Epoch: 112, Step: 1, Reward: 809.0, Loss: -809.0


1it [00:00, 20.02it/s]


step length:  tensor([10.4356])
Epoch: 113, Step: 1, Reward: 952.0, Loss: -952.0


0it [00:00, ?it/s]

step length:  tensor([10.7705])
Epoch: 114, Step: 1, Reward: 860.0, Loss: -860.0


1it [00:00, 18.69it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1900])
Epoch: 115, Step: 1, Reward: 951.0, Loss: -951.0


1it [00:00, 20.00it/s]
0it [00:00, ?it/s]

step length:  tensor([9.5463])
Epoch: 116, Step: 1, Reward: 816.0, Loss: -816.0


1it [00:00, 22.71it/s]
1it [00:00, 20.92it/s]


step length:  tensor([10.1334])
Epoch: 117, Step: 1, Reward: 939.0, Loss: -939.0


0it [00:00, ?it/s]

step length:  tensor([10.5306])
Epoch: 118, Step: 1, Reward: 933.0, Loss: -933.0


1it [00:00, 18.41it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7020])
Epoch: 119, Step: 1, Reward: 787.0, Loss: -787.0


1it [00:00, 21.72it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8498])
Epoch: 120, Step: 1, Reward: 1136.0, Loss: -1136.0


1it [00:00, 20.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5118])


1it [00:00, 19.28it/s]


Epoch: 121, Step: 1, Reward: 855.0, Loss: -855.0


1it [00:00, 20.13it/s]


step length:  tensor([10.8997])
Epoch: 122, Step: 1, Reward: 1165.0, Loss: -1165.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.12it/s]


 tensor([10.5626])
Epoch: 123, Step: 1, Reward: 1289.0, Loss: -1289.0


0it [00:00, ?it/s]

step length:  tensor([10.1454])
Epoch: 124, Step: 1, Reward: 1049.0, Loss: -1049.0


1it [00:00, 18.80it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1705])
Epoch: 125, Step: 1, Reward: 932.0, Loss: -932.0


1it [00:00, 21.69it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9249])
Epoch: 126, Step: 1, Reward: 866.0, Loss: -866.0


1it [00:00, 20.34it/s]
1it [00:00, 19.20it/s]


step length:  tensor([10.8730])
Epoch: 127, Step: 1, Reward: 1153.0, Loss: -1153.0


1it [00:00, 25.89it/s]


step length:  tensor([10.0299])
Epoch: 128, Step: 1, Reward: 810.0, Loss: -810.0


1it [00:00, 20.62it/s]


step length:  tensor([10.4026])
Epoch: 129, Step: 1, Reward: 972.0, Loss: -972.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.56it/s]


 tensor([11.0436])
Epoch: 130, Step: 1, Reward: 913.0, Loss: -913.0


0it [00:00, ?it/s]

step length:  tensor([10.2052])
Epoch: 131, Step: 1, Reward: 1028.0, Loss: -1028.0


1it [00:00, 15.94it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1900])
Epoch: 132, Step: 1, Reward: 1132.0, Loss: -1132.0


1it [00:00, 19.22it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2697])
Epoch: 133, Step: 1, Reward: 1231.0, Loss: -1231.0


1it [00:00, 20.71it/s]
1it [00:00, 21.43it/s]


step length:  tensor([9.5355])
Epoch: 134, Step: 1, Reward: 834.0, Loss: -834.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 28.90it/s]


 tensor([10.5132])
Epoch: 135, Step: 1, Reward: 1042.0, Loss: -1042.0


0it [00:00, ?it/s]

step length:  tensor([10.9325])
Epoch: 136, Step: 1, Reward: 999.0, Loss: -999.0


1it [00:00, 15.77it/s]
1it [00:00, 18.87it/s]


step length:  tensor([10.5500])
Epoch: 137, Step: 1, Reward: 923.0, Loss: -923.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.58it/s]


 tensor([10.9755])
Epoch: 138, Step: 1, Reward: 1025.0, Loss: -1025.0


0it [00:00, ?it/s]

step length:  tensor([10.2008])
Epoch: 139, Step: 1, Reward: 900.0, Loss: -900.0


1it [00:00, 20.83it/s]
1it [00:00, 22.22it/s]


step length:  tensor([10.2928])
Epoch: 140, Step: 1, Reward: 1010.0, Loss: -1010.0


0it [00:00, ?it/s]

step length:  tensor([10.1939])
Epoch: 141, Step: 1, Reward: 1116.0, Loss: -1116.0


1it [00:00, 19.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0676])


1it [00:00, 17.93it/s]


Epoch: 142, Step: 1, Reward: 881.0, Loss: -881.0


1it [00:00, 20.98it/s]


step length:  tensor([10.2670])
Epoch: 143, Step: 1, Reward: 985.0, Loss: -985.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.98it/s]


 tensor([10.5985])
Epoch: 144, Step: 1, Reward: 660.0, Loss: -660.0


0it [00:00, ?it/s]

step length:  tensor([10.9408])
Epoch: 145, Step: 1, Reward: 830.0, Loss: -830.0


1it [00:00, 20.41it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3860])
Epoch: 146, Step: 1, Reward: 1181.0, Loss: -1181.0


1it [00:00, 19.06it/s]
1it [00:00, 20.18it/s]


step length:  tensor([10.9691])
Epoch: 147, Step: 1, Reward: 1135.0, Loss: -1135.0


0it [00:00, ?it/s]

step length:  tensor([10.2720])
Epoch: 148, Step: 1, Reward: 998.0, Loss: -998.0


1it [00:00, 16.49it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3269])
Epoch: 149, Step: 1, Reward: 984.0, Loss: -984.0


1it [00:00, 19.01it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6419])
Epoch: 150, Step: 1, Reward: 820.0, Loss: -820.0


1it [00:00, 20.36it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0100])
Epoch: 151, Step: 1, Reward: 1017.0, Loss: -1017.0


1it [00:00, 19.66it/s]
1it [00:00, 20.23it/s]


step length:  tensor([9.6364])
Epoch: 152, Step: 1, Reward: 723.0, Loss: -723.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.11it/s]


 tensor([10.5016])
Epoch: 153, Step: 1, Reward: 1067.0, Loss: -1067.0


0it [00:00, ?it/s]

step length:  tensor([10.3842])
Epoch: 154, Step: 1, Reward: 1129.0, Loss: -1129.0


1it [00:00, 15.48it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4782])
Epoch: 155, Step: 1, Reward: 761.0, Loss: -761.0


1it [00:00, 28.31it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9091])
Epoch: 156, Step: 1, Reward: 963.0, Loss: -963.0


1it [00:00, 24.69it/s]
1it [00:00, 25.72it/s]


step length:  tensor([9.9418])
Epoch: 157, Step: 1, Reward: 830.0, Loss: -830.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 26.37it/s]


 tensor([11.2188])
Epoch: 158, Step: 1, Reward: 887.0, Loss: -887.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.71it/s]


 tensor([10.5619])
Epoch: 159, Step: 1, Reward: 882.0, Loss: -882.0


1it [00:00, 20.00it/s]


step length:  tensor([10.3575])
Epoch: 160, Step: 1, Reward: 1003.0, Loss: -1003.0


0it [00:00, ?it/s]

step length:  tensor([10.6685])
Epoch: 161, Step: 1, Reward: 968.0, Loss: -968.0


1it [00:00, 16.30it/s]
1it [00:00, 19.14it/s]


step length:  tensor([10.6890])
Epoch: 162, Step: 1, Reward: 1080.0, Loss: -1080.0


0it [00:00, ?it/s]

step length:  tensor([10.6470])
Epoch: 163, Step: 1, Reward: 915.0, Loss: -915.0


1it [00:00, 19.83it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8582])
Epoch: 164, Step: 1, Reward: 919.0, Loss: -919.0


1it [00:00, 15.62it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3187])
Epoch: 165, Step: 1, Reward: 1044.0, Loss: -1044.0


1it [00:00, 20.86it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6035])
Epoch: 166, Step: 1, Reward: 1224.0, Loss: -1224.0


1it [00:00, 18.79it/s]
1it [00:00, 20.38it/s]


step length:  tensor([10.6249])
Epoch: 167, Step: 1, Reward: 1096.0, Loss: -1096.0


0it [00:00, ?it/s]

step length:  tensor([10.5253])
Epoch: 168, Step: 1, Reward: 1297.0, Loss: -1297.0


1it [00:00, 15.76it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6251])
Epoch: 169, Step: 1, Reward: 1036.0, Loss: -1036.0


1it [00:00, 21.76it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3926])
Epoch: 170, Step: 1, Reward: 852.0, Loss: -852.0


1it [00:00, 20.83it/s]
1it [00:00, 20.89it/s]

step length:  tensor([9.6088])
Epoch: 171, Step: 1, Reward: 1406.0, Loss: -1406.0



0it [00:00, ?it/s]

step length:  tensor([9.9004])
Epoch: 172, Step: 1, Reward: 976.0, Loss: -976.0


1it [00:00, 18.23it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3874])
Epoch: 173, Step: 1, Reward: 694.0, Loss: -694.0


1it [00:00, 20.71it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9277])


1it [00:00, 21.09it/s]


Epoch: 174, Step: 1, Reward: 889.0, Loss: -889.0


1it [00:00, 19.78it/s]


step length:  tensor([10.3157])
Epoch: 175, Step: 1, Reward: 1250.0, Loss: -1250.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.09it/s]


 tensor([10.4548])
Epoch: 176, Step: 1, Reward: 1000.0, Loss: -1000.0


0it [00:00, ?it/s]

step length:  tensor([10.7309])
Epoch: 177, Step: 1, Reward: 1050.0, Loss: -1050.0


1it [00:00, 18.73it/s]
0it [00:00, ?it/s]

step length:  tensor([9.4985])
Epoch: 178, Step: 1, Reward: 870.0, Loss: -870.0


1it [00:00, 16.44it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2392])
Epoch: 179, Step: 1, Reward: 845.0, Loss: -845.0


1it [00:00, 20.84it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0676])


1it [00:00, 19.52it/s]


Epoch: 180, Step: 1, Reward: 1168.0, Loss: -1168.0


1it [00:00, 20.17it/s]


step length:  tensor([10.1017])
Epoch: 181, Step: 1, Reward: 945.0, Loss: -945.0


0it [00:00, ?it/s]

step length:  tensor([10.0488])
Epoch: 182, Step: 1, Reward: 978.0, Loss: -978.0


1it [00:00, 19.26it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3211])
Epoch: 183, Step: 1, Reward: 822.0, Loss: -822.0


1it [00:00, 19.28it/s]
0it [00:00, ?it/s]

step length:  tensor([11.1458])
Epoch: 184, Step: 1, Reward: 1069.0, Loss: -1069.0


1it [00:00, 19.27it/s]
1it [00:00, 22.13it/s]


step length:  tensor([10.5650])
Epoch: 185, Step: 1, Reward: 975.0, Loss: -975.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.09it/s]


tensor([10.0736])
Epoch: 186, Step: 1, Reward: 1209.0, Loss: -1209.0


1it [00:00, 15.74it/s]


step length:  tensor([10.2906])
Epoch: 187, Step: 1, Reward: 1020.0, Loss: -1020.0


0it [00:00, ?it/s]

step length:  tensor([9.3379])
Epoch: 188, Step: 1, Reward: 858.0, Loss: -858.0


1it [00:00, 19.21it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7025])
Epoch: 189, Step: 1, Reward: 1257.0, Loss: -1257.0


1it [00:00, 19.15it/s]
1it [00:00, 22.06it/s]


step length:  tensor([10.3797])
Epoch: 190, Step: 1, Reward: 923.0, Loss: -923.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.38it/s]


 tensor([9.7991])
Epoch: 191, Step: 1, Reward: 977.0, Loss: -977.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 16.31it/s]


tensor([11.0115])
Epoch: 192, Step: 1, Reward: 913.0, Loss: -913.0


0it [00:00, ?it/s]

step length:  tensor([11.0336])
Epoch: 193, Step: 1, Reward: 1255.0, Loss: -1255.0


1it [00:00, 19.36it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7777])
Epoch: 194, Step: 1, Reward: 1073.0, Loss: -1073.0


1it [00:00, 20.79it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9446])
Epoch: 195, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 20.02it/s]
1it [00:00, 18.69it/s]


step length:  tensor([9.8283])
Epoch: 196, Step: 1, Reward: 995.0, Loss: -995.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.65it/s]


 tensor([10.5539])
Epoch: 197, Step: 1, Reward: 967.0, Loss: -967.0


0it [00:00, ?it/s]

step length:  tensor([10.4435])
Epoch: 198, Step: 1, Reward: 933.0, Loss: -933.0


1it [00:00, 20.94it/s]
1it [00:00, 20.48it/s]


step length:  tensor([10.2695])
Epoch: 199, Step: 1, Reward: 857.0, Loss: -857.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.83it/s]


tensor([10.0235])
Epoch: 200, Step: 1, Reward: 1178.0, Loss: -1178.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.27it/s]


tensor([10.5431])
Epoch: 201, Step: 1, Reward: 1252.0, Loss: -1252.0


0it [00:00, ?it/s]

step length:  tensor([11.2543])


1it [00:00, 17.31it/s]


Epoch: 202, Step: 1, Reward: 876.0, Loss: -876.0


0it [00:00, ?it/s]

step length:  tensor([10.4769])


1it [00:00, 16.76it/s]


Epoch: 203, Step: 1, Reward: 1034.0, Loss: -1034.0


0it [00:00, ?it/s]

step length:  tensor([10.7777])
Epoch: 204, Step: 1, Reward: 759.0, Loss: -759.0


1it [00:00, 23.55it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1160])
Epoch: 205, Step: 1, Reward: 873.0, Loss: -873.0


1it [00:00, 20.15it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9770])


1it [00:00, 20.69it/s]


Epoch: 206, Step: 1, Reward: 1067.0, Loss: -1067.0


0it [00:00, ?it/s]

step length:  tensor([10.5492])
Epoch: 207, Step: 1, Reward: 844.0, Loss: -844.0


1it [00:00, 18.40it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3251])


1it [00:00, 19.71it/s]


Epoch: 208, Step: 1, Reward: 1151.0, Loss: -1151.0


0it [00:00, ?it/s]

step length:  tensor([10.4733])
Epoch: 209, Step: 1, Reward: 648.0, Loss: -648.0


1it [00:00, 22.21it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 19.03it/s]


 tensor([10.2465])
Epoch: 210, Step: 1, Reward: 1214.0, Loss: -1214.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.73it/s]


tensor([10.7204])
Epoch: 211, Step: 1, Reward: 732.0, Loss: -732.0


0it [00:00, ?it/s]

step length:  tensor([10.7091])
Epoch: 212, Step: 1, Reward: 1163.0, Loss: -1163.0


1it [00:00, 20.94it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 29.67it/s]


 tensor([9.8709])
Epoch: 213, Step: 1, Reward: 1075.0, Loss: -1075.0


0it [00:00, ?it/s]

step length:  tensor([10.7261])


1it [00:00, 14.70it/s]


Epoch: 214, Step: 1, Reward: 1009.0, Loss: -1009.0


1it [00:00, 21.10it/s]


step length:  tensor([10.9066])
Epoch: 215, Step: 1, Reward: 939.0, Loss: -939.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.26it/s]

tensor([10.4322])
Epoch: 216, Step: 1, Reward: 871.0, Loss: -871.0



1it [00:00, 28.51it/s]


step length:  tensor([10.9887])
Epoch: 217, Step: 1, Reward: 1003.0, Loss: -1003.0


0it [00:00, ?it/s]

step length:  tensor([10.1653])
Epoch: 218, Step: 1, Reward: 1412.0, Loss: -1412.0


1it [00:00, 18.88it/s]
0it [00:00, ?it/s]

step length:  tensor([9.4838])
Epoch: 219, Step: 1, Reward: 807.0, Loss: -807.0


1it [00:00, 20.73it/s]
1it [00:00, 28.49it/s]


step length:  tensor([10.1120])
Epoch: 220, Step: 1, Reward: 780.0, Loss: -780.0


0it [00:00, ?it/s]

step length:  tensor([10.3788])


1it [00:00, 20.37it/s]


Epoch: 221, Step: 1, Reward: 1025.0, Loss: -1025.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 30.45it/s]


tensor([10.2848])
Epoch: 222, Step: 1, Reward: 964.0, Loss: -964.0


0it [00:00, ?it/s]

step length:  tensor([10.5737])


1it [00:00, 24.93it/s]


Epoch: 223, Step: 1, Reward: 827.0, Loss: -827.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.10it/s]


 tensor([10.6294])
Epoch: 224, Step: 1, Reward: 1081.0, Loss: -1081.0


1it [00:00, 20.26it/s]

step length:  tensor([10.7948])
Epoch: 225, Step: 1, Reward: 1302.0, Loss: -1302.0



0it [00:00, ?it/s]

step length:  tensor([10.8819])
Epoch: 226, Step: 1, Reward: 1081.0, Loss: -1081.0


1it [00:00, 19.95it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5559])


1it [00:00, 20.54it/s]


Epoch: 227, Step: 1, Reward: 917.0, Loss: -917.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 28.42it/s]


tensor([9.9542])
Epoch: 228, Step: 1, Reward: 844.0, Loss: -844.0


1it [00:00, 15.64it/s]


step length:  tensor([11.0353])
Epoch: 229, Step: 1, Reward: 997.0, Loss: -997.0


0it [00:00, ?it/s]

step length:  tensor([10.3143])
Epoch: 230, Step: 1, Reward: 1120.0, Loss: -1120.0


1it [00:00, 18.28it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 22.32it/s]

 tensor([10.8412])
Epoch: 231, Step: 1, Reward: 836.0, Loss: -836.0



1it [00:00, 25.47it/s]


step length:  tensor([10.6173])
Epoch: 232, Step: 1, Reward: 653.0, Loss: -653.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 28.98it/s]


 tensor([9.9424])
Epoch: 233, Step: 1, Reward: 960.0, Loss: -960.0


0it [00:00, ?it/s]

step length:  tensor([9.7569])


1it [00:00, 21.39it/s]


Epoch: 234, Step: 1, Reward: 971.0, Loss: -971.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.03it/s]


tensor([10.4207])
Epoch: 235, Step: 1, Reward: 932.0, Loss: -932.0


0it [00:00, ?it/s]

step length:  tensor([10.9286])


1it [00:00, 20.02it/s]


Epoch: 236, Step: 1, Reward: 1117.0, Loss: -1117.0


0it [00:00, ?it/s]

step length:  tensor([10.1824])
Epoch: 237, Step: 1, Reward: 1132.0, Loss: -1132.0


1it [00:00, 19.59it/s]
0it [00:00, ?it/s]

step length:  tensor([11.1281])
Epoch: 238, Step: 1, Reward: 946.0, Loss: -946.0


1it [00:00, 15.32it/s]
0it [00:00, ?it/s]

step length:  tensor([11.2038])
Epoch: 239, Step: 1, Reward: 897.0, Loss: -897.0


1it [00:00, 19.84it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 17.42it/s]


tensor([10.6561])
Epoch: 240, Step: 1, Reward: 1044.0, Loss: -1044.0


0it [00:00, ?it/s]

step length:  tensor([10.5916])
Epoch: 241, Step: 1, Reward: 913.0, Loss: -913.0


1it [00:00, 20.66it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0542])
Epoch: 242, Step: 1, Reward: 988.0, Loss: -988.0


1it [00:00, 17.89it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2384])
Epoch: 243, Step: 1, Reward: 1024.0, Loss: -1024.0


1it [00:00, 20.70it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7768])
Epoch: 244, Step: 1, Reward: 899.0, Loss: -899.0


1it [00:00, 20.54it/s]
1it [00:00, 20.01it/s]


step length:  tensor([10.7952])
Epoch: 245, Step: 1, Reward: 827.0, Loss: -827.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.48it/s]


 tensor([10.6462])
Epoch: 246, Step: 1, Reward: 1072.0, Loss: -1072.0


0it [00:00, ?it/s]

step length:  tensor([10.6972])
Epoch: 247, Step: 1, Reward: 895.0, Loss: -895.0


1it [00:00, 20.44it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4784])
Epoch: 248, Step: 1, Reward: 1040.0, Loss: -1040.0


1it [00:00, 20.57it/s]
1it [00:00, 27.38it/s]


step length:  tensor([10.8983])
Epoch: 249, Step: 1, Reward: 974.0, Loss: -974.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.49it/s]


tensor([10.2879])
Epoch: 250, Step: 1, Reward: 910.0, Loss: -910.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.91it/s]


tensor([10.1805])
Epoch: 251, Step: 1, Reward: 1302.0, Loss: -1302.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 28.65it/s]


 tensor([10.0176])
Epoch: 252, Step: 1, Reward: 1034.0, Loss: -1034.0


0it [00:00, ?it/s]

step length:  tensor([9.7711])
Epoch: 253, Step: 1, Reward: 962.0, Loss: -962.0


1it [00:00, 18.50it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0580])
Epoch: 254, Step: 1, Reward: 794.0, Loss: -794.0


1it [00:00, 21.66it/s]
1it [00:00, 20.97it/s]


step length:  tensor([10.5084])
Epoch: 255, Step: 1, Reward: 1015.0, Loss: -1015.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 29.06it/s]


tensor([10.0433])
Epoch: 256, Step: 1, Reward: 984.0, Loss: -984.0


0it [00:00, ?it/s]

step length:  tensor([10.8833])


1it [00:00, 19.37it/s]


Epoch: 257, Step: 1, Reward: 1075.0, Loss: -1075.0


0it [00:00, ?it/s]

step length:  tensor([9.7915])


1it [00:00, 20.59it/s]


Epoch: 258, Step: 1, Reward: 796.0, Loss: -796.0


0it [00:00, ?it/s]

step length:  tensor([10.2010])


1it [00:00, 20.14it/s]


Epoch: 259, Step: 1, Reward: 847.0, Loss: -847.0


1it [00:00, 19.87it/s]


step length:  tensor([10.1922])
Epoch: 260, Step: 1, Reward: 894.0, Loss: -894.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 29.13it/s]


tensor([10.4692])
Epoch: 261, Step: 1, Reward: 1197.0, Loss: -1197.0


0it [00:00, ?it/s]

step length:  tensor([10.5202])


1it [00:00, 20.07it/s]


Epoch: 262, Step: 1, Reward: 841.0, Loss: -841.0


0it [00:00, ?it/s]

step length:  tensor([10.6979])
Epoch: 263, Step: 1, Reward: 1113.0, Loss: -1113.0


1it [00:00, 18.08it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4914])
Epoch: 264, Step: 1, Reward: 896.0, Loss: -896.0


1it [00:00, 21.92it/s]
1it [00:00, 27.35it/s]


step length:  tensor([10.8369])
Epoch: 265, Step: 1, Reward: 1008.0, Loss: -1008.0


0it [00:00, ?it/s]

step length:  tensor([10.2233])
Epoch: 266, Step: 1, Reward: 1219.0, Loss: -1219.0


1it [00:00, 20.28it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 20.13it/s]


tensor([10.0746])
Epoch: 267, Step: 1, Reward: 1194.0, Loss: -1194.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.86it/s]


 tensor([10.8525])
Epoch: 268, Step: 1, Reward: 1161.0, Loss: -1161.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.09it/s]


 tensor([10.1223])
Epoch: 269, Step: 1, Reward: 919.0, Loss: -919.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.15it/s]


 tensor([10.5658])
Epoch: 270, Step: 1, Reward: 848.0, Loss: -848.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.20it/s]


tensor([10.8886])
Epoch: 271, Step: 1, Reward: 1298.0, Loss: -1298.0


0it [00:00, ?it/s]

step length:  tensor([10.8234])


1it [00:00, 18.40it/s]


Epoch: 272, Step: 1, Reward: 970.0, Loss: -970.0


0it [00:00, ?it/s]

step length:  tensor([9.8726])


1it [00:00, 21.76it/s]


Epoch: 273, Step: 1, Reward: 769.0, Loss: -769.0


0it [00:00, ?it/s]

step length:  tensor([10.0912])


1it [00:00, 19.48it/s]


Epoch: 274, Step: 1, Reward: 925.0, Loss: -925.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.66it/s]


 tensor([10.3359])
Epoch: 275, Step: 1, Reward: 994.0, Loss: -994.0


0it [00:00, ?it/s]

step length:  tensor([10.1234])
Epoch: 276, Step: 1, Reward: 885.0, Loss: -885.0


1it [00:00, 20.07it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8024])
Epoch: 277, Step: 1, Reward: 869.0, Loss: -869.0

1it [00:00, 19.81it/s]





0it [00:00, ?it/s]

step length:  tensor([9.7980])


1it [00:00, 18.63it/s]


Epoch: 278, Step: 1, Reward: 1044.0, Loss: -1044.0


0it [00:00, ?it/s]

step length:  tensor([10.4436])
Epoch: 279, Step: 1, Reward: 924.0, Loss: -924.0


1it [00:00, 22.10it/s]
1it [00:00, 19.22it/s]


step length:  tensor([10.5016])
Epoch: 280, Step: 1, Reward: 1012.0, Loss: -1012.0


0it [00:00, ?it/s]

step length:  tensor([10.4083])
Epoch: 281, Step: 1, Reward: 900.0, Loss: -900.0


1it [00:00, 20.37it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7674])
Epoch: 282, Step: 1, Reward: 991.0, Loss: -991.0


1it [00:00, 20.47it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 19.54it/s]


tensor([10.3167])
Epoch: 283, Step: 1, Reward: 1110.0, Loss: -1110.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.97it/s]


 tensor([10.9754])
Epoch: 284, Step: 1, Reward: 997.0, Loss: -997.0


0it [00:00, ?it/s]

step length:  tensor([10.4137])


1it [00:00, 19.27it/s]


Epoch: 285, Step: 1, Reward: 1004.0, Loss: -1004.0


1it [00:00, 22.07it/s]


step length:  tensor([10.3055])
Epoch: 286, Step: 1, Reward: 1127.0, Loss: -1127.0


0it [00:00, ?it/s]

step length:  tensor([10.5107])
Epoch: 287, Step: 1, Reward: 786.0, Loss: -786.0


1it [00:00, 20.54it/s]
1it [00:00, 29.07it/s]


step length:  tensor([10.1801])
Epoch: 288, Step: 1, Reward: 899.0, Loss: -899.0


0it [00:00, ?it/s]

step length:  tensor([10.2744])
Epoch: 289, Step: 1, Reward: 950.0, Loss: -950.0


1it [00:00, 19.22it/s]
1it [00:00, 17.12it/s]


step length:  tensor([10.3401])
Epoch: 290, Step: 1, Reward: 1111.0, Loss: -1111.0


0it [00:00, ?it/s]

step length:  tensor([10.0873])
Epoch: 291, Step: 1, Reward: 898.0, Loss: -898.0


1it [00:00, 16.77it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7130])
Epoch: 292, Step: 1, Reward: 1075.0, Loss: -1075.0


1it [00:00, 23.95it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8134])
Epoch: 293, Step: 1, Reward: 1020.0, Loss: -1020.0


1it [00:00, 20.27it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0314])
Epoch: 294, Step: 1, Reward: 866.0, Loss: -866.0


1it [00:00, 16.96it/s]
1it [00:00, 23.77it/s]


step length:  tensor([10.1247])
Epoch: 295, Step: 1, Reward: 1028.0, Loss: -1028.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.79it/s]


 tensor([10.3286])
Epoch: 296, Step: 1, Reward: 873.0, Loss: -873.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.70it/s]


 tensor([10.5609])
Epoch: 297, Step: 1, Reward: 926.0, Loss: -926.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 18.99it/s]


 tensor([10.9118])
Epoch: 298, Step: 1, Reward: 972.0, Loss: -972.0


0it [00:00, ?it/s]

step length:  tensor([10.5707])
Epoch: 299, Step: 1, Reward: 967.0, Loss: -967.0


1it [00:00, 18.29it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8455])
Epoch: 300, Step: 1, Reward: 833.0, Loss: -833.0


1it [00:00, 19.96it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5726])


1it [00:00, 26.75it/s]


Epoch: 301, Step: 1, Reward: 945.0, Loss: -945.0


0it [00:00, ?it/s]

step length:  tensor([11.1909])
Epoch: 302, Step: 1, Reward: 1451.0, Loss: -1451.0


1it [00:00, 14.36it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5475])
Epoch: 303, Step: 1, Reward: 973.0, Loss: -973.0


1it [00:00, 19.51it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3454])
Epoch: 304, Step: 1, Reward: 862.0, Loss: -862.0


1it [00:00, 23.34it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 21.28it/s]


 tensor([10.6086])
Epoch: 305, Step: 1, Reward: 973.0, Loss: -973.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.21it/s]


 tensor([9.8335])
Epoch: 306, Step: 1, Reward: 812.0, Loss: -812.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 29.08it/s]


 tensor([10.3881])
Epoch: 307, Step: 1, Reward: 915.0, Loss: -915.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 25.69it/s]


 tensor([10.1040])
Epoch: 308, Step: 1, Reward: 889.0, Loss: -889.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.08it/s]


tensor([10.6072])
Epoch: 309, Step: 1, Reward: 999.0, Loss: -999.0


0it [00:00, ?it/s]

step length:  tensor([10.5025])


1it [00:00, 19.93it/s]


Epoch: 310, Step: 1, Reward: 866.0, Loss: -866.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 15.48it/s]


tensor([9.8523])
Epoch: 311, Step: 1, Reward: 980.0, Loss: -980.0


1it [00:00, 29.04it/s]


step length:  tensor([9.6686])
Epoch: 312, Step: 1, Reward: 1141.0, Loss: -1141.0


1it [00:00, 18.09it/s]


step length:  tensor([10.1250])
Epoch: 313, Step: 1, Reward: 1035.0, Loss: -1035.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 22.97it/s]


tensor([10.2788])
Epoch: 314, Step: 1, Reward: 1124.0, Loss: -1124.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 29.09it/s]

tensor([10.8631])
Epoch: 315, Step: 1, Reward: 1127.0, Loss: -1127.0



1it [00:00, 20.33it/s]


step length:  tensor([10.4156])
Epoch: 316, Step: 1, Reward: 874.0, Loss: -874.0


1it [00:00, 29.07it/s]


step length:  tensor([10.8634])
Epoch: 317, Step: 1, Reward: 1188.0, Loss: -1188.0


1it [00:00, 20.14it/s]


step length:  tensor([10.3302])
Epoch: 318, Step: 1, Reward: 703.0, Loss: -703.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.73it/s]


tensor([10.3069])
Epoch: 319, Step: 1, Reward: 941.0, Loss: -941.0


0it [00:00, ?it/s]

step length:  tensor([10.4353])


1it [00:00, 20.13it/s]


Epoch: 320, Step: 1, Reward: 791.0, Loss: -791.0


0it [00:00, ?it/s]

step length:  tensor([10.3374])


1it [00:00, 19.89it/s]


Epoch: 321, Step: 1, Reward: 1161.0, Loss: -1161.0


0it [00:00, ?it/s]

step length:  tensor([10.4175])


1it [00:00, 19.59it/s]


Epoch: 322, Step: 1, Reward: 974.0, Loss: -974.0


0it [00:00, ?it/s]

step length:  tensor([9.9741])


1it [00:00, 20.76it/s]


Epoch: 323, Step: 1, Reward: 949.0, Loss: -949.0


0it [00:00, ?it/s]

step length:  tensor([10.2076])
Epoch: 324, Step: 1, Reward: 971.0, Loss: -971.0


1it [00:00, 15.64it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7423])
Epoch: 325, Step: 1, Reward: 870.0, Loss: -870.0


1it [00:00, 20.83it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9621])
Epoch: 326, Step: 1, Reward: 1061.0, Loss: -1061.0


1it [00:00, 27.61it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7901])


1it [00:00, 20.27it/s]


Epoch: 327, Step: 1, Reward: 1034.0, Loss: -1034.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.15it/s]


tensor([9.7132])
Epoch: 328, Step: 1, Reward: 939.0, Loss: -939.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.35it/s]


tensor([10.4014])
Epoch: 329, Step: 1, Reward: 958.0, Loss: -958.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.28it/s]


 tensor([10.5164])
Epoch: 330, Step: 1, Reward: 990.0, Loss: -990.0


0it [00:00, ?it/s]

step length:  tensor([9.9998])
Epoch: 331, Step: 1, Reward: 766.0, Loss: -766.0


1it [00:00, 20.36it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5959])
Epoch: 332, Step: 1, Reward: 922.0, Loss: -922.0


1it [00:00, 17.67it/s]
1it [00:00, 24.72it/s]


step length:  tensor([10.2661])
Epoch: 333, Step: 1, Reward: 928.0, Loss: -928.0


0it [00:00, ?it/s]

step length:  tensor([9.7875])
Epoch: 334, Step: 1, Reward: 1184.0, Loss: -1184.0


1it [00:00, 18.92it/s]
1it [00:00, 19.96it/s]


step length:  tensor([10.4668])
Epoch: 335, Step: 1, Reward: 1116.0, Loss: -1116.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.38it/s]


tensor([10.7409])
Epoch: 336, Step: 1, Reward: 1073.0, Loss: -1073.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 29.16it/s]


 tensor([10.1628])
Epoch: 337, Step: 1, Reward: 1075.0, Loss: -1075.0


0it [00:00, ?it/s]

step length:  tensor([9.9335])
Epoch: 338, Step: 1, Reward: 1075.0, Loss: -1075.0


1it [00:00, 18.66it/s]
1it [00:00, 21.65it/s]


step length:  tensor([10.0792])
Epoch: 339, Step: 1, Reward: 857.0, Loss: -857.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.06it/s]


tensor([9.4370])
Epoch: 340, Step: 1, Reward: 777.0, Loss: -777.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.37it/s]


tensor([10.5678])
Epoch: 341, Step: 1, Reward: 1266.0, Loss: -1266.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.05it/s]


tensor([10.0485])
Epoch: 342, Step: 1, Reward: 958.0, Loss: -958.0


0it [00:00, ?it/s]

step length:  tensor([10.7452])
Epoch: 343, Step: 1, Reward: 940.0, Loss: -940.0


1it [00:00, 16.00it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4753])
Epoch: 344, Step: 1, Reward: 1014.0, Loss: -1014.0


1it [00:00, 28.18it/s]
1it [00:00, 19.86it/s]


step length:  tensor([10.2806])
Epoch: 345, Step: 1, Reward: 1158.0, Loss: -1158.0


0it [00:00, ?it/s]

step length:  tensor([10.7978])


1it [00:00, 19.52it/s]


Epoch: 346, Step: 1, Reward: 986.0, Loss: -986.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.77it/s]


 tensor([11.4204])
Epoch: 347, Step: 1, Reward: 1000.0, Loss: -1000.0


1it [00:00, 20.16it/s]


step length:  tensor([10.2282])
Epoch: 348, Step: 1, Reward: 1446.0, Loss: -1446.0


0it [00:00, ?it/s]

step length:  tensor([10.0194])


1it [00:00, 19.00it/s]


Epoch: 349, Step: 1, Reward: 652.0, Loss: -652.0


0it [00:00, ?it/s]

step length:  tensor([10.2270])
Epoch: 350, Step: 1, Reward: 1077.0, Loss: -1077.0


1it [00:00, 21.91it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 18.83it/s]


tensor([10.4911])
Epoch: 351, Step: 1, Reward: 1168.0, Loss: -1168.0


0it [00:00, ?it/s]

step length:  tensor([9.7207])
Epoch: 352, Step: 1, Reward: 651.0, Loss: -651.0


1it [00:00, 19.78it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3670])
Epoch: 353, Step: 1, Reward: 1046.0, Loss: -1046.0


1it [00:00, 32.44it/s]
1it [00:00, 19.87it/s]


step length:  tensor([11.0973])
Epoch: 354, Step: 1, Reward: 991.0, Loss: -991.0


1it [00:00, 22.92it/s]


step length:  tensor([10.0566])
Epoch: 355, Step: 1, Reward: 1278.0, Loss: -1278.0


0it [00:00, ?it/s]

step length:  tensor([10.0316])
Epoch: 356, Step: 1, Reward: 808.0, Loss: -808.0


1it [00:00, 15.80it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6893])
Epoch: 357, Step: 1, Reward: 1205.0, Loss: -1205.0


1it [00:00, 24.21it/s]
1it [00:00, 21.14it/s]


step length:  tensor([10.5030])
Epoch: 358, Step: 1, Reward: 939.0, Loss: -939.0


0it [00:00, ?it/s]

step length:  tensor([10.6214])
Epoch: 359, Step: 1, Reward: 1092.0, Loss: -1092.0


1it [00:00, 17.99it/s]
1it [00:00, 20.35it/s]


step length:  tensor([9.9204])
Epoch: 360, Step: 1, Reward: 1173.0, Loss: -1173.0


0it [00:00, ?it/s]

step length:  tensor([10.2065])
Epoch: 361, Step: 1, Reward: 1094.0, Loss: -1094.0


1it [00:00, 18.54it/s]
1it [00:00, 23.19it/s]


step length:  tensor([10.5140])
Epoch: 362, Step: 1, Reward: 1161.0, Loss: -1161.0


0it [00:00, ?it/s]

step length:  tensor([10.7100])
Epoch: 363, Step: 1, Reward: 1197.0, Loss: -1197.0


1it [00:00, 15.75it/s]
1it [00:00, 26.55it/s]


step length:  tensor([10.0842])
Epoch: 364, Step: 1, Reward: 695.0, Loss: -695.0


1it [00:00, 26.91it/s]


step length:  tensor([9.7768])
Epoch: 365, Step: 1, Reward: 855.0, Loss: -855.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.20it/s]


 tensor([10.6953])
Epoch: 366, Step: 1, Reward: 923.0, Loss: -923.0


0it [00:00, ?it/s]

step length:  tensor([9.9012])
Epoch: 367, Step: 1, Reward: 1290.0, Loss: -1290.0


1it [00:00, 15.18it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7605])


1it [00:00, 19.63it/s]


Epoch: 368, Step: 1, Reward: 913.0, Loss: -913.0


0it [00:00, ?it/s]

step length:  tensor([10.7842])
Epoch: 369, Step: 1, Reward: 1121.0, Loss: -1121.0


1it [00:00, 19.13it/s]
1it [00:00, 23.48it/s]


step length:  tensor([11.0517])
Epoch: 370, Step: 1, Reward: 1037.0, Loss: -1037.0


0it [00:00, ?it/s]

step length:  tensor([10.5983])
Epoch: 371, Step: 1, Reward: 969.0, Loss: -969.0


1it [00:00, 18.18it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8564])


1it [00:00, 22.26it/s]


Epoch: 372, Step: 1, Reward: 748.0, Loss: -748.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.96it/s]


 tensor([9.6832])
Epoch: 373, Step: 1, Reward: 815.0, Loss: -815.0


1it [00:00, 14.99it/s]

step length:  tensor([10.9581])
Epoch: 374, Step: 1, Reward: 1356.0, Loss: -1356.0



0it [00:00, ?it/s]

step length:  tensor([10.7054])


1it [00:00, 20.07it/s]


Epoch: 375, Step: 1, Reward: 971.0, Loss: -971.0


0it [00:00, ?it/s]

step length:  tensor([10.3282])


1it [00:00, 20.15it/s]


Epoch: 376, Step: 1, Reward: 898.0, Loss: -898.0


0it [00:00, ?it/s]

step length:  tensor([10.7400])
Epoch: 377, Step: 1, Reward: 939.0, Loss: -939.0


1it [00:00, 20.52it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3438])
Epoch: 378, Step: 1, Reward: 1110.0, Loss: -1110.0


1it [00:00, 19.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2665])


1it [00:00, 20.11it/s]


Epoch: 379, Step: 1, Reward: 866.0, Loss: -866.0


1it [00:00, 30.18it/s]


step length:  tensor([9.8626])
Epoch: 380, Step: 1, Reward: 740.0, Loss: -740.0


0it [00:00, ?it/s]

step length:  tensor([10.8253])
Epoch: 381, Step: 1, Reward: 810.0, Loss: -810.0

1it [00:00, 14.74it/s]





0it [00:00, ?it/s]

step length:  tensor([10.6569])


1it [00:00, 21.73it/s]


Epoch: 382, Step: 1, Reward: 996.0, Loss: -996.0


0it [00:00, ?it/s]

step length:  tensor([11.2367])


1it [00:00, 19.94it/s]


Epoch: 383, Step: 1, Reward: 804.0, Loss: -804.0


0it [00:00, ?it/s]

step length:  tensor([10.0469])
Epoch: 384, Step: 1, Reward: 926.0, Loss: -926.0


1it [00:00, 28.73it/s]
1it [00:00, 20.63it/s]

step length:  tensor([9.8241])
Epoch: 385, Step: 1, Reward: 1093.0, Loss: -1093.0



0it [00:00, ?it/s]

step length:  tensor([10.4958])
Epoch: 386, Step: 1, Reward: 967.0, Loss: -967.0


1it [00:00, 18.84it/s]
0it [00:00, ?it/s]

step length:  tensor([9.6730])


1it [00:00, 20.30it/s]


Epoch: 387, Step: 1, Reward: 1036.0, Loss: -1036.0


0it [00:00, ?it/s]

step length:  tensor([9.5504])


1it [00:00, 21.66it/s]


Epoch: 388, Step: 1, Reward: 1014.0, Loss: -1014.0


1it [00:00, 19.39it/s]


step length:  tensor([10.5321])
Epoch: 389, Step: 1, Reward: 865.0, Loss: -865.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.81it/s]


tensor([10.8640])
Epoch: 390, Step: 1, Reward: 824.0, Loss: -824.0


0it [00:00, ?it/s]

step length:  tensor([10.8101])
Epoch: 391, Step: 1, Reward: 1163.0, Loss: -1163.0


1it [00:00, 20.12it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7289])
Epoch: 392, Step: 1, Reward: 1055.0, Loss: -1055.0


1it [00:00, 20.18it/s]
1it [00:00, 25.41it/s]

step length:  tensor([10.0940])
Epoch: 393, Step: 1, Reward: 899.0, Loss: -899.0



0it [00:00, ?it/s]

step length:  tensor([9.5789])


1it [00:00, 20.85it/s]


Epoch: 394, Step: 1, Reward: 988.0, Loss: -988.0


1it [00:00, 21.80it/s]


step length:  tensor([10.4726])
Epoch: 395, Step: 1, Reward: 1085.0, Loss: -1085.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.16it/s]


 tensor([10.5533])
Epoch: 396, Step: 1, Reward: 1010.0, Loss: -1010.0


0it [00:00, ?it/s]

step length:  tensor([10.2575])


1it [00:00, 21.47it/s]


Epoch: 397, Step: 1, Reward: 1222.0, Loss: -1222.0


0it [00:00, ?it/s]

step length:  tensor([10.3049])


1it [00:00, 18.47it/s]


Epoch: 398, Step: 1, Reward: 836.0, Loss: -836.0


1it [00:00, 20.72it/s]


step length:  tensor([10.3207])
Epoch: 399, Step: 1, Reward: 1301.0, Loss: -1301.0


0it [00:00, ?it/s]

step length:  tensor([11.3058])
Epoch: 400, Step: 1, Reward: 948.0, Loss: -948.0


1it [00:00, 18.54it/s]
0it [00:00, ?it/s]

step length:  tensor([11.2293])
Epoch: 401, Step: 1, Reward: 1177.0, Loss: -1177.0


1it [00:00, 22.35it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0047])
Epoch: 402, Step: 1, Reward: 964.0, Loss: -964.0


1it [00:00, 15.20it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6297])


1it [00:00, 20.29it/s]


Epoch: 403, Step: 1, Reward: 1012.0, Loss: -1012.0


0it [00:00, ?it/s]

step length:  tensor([10.3373])


1it [00:00, 19.84it/s]


Epoch: 404, Step: 1, Reward: 941.0, Loss: -941.0


1it [00:00, 20.09it/s]


step length:  tensor([10.0779])
Epoch: 405, Step: 1, Reward: 867.0, Loss: -867.0


0it [00:00, ?it/s]

step length:  tensor([11.1047])
Epoch: 406, Step: 1, Reward: 905.0, Loss: -905.0


1it [00:00, 19.71it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2943])
Epoch: 407, Step: 1, Reward: 906.0, Loss: -906.0


1it [00:00, 30.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1937])
Epoch: 408, Step: 1, Reward: 962.0, Loss: -962.0


1it [00:00, 20.75it/s]
1it [00:00, 18.26it/s]


step length:  tensor([10.5396])
Epoch: 409, Step: 1, Reward: 929.0, Loss: -929.0


1it [00:00, 21.81it/s]


step length:  tensor([9.8486])
Epoch: 410, Step: 1, Reward: 919.0, Loss: -919.0


1it [00:00, 18.08it/s]


step length:  tensor([9.9318])
Epoch: 411, Step: 1, Reward: 975.0, Loss: -975.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 22.31it/s]


 tensor([10.4406])
Epoch: 412, Step: 1, Reward: 912.0, Loss: -912.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 25.52it/s]


 tensor([10.3915])
Epoch: 413, Step: 1, Reward: 1022.0, Loss: -1022.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.46it/s]


tensor([10.9720])
Epoch: 414, Step: 1, Reward: 1019.0, Loss: -1019.0


1it [00:00, 20.38it/s]


step length:  tensor([10.0381])
Epoch: 415, Step: 1, Reward: 1014.0, Loss: -1014.0


0it [00:00, ?it/s]

step length:  tensor([10.3445])


1it [00:00, 18.15it/s]


Epoch: 416, Step: 1, Reward: 995.0, Loss: -995.0


0it [00:00, ?it/s]

step length:  tensor([10.1304])
Epoch: 417, Step: 1, Reward: 1199.0, Loss: -1199.0


1it [00:00, 22.22it/s]
1it [00:00, 20.56it/s]

step length:  tensor([10.5507])
Epoch: 418, Step: 1, Reward: 1125.0, Loss: -1125.0



0it [00:00, ?it/s]

step length:  tensor([10.4401])
Epoch: 419, Step: 1, Reward: 849.0, Loss: -849.0


1it [00:00, 23.18it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4489])
Epoch: 420, Step: 1, Reward: 866.0, Loss: -866.0


1it [00:00, 25.36it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 18.05it/s]


 tensor([10.5829])
Epoch: 421, Step: 1, Reward: 970.0, Loss: -970.0


0it [00:00, ?it/s]

step length:  tensor([9.9289])


1it [00:00, 15.85it/s]


Epoch: 422, Step: 1, Reward: 1343.0, Loss: -1343.0


0it [00:00, ?it/s]

step length:  tensor([10.4704])


1it [00:00, 21.66it/s]


Epoch: 423, Step: 1, Reward: 887.0, Loss: -887.0


0it [00:00, ?it/s]

step length:  tensor([10.5493])
Epoch: 424, Step: 1, Reward: 1099.0, Loss: -1099.0


1it [00:00, 23.49it/s]
1it [00:00, 17.55it/s]


step length:  tensor([10.7077])
Epoch: 425, Step: 1, Reward: 823.0, Loss: -823.0


0it [00:00, ?it/s]

step length:  tensor([9.6624])


1it [00:00, 19.97it/s]


Epoch: 426, Step: 1, Reward: 862.0, Loss: -862.0


0it [00:00, ?it/s]

step length:  tensor([10.5355])
Epoch: 427, Step: 1, Reward: 1023.0, Loss: -1023.0


1it [00:00, 18.17it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5139])
Epoch: 428, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 20.63it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0212])
Epoch: 429, Step: 1, Reward: 1058.0, Loss: -1058.0


1it [00:00, 16.27it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8525])


1it [00:00, 20.40it/s]


Epoch: 430, Step: 1, Reward: 1221.0, Loss: -1221.0


0it [00:00, ?it/s]

step length:  tensor([10.1856])
Epoch: 431, Step: 1, Reward: 1037.0, Loss: -1037.0


1it [00:00, 18.82it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1299])
Epoch: 432, Step: 1, Reward: 1079.0, Loss: -1079.0


1it [00:00, 21.23it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2241])
Epoch: 433, Step: 1, Reward: 1064.0, Loss: -1064.0


1it [00:00, 18.08it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9132])


1it [00:00, 20.12it/s]


Epoch: 434, Step: 1, Reward: 784.0, Loss: -784.0


1it [00:00, 21.29it/s]


step length:  tensor([10.0361])
Epoch: 435, Step: 1, Reward: 787.0, Loss: -787.0


1it [00:00, 20.63it/s]

step length:  tensor([10.1871])
Epoch: 436, Step: 1, Reward: 801.0, Loss: -801.0



0it [00:00, ?it/s]

step length:  tensor([10.7924])


1it [00:00, 19.73it/s]


Epoch: 437, Step: 1, Reward: 1202.0, Loss: -1202.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.64it/s]


 tensor([9.9100])
Epoch: 438, Step: 1, Reward: 992.0, Loss: -992.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.40it/s]


 tensor([11.0666])
Epoch: 439, Step: 1, Reward: 1038.0, Loss: -1038.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.97it/s]


 tensor([10.4162])
Epoch: 440, Step: 1, Reward: 906.0, Loss: -906.0


0it [00:00, ?it/s]

step length:  tensor([10.4307])
Epoch: 441, Step: 1, Reward: 1098.0, Loss: -1098.0


1it [00:00, 23.47it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 17.45it/s]


tensor([10.6478])
Epoch: 442, Step: 1, Reward: 738.0, Loss: -738.0


0it [00:00, ?it/s]

step length:  tensor([10.3983])


1it [00:00, 14.96it/s]


Epoch: 443, Step: 1, Reward: 1022.0, Loss: -1022.0


1it [00:00, 30.65it/s]


step length:  tensor([10.2925])
Epoch: 444, Step: 1, Reward: 742.0, Loss: -742.0


0it [00:00, ?it/s]

step length:  tensor([10.7129])
Epoch: 445, Step: 1, Reward: 928.0, Loss: -928.0


1it [00:00, 19.79it/s]
1it [00:00, 20.26it/s]


step length:  tensor([10.2699])
Epoch: 446, Step: 1, Reward: 924.0, Loss: -924.0


0it [00:00, ?it/s]

step length:  tensor([10.2916])
Epoch: 447, Step: 1, Reward: 929.0, Loss: -929.0


1it [00:00, 18.67it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 20.37it/s]


 tensor([9.9639])
Epoch: 448, Step: 1, Reward: 948.0, Loss: -948.0


1it [00:00, 20.77it/s]


step length:  tensor([10.9009])
Epoch: 449, Step: 1, Reward: 930.0, Loss: -930.0


0it [00:00, ?it/s]

step length:  tensor([10.8171])
Epoch: 450, Step: 1, Reward: 836.0, Loss: -836.0

1it [00:00, 19.80it/s]





0it [00:00, ?it/s]

step length:  tensor([10.2129])


1it [00:00, 20.48it/s]


Epoch: 451, Step: 1, Reward: 934.0, Loss: -934.0


1it [00:00, 20.01it/s]


step length:  tensor([10.9924])
Epoch: 452, Step: 1, Reward: 1075.0, Loss: -1075.0


0it [00:00, ?it/s]

step length:  tensor([11.0254])


1it [00:00, 19.04it/s]


Epoch: 453, Step: 1, Reward: 857.0, Loss: -857.0


0it [00:00, ?it/s]

step length:  tensor([10.1755])
Epoch: 454, Step: 1, Reward: 1121.0, Loss: -1121.0


1it [00:00, 20.84it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1523])
Epoch: 455, Step: 1, Reward: 797.0, Loss: -797.0


1it [00:00, 18.49it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8176])


1it [00:00, 21.81it/s]


Epoch: 456, Step: 1, Reward: 1097.0, Loss: -1097.0


0it [00:00, ?it/s]

step length:  tensor([10.8910])
Epoch: 457, Step: 1, Reward: 815.0, Loss: -815.0


1it [00:00, 14.06it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3088])
Epoch: 458, Step: 1, Reward: 1051.0, Loss: -1051.0


1it [00:00, 22.01it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6188])


1it [00:00, 20.05it/s]


Epoch: 459, Step: 1, Reward: 1058.0, Loss: -1058.0


0it [00:00, ?it/s]

step length:  tensor([10.8330])


1it [00:00, 20.06it/s]


Epoch: 460, Step: 1, Reward: 932.0, Loss: -932.0


1it [00:00, 20.13it/s]


step length:  tensor([10.3067])
Epoch: 461, Step: 1, Reward: 1192.0, Loss: -1192.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 29.88it/s]


 tensor([10.3094])
Epoch: 462, Step: 1, Reward: 977.0, Loss: -977.0


0it [00:00, ?it/s]

step length:  tensor([10.3959])
Epoch: 463, Step: 1, Reward: 1168.0, Loss: -1168.0


1it [00:00, 23.89it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3471])
Epoch: 464, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 17.30it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 24.43it/s]


 tensor([10.0110])
Epoch: 465, Step: 1, Reward: 893.0, Loss: -893.0


1it [00:00, 16.75it/s]


step length:  tensor([10.2064])
Epoch: 466, Step: 1, Reward: 1157.0, Loss: -1157.0


0it [00:00, ?it/s]

step length:  tensor([10.1976])
Epoch: 467, Step: 1, Reward: 981.0, Loss: -981.0


1it [00:00, 20.49it/s]
1it [00:00, 19.69it/s]


step length:  tensor([9.7770])
Epoch: 468, Step: 1, Reward: 943.0, Loss: -943.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 30.34it/s]


tensor([11.1881])
Epoch: 469, Step: 1, Reward: 1052.0, Loss: -1052.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.26it/s]


 tensor([9.5956])
Epoch: 470, Step: 1, Reward: 889.0, Loss: -889.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.68it/s]


 tensor([10.5600])
Epoch: 471, Step: 1, Reward: 939.0, Loss: -939.0


1it [00:00, 18.89it/s]


step length:  tensor([10.8951])
Epoch: 472, Step: 1, Reward: 835.0, Loss: -835.0


1it [00:00, 21.40it/s]


step length:  tensor([10.1546])
Epoch: 473, Step: 1, Reward: 847.0, Loss: -847.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 28.84it/s]


tensor([10.0862])
Epoch: 474, Step: 1, Reward: 1297.0, Loss: -1297.0


0it [00:00, ?it/s]

step length:  tensor([10.9493])
Epoch: 475, Step: 1, Reward: 1031.0, Loss: -1031.0


1it [00:00, 17.90it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9274])
Epoch: 476, Step: 1, Reward: 1205.0, Loss: -1205.0


1it [00:00, 21.98it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8338])
Epoch: 477, Step: 1, Reward: 1149.0, Loss: -1149.0


1it [00:00, 18.13it/s]
1it [00:00, 21.71it/s]


step length:  tensor([10.5464])
Epoch: 478, Step: 1, Reward: 1520.0, Loss: -1520.0


1it [00:00, 26.02it/s]


step length:  tensor([11.3121])
Epoch: 479, Step: 1, Reward: 716.0, Loss: -716.0


1it [00:00, 21.45it/s]


step length:  tensor([11.0834])
Epoch: 480, Step: 1, Reward: 794.0, Loss: -794.0


0it [00:00, ?it/s]

step length:  tensor([10.2545])
Epoch: 481, Step: 1, Reward: 897.0, Loss: -897.0


1it [00:00, 19.80it/s]
1it [00:00, 29.66it/s]


step length:  tensor([10.5531])
Epoch: 482, Step: 1, Reward: 1005.0, Loss: -1005.0


1it [00:00, 19.12it/s]


step length:  tensor([11.1228])
Epoch: 483, Step: 1, Reward: 820.0, Loss: -820.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.52it/s]


 tensor([10.6145])
Epoch: 484, Step: 1, Reward: 1010.0, Loss: -1010.0


0it [00:00, ?it/s]

step length:  tensor([10.3156])


1it [00:00, 21.81it/s]


Epoch: 485, Step: 1, Reward: 934.0, Loss: -934.0


1it [00:00, 19.89it/s]


step length:  tensor([10.4733])
Epoch: 486, Step: 1, Reward: 894.0, Loss: -894.0


1it [00:00, 20.19it/s]


step length:  tensor([11.1939])
Epoch: 487, Step: 1, Reward: 1025.0, Loss: -1025.0


0it [00:00, ?it/s]

step length:  tensor([10.7707])
Epoch: 488, Step: 1, Reward: 989.0, Loss: -989.0


1it [00:00, 18.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3397])
Epoch: 489, Step: 1, Reward: 1042.0, Loss: -1042.0


1it [00:00, 21.17it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2887])
Epoch: 490, Step: 1, Reward: 679.0, Loss: -679.0


1it [00:00, 29.95it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 18.59it/s]


 tensor([10.1034])
Epoch: 491, Step: 1, Reward: 957.0, Loss: -957.0


1it [00:00, 21.50it/s]

step length:  tensor([11.1400])
Epoch: 492, Step: 1, Reward: 1040.0, Loss: -1040.0



0it [00:00, ?it/s]

step length:  tensor([10.2130])
Epoch: 493, Step: 1, Reward: 1211.0, Loss: -1211.0


1it [00:00, 19.91it/s]
1it [00:00, 19.65it/s]


step length:  tensor([10.3393])
Epoch: 494, Step: 1, Reward: 874.0, Loss: -874.0


0it [00:00, ?it/s]

step length:  tensor([10.9339])
Epoch: 495, Step: 1, Reward: 1094.0, Loss: -1094.0


1it [00:00, 18.60it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3581])


1it [00:00, 16.18it/s]


Epoch: 496, Step: 1, Reward: 1144.0, Loss: -1144.0


0it [00:00, ?it/s]

step length:  tensor([10.1790])


1it [00:00, 20.27it/s]


Epoch: 497, Step: 1, Reward: 1233.0, Loss: -1233.0


0it [00:00, ?it/s]

step length:  tensor([10.2901])


1it [00:00, 20.47it/s]


Epoch: 498, Step: 1, Reward: 1084.0, Loss: -1084.0


0it [00:00, ?it/s]

step length:  tensor([10.1666])
Epoch: 499, Step: 1, Reward: 979.0, Loss: -979.0


1it [00:00, 18.04it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3570])


1it [00:00, 21.54it/s]


Epoch: 500, Step: 1, Reward: 1099.0, Loss: -1099.0


0it [00:00, ?it/s]

step length:  tensor([10.6587])
Epoch: 501, Step: 1, Reward: 916.0, Loss: -916.0


1it [00:00, 17.93it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5451])
Epoch: 502, Step: 1, Reward: 961.0, Loss: -961.0


1it [00:00, 13.49it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6161])
Epoch: 503, Step: 1, Reward: 973.0, Loss: -973.0


1it [00:00, 26.30it/s]
1it [00:00, 22.04it/s]


step length:  tensor([10.9462])
Epoch: 504, Step: 1, Reward: 971.0, Loss: -971.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 18.82it/s]


tensor([10.8907])
Epoch: 505, Step: 1, Reward: 738.0, Loss: -738.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.27it/s]


tensor([10.5617])
Epoch: 506, Step: 1, Reward: 955.0, Loss: -955.0


0it [00:00, ?it/s]

step length:  tensor([9.9909])


1it [00:00, 20.42it/s]


Epoch: 507, Step: 1, Reward: 1036.0, Loss: -1036.0


0it [00:00, ?it/s]

step length:  tensor([9.8433])


1it [00:00, 22.54it/s]


Epoch: 508, Step: 1, Reward: 822.0, Loss: -822.0


0it [00:00, ?it/s]

step length:  tensor([10.8076])
Epoch: 509, Step: 1, Reward: 851.0, Loss: -851.0


1it [00:00, 16.96it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4338])


1it [00:00, 21.59it/s]


Epoch: 510, Step: 1, Reward: 1094.0, Loss: -1094.0


0it [00:00, ?it/s]

step length:  tensor([10.6328])


1it [00:00, 14.70it/s]


Epoch: 511, Step: 1, Reward: 1085.0, Loss: -1085.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 17.24it/s]


tensor([10.0801])
Epoch: 512, Step: 1, Reward: 1268.0, Loss: -1268.0


0it [00:00, ?it/s]

step length:  tensor([10.7747])


1it [00:00, 23.87it/s]


Epoch: 513, Step: 1, Reward: 826.0, Loss: -826.0


0it [00:00, ?it/s]

step length:  tensor([11.1503])
Epoch: 514, Step: 1, Reward: 1129.0, Loss: -1129.0


1it [00:00, 19.87it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 20.20it/s]


 tensor([10.8366])
Epoch: 515, Step: 1, Reward: 823.0, Loss: -823.0


1it [00:00, 20.00it/s]


step length:  tensor([9.7160])
Epoch: 516, Step: 1, Reward: 756.0, Loss: -756.0


0it [00:00, ?it/s]

step length:  tensor([10.0023])
Epoch: 517, Step: 1, Reward: 887.0, Loss: -887.0


1it [00:00, 17.64it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2738])


1it [00:00, 21.75it/s]

Epoch: 518, Step: 1, Reward: 903.0, Loss: -903.0



0it [00:00, ?it/s]

step length:  tensor([10.6777])


1it [00:00, 21.14it/s]


Epoch: 519, Step: 1, Reward: 961.0, Loss: -961.0


0it [00:00, ?it/s]

step length:  tensor([10.7706])
Epoch: 520, Step: 1, Reward: 1162.0, Loss: -1162.0


1it [00:00, 17.56it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0891])
Epoch: 521, Step: 1, Reward: 768.0, Loss: -768.0


1it [00:00, 23.71it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3851])


1it [00:00, 17.38it/s]


Epoch: 522, Step: 1, Reward: 862.0, Loss: -862.0


0it [00:00, ?it/s]

step length:  tensor([10.7745])
Epoch: 523, Step: 1, Reward: 742.0, Loss: -742.0


1it [00:00, 19.60it/s]
1it [00:00, 21.73it/s]


step length:  tensor([9.5876])
Epoch: 524, Step: 1, Reward: 970.0, Loss: -970.0


0it [00:00, ?it/s]

step length:  tensor([10.7498])
Epoch: 525, Step: 1, Reward: 910.0, Loss: -910.0


1it [00:00, 22.13it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9228])
Epoch: 526, Step: 1, Reward: 1236.0, Loss: -1236.0


1it [00:00, 20.85it/s]
1it [00:00, 15.87it/s]


step length:  tensor([9.9264])
Epoch: 527, Step: 1, Reward: 950.0, Loss: -950.0


0it [00:00, ?it/s]

step length:  tensor([9.8225])


1it [00:00, 20.85it/s]


Epoch: 528, Step: 1, Reward: 846.0, Loss: -846.0


0it [00:00, ?it/s]

step length:  tensor([9.8146])


1it [00:00, 19.74it/s]


Epoch: 529, Step: 1, Reward: 866.0, Loss: -866.0


1it [00:00, 22.70it/s]


step length:  tensor([10.5219])
Epoch: 530, Step: 1, Reward: 729.0, Loss: -729.0


0it [00:00, ?it/s]

step length:  tensor([10.3226])
Epoch: 531, Step: 1, Reward: 1057.0, Loss: -1057.0


1it [00:00, 24.10it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4852])


1it [00:00, 20.47it/s]


Epoch: 532, Step: 1, Reward: 966.0, Loss: -966.0


0it [00:00, ?it/s]

step length:  tensor([10.7613])
Epoch: 533, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 20.34it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 22.70it/s]


 tensor([10.5366])
Epoch: 534, Step: 1, Reward: 724.0, Loss: -724.0


1it [00:00, 27.31it/s]


step length:  tensor([10.2473])
Epoch: 535, Step: 1, Reward: 974.0, Loss: -974.0


0it [00:00, ?it/s]

step length:  tensor([10.6205])


1it [00:00, 15.74it/s]


Epoch: 536, Step: 1, Reward: 713.0, Loss: -713.0


0it [00:00, ?it/s]

step length:  tensor([10.4563])
Epoch: 537, Step: 1, Reward: 1042.0, Loss: -1042.0


1it [00:00, 17.78it/s]
1it [00:00, 23.25it/s]


step length:  tensor([10.5048])
Epoch: 538, Step: 1, Reward: 1312.0, Loss: -1312.0


0it [00:00, ?it/s]

step length:  tensor([10.0364])
Epoch: 539, Step: 1, Reward: 1065.0, Loss: -1065.0


1it [00:00, 17.46it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7867])


1it [00:00, 22.91it/s]


Epoch: 540, Step: 1, Reward: 786.0, Loss: -786.0


0it [00:00, ?it/s]

step length:  tensor([10.6646])
Epoch: 541, Step: 1, Reward: 1317.0, Loss: -1317.0


1it [00:00, 17.34it/s]
1it [00:00, 22.24it/s]


step length:  tensor([10.4905])
Epoch: 542, Step: 1, Reward: 745.0, Loss: -745.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 23.20it/s]


 tensor([9.7924])
Epoch: 543, Step: 1, Reward: 1044.0, Loss: -1044.0


1it [00:00, 18.00it/s]


step length:  tensor([10.1713])
Epoch: 544, Step: 1, Reward: 783.0, Loss: -783.0


0it [00:00, ?it/s]

step length:  tensor([11.0588])


1it [00:00, 15.51it/s]


Epoch: 545, Step: 1, Reward: 1103.0, Loss: -1103.0


0it [00:00, ?it/s]

step length:  tensor([10.5547])


1it [00:00, 21.52it/s]


Epoch: 546, Step: 1, Reward: 895.0, Loss: -895.0


0it [00:00, ?it/s]

step length:  tensor([11.0510])
Epoch: 547, Step: 1, Reward: 706.0, Loss: -706.0


1it [00:00, 17.78it/s]
1it [00:00, 23.02it/s]


step length:  tensor([10.2082])
Epoch: 548, Step: 1, Reward: 757.0, Loss: -757.0


0it [00:00, ?it/s]

step length:  tensor([10.2386])
Epoch: 549, Step: 1, Reward: 1226.0, Loss: -1226.0


1it [00:00, 14.96it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 26.82it/s]


 tensor([10.6220])
Epoch: 550, Step: 1, Reward: 962.0, Loss: -962.0


0it [00:00, ?it/s]

step length:  tensor([10.3464])


1it [00:00, 16.12it/s]


Epoch: 551, Step: 1, Reward: 947.0, Loss: -947.0


1it [00:00, 19.46it/s]


step length:  tensor([10.7313])
Epoch: 552, Step: 1, Reward: 1199.0, Loss: -1199.0


0it [00:00, ?it/s]

step length:  tensor([11.3753])
Epoch: 553, Step: 1, Reward: 1118.0, Loss: -1118.0


1it [00:00, 15.21it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2382])
Epoch: 554, Step: 1, Reward: 1020.0, Loss: -1020.0


1it [00:00, 28.10it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 28.99it/s]


 tensor([10.2628])
Epoch: 555, Step: 1, Reward: 1215.0, Loss: -1215.0


0it [00:00, ?it/s]

step length:  tensor([10.3843])
Epoch: 556, Step: 1, Reward: 1082.0, Loss: -1082.0


1it [00:00, 18.45it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1008])
Epoch: 557, Step: 1, Reward: 1006.0, Loss: -1006.0


1it [00:00, 16.58it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3594])
Epoch: 558, Step: 1, Reward: 936.0, Loss: -936.0


1it [00:00, 19.17it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 20.00it/s]


 tensor([10.5095])
Epoch: 559, Step: 1, Reward: 1378.0, Loss: -1378.0


0it [00:00, ?it/s]

step length:  tensor([10.0860])
Epoch: 560, Step: 1, Reward: 949.0, Loss: -949.0


1it [00:00, 19.23it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0303])
Epoch: 561, Step: 1, Reward: 885.0, Loss: -885.0


1it [00:00, 20.00it/s]
1it [00:00, 22.22it/s]


step length:  tensor([10.8376])
Epoch: 562, Step: 1, Reward: 1091.0, Loss: -1091.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.27it/s]


tensor([10.8950])
Epoch: 563, Step: 1, Reward: 832.0, Loss: -832.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 30.77it/s]


 tensor([10.0551])
Epoch: 564, Step: 1, Reward: 875.0, Loss: -875.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 29.28it/s]


 tensor([10.1791])
Epoch: 565, Step: 1, Reward: 723.0, Loss: -723.0


1it [00:00, 15.51it/s]


step length:  tensor([10.6523])
Epoch: 566, Step: 1, Reward: 1221.0, Loss: -1221.0


0it [00:00, ?it/s]

step length:  tensor([10.7342])
Epoch: 567, Step: 1, Reward: 1137.0, Loss: -1137.0


1it [00:00, 19.82it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2936])
Epoch: 568, Step: 1, Reward: 840.0, Loss: -840.0


1it [00:00, 20.38it/s]
1it [00:00, 20.68it/s]


step length:  tensor([10.5452])
Epoch: 569, Step: 1, Reward: 887.0, Loss: -887.0


0it [00:00, ?it/s]

step length:  tensor([10.7842])


1it [00:00, 17.40it/s]


Epoch: 570, Step: 1, Reward: 904.0, Loss: -904.0


0it [00:00, ?it/s]

step length:  tensor([9.9644])
Epoch: 571, Step: 1, Reward: 1181.0, Loss: -1181.0


1it [00:00, 20.02it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9830])
Epoch: 572, Step: 1, Reward: 841.0, Loss: -841.0


1it [00:00, 16.56it/s]
1it [00:00, 15.22it/s]


step length:  tensor([10.2090])
Epoch: 573, Step: 1, Reward: 1033.0, Loss: -1033.0


0it [00:00, ?it/s]

step length:  tensor([11.2675])


1it [00:00, 15.82it/s]


Epoch: 574, Step: 1, Reward: 1309.0, Loss: -1309.0


0it [00:00, ?it/s]

step length:  tensor([10.5882])


1it [00:00, 19.45it/s]


Epoch: 575, Step: 1, Reward: 1220.0, Loss: -1220.0


0it [00:00, ?it/s]

step length:  tensor([10.3709])
Epoch: 576, Step: 1, Reward: 926.0, Loss: -926.0


1it [00:00, 21.33it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4059])
Epoch: 577, Step: 1, Reward: 1159.0, Loss: -1159.0


1it [00:00, 16.14it/s]
0it [00:00, ?it/s]

step length:  tensor([11.3345])
Epoch: 578, Step: 1, Reward: 1142.0, Loss: -1142.0


1it [00:00, 24.58it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5486])


1it [00:00, 18.12it/s]


Epoch: 579, Step: 1, Reward: 628.0, Loss: -628.0


0it [00:00, ?it/s]

step length:  tensor([10.6721])


1it [00:00, 17.75it/s]


Epoch: 580, Step: 1, Reward: 1348.0, Loss: -1348.0


1it [00:00, 26.53it/s]


step length:  tensor([10.2331])
Epoch: 581, Step: 1, Reward: 900.0, Loss: -900.0


0it [00:00, ?it/s]

step length:  tensor([10.5057])
Epoch: 582, Step: 1, Reward: 965.0, Loss: -965.0


1it [00:00, 19.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4220])


1it [00:00, 20.33it/s]


Epoch: 583, Step: 1, Reward: 893.0, Loss: -893.0


0it [00:00, ?it/s]

step length:  tensor([10.6859])


1it [00:00, 17.56it/s]


Epoch: 584, Step: 1, Reward: 1197.0, Loss: -1197.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 23.12it/s]


 tensor([10.4453])
Epoch: 585, Step: 1, Reward: 831.0, Loss: -831.0


0it [00:00, ?it/s]

step length:  tensor([10.6190])


1it [00:00, 15.06it/s]


Epoch: 586, Step: 1, Reward: 976.0, Loss: -976.0


0it [00:00, ?it/s]

step length:  tensor([11.4974])


1it [00:00, 15.92it/s]


Epoch: 587, Step: 1, Reward: 1009.0, Loss: -1009.0


0it [00:00, ?it/s]

step length:  tensor([10.0159])


1it [00:00, 18.98it/s]


Epoch: 588, Step: 1, Reward: 1047.0, Loss: -1047.0


0it [00:00, ?it/s]

step length:  tensor([10.6814])
Epoch: 589, Step: 1, Reward: 1065.0, Loss: -1065.0


1it [00:00, 17.93it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8464])
Epoch: 590, Step: 1, Reward: 1158.0, Loss: -1158.0


1it [00:00, 20.15it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0887])
Epoch: 591, Step: 1, Reward: 1081.0, Loss: -1081.0


1it [00:00, 19.81it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8497])
Epoch: 592, Step: 1, Reward: 969.0, Loss: -969.0


1it [00:00, 19.94it/s]
1it [00:00, 20.48it/s]


step length:  tensor([10.5918])
Epoch: 593, Step: 1, Reward: 1161.0, Loss: -1161.0


1it [00:00, 19.70it/s]


step length:  tensor([10.6130])
Epoch: 594, Step: 1, Reward: 1333.0, Loss: -1333.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 18.68it/s]


tensor([10.4417])
Epoch: 595, Step: 1, Reward: 975.0, Loss: -975.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.83it/s]


tensor([10.3947])
Epoch: 596, Step: 1, Reward: 892.0, Loss: -892.0


0it [00:00, ?it/s]

step length:  tensor([10.1085])
Epoch: 597, Step: 1, Reward: 653.0, Loss: -653.0


1it [00:00, 15.82it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5716])


1it [00:00, 18.44it/s]


Epoch: 598, Step: 1, Reward: 1323.0, Loss: -1323.0


0it [00:00, ?it/s]

step length:  tensor([10.2243])
Epoch: 599, Step: 1, Reward: 899.0, Loss: -899.0


1it [00:00, 20.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6262])
Epoch: 600, Step: 1, Reward: 938.0, Loss: -938.0


1it [00:00, 21.14it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8283])
Epoch: 601, Step: 1, Reward: 937.0, Loss: -937.0


1it [00:00, 19.21it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9357])
Epoch: 602, Step: 1, Reward: 1135.0, Loss: -1135.0


1it [00:00, 22.18it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2465])
Epoch: 603, Step: 1, Reward: 1100.0, Loss: -1100.0


1it [00:00, 18.89it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6917])
Epoch: 604, Step: 1, Reward: 819.0, Loss: -819.0


1it [00:00, 17.17it/s]
1it [00:00, 22.80it/s]


step length:  tensor([10.1010])
Epoch: 605, Step: 1, Reward: 696.0, Loss: -696.0


0it [00:00, ?it/s]

step length:  tensor([10.6036])


1it [00:00, 21.65it/s]


Epoch: 606, Step: 1, Reward: 785.0, Loss: -785.0


0it [00:00, ?it/s]

step length:  tensor([9.8818])
Epoch: 607, Step: 1, Reward: 999.0, Loss: -999.0


1it [00:00, 16.48it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9202])


1it [00:00, 17.35it/s]


Epoch: 608, Step: 1, Reward: 1165.0, Loss: -1165.0


0it [00:00, ?it/s]

step length:  tensor([10.2033])


1it [00:00, 16.69it/s]


Epoch: 609, Step: 1, Reward: 986.0, Loss: -986.0


0it [00:00, ?it/s]

step length:  tensor([10.2191])
Epoch: 610, Step: 1, Reward: 1400.0, Loss: -1400.0


1it [00:00, 17.07it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4539])
Epoch: 611, Step: 1, Reward: 1069.0, Loss: -1069.0


1it [00:00, 16.99it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9377])


1it [00:00, 20.52it/s]


Epoch: 612, Step: 1, Reward: 792.0, Loss: -792.0


0it [00:00, ?it/s]

step length:  tensor([10.5993])
Epoch: 613, Step: 1, Reward: 847.0, Loss: -847.0


1it [00:00, 17.75it/s]
0it [00:00, ?it/s]

step length:  tensor([11.1239])
Epoch: 614, Step: 1, Reward: 773.0, Loss: -773.0


1it [00:00, 18.21it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0396])
Epoch: 615, Step: 1, Reward: 943.0, Loss: -943.0


1it [00:00, 17.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8410])
Epoch: 616, Step: 1, Reward: 782.0, Loss: -782.0


1it [00:00, 22.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7082])
Epoch: 617, Step: 1, Reward: 907.0, Loss: -907.0


1it [00:00, 15.10it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8052])


1it [00:00, 20.02it/s]


Epoch: 618, Step: 1, Reward: 1066.0, Loss: -1066.0


0it [00:00, ?it/s]

step length:  tensor([9.8301])


1it [00:00, 20.66it/s]


Epoch: 619, Step: 1, Reward: 857.0, Loss: -857.0


1it [00:00, 20.55it/s]


step length:  tensor([10.5279])
Epoch: 620, Step: 1, Reward: 1107.0, Loss: -1107.0


0it [00:00, ?it/s]

step length:  tensor([10.5053])
Epoch: 621, Step: 1, Reward: 1347.0, Loss: -1347.0


1it [00:00, 16.12it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0973])


1it [00:00, 21.33it/s]


Epoch: 622, Step: 1, Reward: 850.0, Loss: -850.0


0it [00:00, ?it/s]

step length:  tensor([10.7592])


1it [00:00, 21.33it/s]


Epoch: 623, Step: 1, Reward: 1116.0, Loss: -1116.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.33it/s]


 tensor([9.8914])
Epoch: 624, Step: 1, Reward: 1134.0, Loss: -1134.0


0it [00:00, ?it/s]

step length:  tensor([9.7428])


1it [00:00, 17.03it/s]


Epoch: 625, Step: 1, Reward: 1018.0, Loss: -1018.0


0it [00:00, ?it/s]

step length:  tensor([10.0900])


1it [00:00, 16.00it/s]


Epoch: 626, Step: 1, Reward: 1046.0, Loss: -1046.0


0it [00:00, ?it/s]

step length:  tensor([10.5244])


1it [00:00, 16.23it/s]


Epoch: 627, Step: 1, Reward: 1119.0, Loss: -1119.0


0it [00:00, ?it/s]

step length:  tensor([10.3808])


1it [00:00, 17.61it/s]


Epoch: 628, Step: 1, Reward: 1010.0, Loss: -1010.0


0it [00:00, ?it/s]

step length:  tensor([10.4118])
Epoch: 629, Step: 1, Reward: 905.0, Loss: -905.0


1it [00:00, 24.08it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6827])
Epoch: 630, Step: 1, Reward: 1051.0, Loss: -1051.0


1it [00:00, 20.70it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5697])
Epoch: 631, Step: 1, Reward: 1333.0, Loss: -1333.0


1it [00:00, 21.29it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9415])
Epoch: 632, Step: 1, Reward: 916.0, Loss: -916.0


1it [00:00, 18.09it/s]
1it [00:00, 20.83it/s]


step length:  tensor([10.0882])
Epoch: 633, Step: 1, Reward: 984.0, Loss: -984.0


1it [00:00, 18.93it/s]


step length:  tensor([9.9658])
Epoch: 634, Step: 1, Reward: 719.0, Loss: -719.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.96it/s]


 tensor([10.5727])
Epoch: 635, Step: 1, Reward: 1005.0, Loss: -1005.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.81it/s]


tensor([10.9330])
Epoch: 636, Step: 1, Reward: 1145.0, Loss: -1145.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.36it/s]


 tensor([10.7358])
Epoch: 637, Step: 1, Reward: 1168.0, Loss: -1168.0


0it [00:00, ?it/s]

step length:  tensor([10.3667])
Epoch: 638, Step: 1, Reward: 1019.0, Loss: -1019.0


1it [00:00, 20.68it/s]
1it [00:00, 27.95it/s]


step length:  tensor([11.0607])
Epoch: 639, Step: 1, Reward: 869.0, Loss: -869.0


0it [00:00, ?it/s]

step length:  tensor([10.4113])
Epoch: 640, Step: 1, Reward: 949.0, Loss: -949.0


1it [00:00, 15.33it/s]
1it [00:00, 20.34it/s]


step length:  tensor([9.9191])
Epoch: 641, Step: 1, Reward: 1169.0, Loss: -1169.0


0it [00:00, ?it/s]

step length:  tensor([10.1050])
Epoch: 642, Step: 1, Reward: 867.0, Loss: -867.0


1it [00:00, 18.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5038])
Epoch: 643, Step: 1, Reward: 1041.0, Loss: -1041.0


1it [00:00, 21.25it/s]
0it [00:00, ?it/s]

step length:  tensor([11.1356])


1it [00:00, 20.10it/s]


Epoch: 644, Step: 1, Reward: 641.0, Loss: -641.0


0it [00:00, ?it/s]

step length:  tensor([10.3191])


1it [00:00, 20.52it/s]


Epoch: 645, Step: 1, Reward: 896.0, Loss: -896.0


1it [00:00, 19.62it/s]


step length:  tensor([10.4366])
Epoch: 646, Step: 1, Reward: 1134.0, Loss: -1134.0


0it [00:00, ?it/s]

step length:  tensor([10.1983])
Epoch: 647, Step: 1, Reward: 1063.0, Loss: -1063.0


1it [00:00, 19.78it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0914])
Epoch: 648, Step: 1, Reward: 904.0, Loss: -904.0


1it [00:00, 19.79it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0892])


1it [00:00, 19.78it/s]


Epoch: 649, Step: 1, Reward: 857.0, Loss: -857.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 20.40it/s]


 tensor([10.4898])
Epoch: 650, Step: 1, Reward: 673.0, Loss: -673.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.16it/s]


 tensor([10.2573])
Epoch: 651, Step: 1, Reward: 1049.0, Loss: -1049.0


0it [00:00, ?it/s]

step length:  tensor([11.1762])
Epoch: 652, Step: 1, Reward: 976.0, Loss: -976.0


1it [00:00, 19.15it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1823])
Epoch: 653, Step: 1, Reward: 919.0, Loss: -919.0


1it [00:00, 22.33it/s]
1it [00:00, 16.85it/s]


step length:  tensor([10.4761])
Epoch: 654, Step: 1, Reward: 976.0, Loss: -976.0


1it [00:00, 15.37it/s]


step length:  tensor([10.4210])
Epoch: 655, Step: 1, Reward: 856.0, Loss: -856.0


0it [00:00, ?it/s]

step length:  tensor([9.9326])


1it [00:00, 19.52it/s]


Epoch: 656, Step: 1, Reward: 1133.0, Loss: -1133.0


0it [00:00, ?it/s]

step length:  tensor([9.6901])
Epoch: 657, Step: 1, Reward: 1075.0, Loss: -1075.0


1it [00:00, 22.04it/s]
0it [00:00, ?it/s]

step length:  tensor([9.4416])
Epoch: 658, Step: 1, Reward: 810.0, Loss: -810.0


1it [00:00, 21.33it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8428])


1it [00:00, 18.50it/s]


Epoch: 659, Step: 1, Reward: 983.0, Loss: -983.0


0it [00:00, ?it/s]

step length:  tensor([9.8997])
Epoch: 660, Step: 1, Reward: 977.0, Loss: -977.0


1it [00:00, 25.42it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2432])


1it [00:00, 16.23it/s]


Epoch: 661, Step: 1, Reward: 1146.0, Loss: -1146.0


0it [00:00, ?it/s]

step length:  tensor([9.7927])


1it [00:00, 20.88it/s]


Epoch: 662, Step: 1, Reward: 1082.0, Loss: -1082.0


0it [00:00, ?it/s]

step length:  tensor([10.1854])
Epoch: 663, Step: 1, Reward: 854.0, Loss: -854.0


1it [00:00, 19.43it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4548])
Epoch: 664, Step: 1, Reward: 1273.0, Loss: -1273.0


1it [00:00, 14.90it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2882])


1it [00:00, 19.51it/s]


Epoch: 665, Step: 1, Reward: 1027.0, Loss: -1027.0


0it [00:00, ?it/s]

step length:  tensor([10.0208])
Epoch: 666, Step: 1, Reward: 976.0, Loss: -976.0


1it [00:00, 20.54it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2450])
Epoch: 667, Step: 1, Reward: 962.0, Loss: -962.0


1it [00:00, 20.53it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0288])
Epoch: 668, Step: 1, Reward: 1050.0, Loss: -1050.0


1it [00:00, 19.76it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3206])
Epoch: 669, Step: 1, Reward: 1075.0, Loss: -1075.0


1it [00:00, 19.61it/s]
1it [00:00, 19.69it/s]


step length:  tensor([9.7827])
Epoch: 670, Step: 1, Reward: 1290.0, Loss: -1290.0


0it [00:00, ?it/s]

step length:  tensor([10.4320])


1it [00:00, 19.51it/s]


Epoch: 671, Step: 1, Reward: 1183.0, Loss: -1183.0


0it [00:00, ?it/s]

step length:  tensor([10.2879])


1it [00:00, 14.92it/s]


Epoch: 672, Step: 1, Reward: 736.0, Loss: -736.0


0it [00:00, ?it/s]

step length:  tensor([11.1110])


1it [00:00, 20.10it/s]


Epoch: 673, Step: 1, Reward: 1156.0, Loss: -1156.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 18.23it/s]


tensor([11.1978])
Epoch: 674, Step: 1, Reward: 876.0, Loss: -876.0


0it [00:00, ?it/s]

step length:  tensor([9.3221])
Epoch: 675, Step: 1, Reward: 888.0, Loss: -888.0

1it [00:00, 20.46it/s]





0it [00:00, ?it/s]

step length:  tensor([10.1880])


1it [00:00, 20.14it/s]


Epoch: 676, Step: 1, Reward: 1051.0, Loss: -1051.0


0it [00:00, ?it/s]

step length:  tensor([10.5103])
Epoch: 677, Step: 1, Reward: 926.0, Loss: -926.0


1it [00:00, 16.68it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1799])
Epoch: 678, Step: 1, Reward: 1224.0, Loss: -1224.0


1it [00:00, 19.59it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3337])
Epoch: 679, Step: 1, Reward: 749.0, Loss: -749.0


1it [00:00, 19.98it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7717])
Epoch: 680, Step: 1, Reward: 969.0, Loss: -969.0


1it [00:00, 19.77it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8612])
Epoch: 681, Step: 1, Reward: 912.0, Loss: -912.0


1it [00:00, 19.23it/s]
1it [00:00, 21.29it/s]


step length:  tensor([10.1117])
Epoch: 682, Step: 1, Reward: 880.0, Loss: -880.0


0it [00:00, ?it/s]

step length:  tensor([10.0420])
Epoch: 683, Step: 1, Reward: 859.0, Loss: -859.0


1it [00:00, 17.40it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4203])
Epoch: 684, Step: 1, Reward: 1204.0, Loss: -1204.0


1it [00:00, 17.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3463])
Epoch: 685, Step: 1, Reward: 892.0, Loss: -892.0


1it [00:00, 19.70it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3287])
Epoch: 686, Step: 1, Reward: 999.0, Loss: -999.0


1it [00:00, 19.46it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5469])
Epoch: 687, Step: 1, Reward: 1144.0, Loss: -1144.0


1it [00:00, 18.30it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6196])


1it [00:00, 18.58it/s]


Epoch: 688, Step: 1, Reward: 965.0, Loss: -965.0


0it [00:00, ?it/s]

step length:  tensor([10.7275])
Epoch: 689, Step: 1, Reward: 779.0, Loss: -779.0


1it [00:00, 18.59it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2129])


1it [00:00, 18.78it/s]


Epoch: 690, Step: 1, Reward: 1031.0, Loss: -1031.0


0it [00:00, ?it/s]

step length:  tensor([10.6154])


1it [00:00, 18.62it/s]


Epoch: 691, Step: 1, Reward: 881.0, Loss: -881.0


0it [00:00, ?it/s]

step length:  tensor([10.6843])
Epoch: 692, Step: 1, Reward: 1085.0, Loss: -1085.0


1it [00:00, 19.34it/s]
1it [00:00, 23.28it/s]


step length:  tensor([10.2170])
Epoch: 693, Step: 1, Reward: 1007.0, Loss: -1007.0


0it [00:00, ?it/s]

step length:  tensor([10.9958])
Epoch: 694, Step: 1, Reward: 1276.0, Loss: -1276.0


1it [00:00, 13.78it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4895])
Epoch: 695, Step: 1, Reward: 877.0, Loss: -877.0


1it [00:00, 15.22it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 19.44it/s]


 tensor([10.4116])
Epoch: 696, Step: 1, Reward: 970.0, Loss: -970.0


0it [00:00, ?it/s]

step length:  tensor([10.9242])
Epoch: 697, Step: 1, Reward: 1073.0, Loss: -1073.0


1it [00:00, 19.59it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2038])
Epoch: 698, Step: 1, Reward: 947.0, Loss: -947.0


1it [00:00, 23.38it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2713])
Epoch: 699, Step: 1, Reward: 1160.0, Loss: -1160.0


1it [00:00, 15.71it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7955])
Epoch: 700, Step: 1, Reward: 1023.0, Loss: -1023.0


1it [00:00, 17.42it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0420])
Epoch: 701, Step: 1, Reward: 931.0, Loss: -931.0


1it [00:00, 20.03it/s]
1it [00:00, 25.07it/s]


step length:  tensor([10.3892])
Epoch: 702, Step: 1, Reward: 885.0, Loss: -885.0


0it [00:00, ?it/s]

step length:  tensor([10.3952])


1it [00:00, 15.35it/s]


Epoch: 703, Step: 1, Reward: 1366.0, Loss: -1366.0


0it [00:00, ?it/s]

step length:  tensor([11.1183])


1it [00:00, 19.36it/s]


Epoch: 704, Step: 1, Reward: 1181.0, Loss: -1181.0


0it [00:00, ?it/s]

step length:  tensor([10.3198])


1it [00:00, 19.60it/s]


Epoch: 705, Step: 1, Reward: 920.0, Loss: -920.0


1it [00:00, 20.76it/s]


step length:  tensor([10.2937])
Epoch: 706, Step: 1, Reward: 617.0, Loss: -617.0


0it [00:00, ?it/s]

step length:  tensor([10.1897])


1it [00:00, 17.53it/s]


Epoch: 707, Step: 1, Reward: 1082.0, Loss: -1082.0


0it [00:00, ?it/s]

step length:  tensor([11.3666])
Epoch: 708, Step: 1, Reward: 1070.0, Loss: -1070.0


1it [00:00, 17.11it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 15.70it/s]


tensor([10.5461])
Epoch: 709, Step: 1, Reward: 1263.0, Loss: -1263.0


0it [00:00, ?it/s]

step length:  tensor([10.0173])


1it [00:00, 19.58it/s]


Epoch: 710, Step: 1, Reward: 888.0, Loss: -888.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.03it/s]

 tensor([9.6654])
Epoch: 711, Step: 1, Reward: 840.0, Loss: -840.0



0it [00:00, ?it/s]

step length:  tensor([10.1137])
Epoch: 712, Step: 1, Reward: 1003.0, Loss: -1003.0


1it [00:00, 20.93it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4441])
Epoch: 713, Step: 1, Reward: 1100.0, Loss: -1100.0


1it [00:00, 18.81it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8529])
Epoch: 714, Step: 1, Reward: 1068.0, Loss: -1068.0


1it [00:00, 18.52it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7109])
Epoch: 715, Step: 1, Reward: 894.0, Loss: -894.0


1it [00:00, 19.62it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5191])
Epoch: 716, Step: 1, Reward: 931.0, Loss: -931.0


1it [00:00, 19.34it/s]
1it [00:00, 19.41it/s]

step length:  tensor([10.9722])
Epoch: 717, Step: 1, Reward: 879.0, Loss: -879.0



0it [00:00, ?it/s]

step length:  tensor([10.1952])


1it [00:00, 18.50it/s]


Epoch: 718, Step: 1, Reward: 867.0, Loss: -867.0


0it [00:00, ?it/s]

step length:  tensor([9.3694])
Epoch: 719, Step: 1, Reward: 750.0, Loss: -750.0


1it [00:00, 20.47it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1866])
Epoch: 720, Step: 1, Reward: 853.0, Loss: -853.0


1it [00:00, 18.92it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0299])
Epoch: 721, Step: 1, Reward: 1211.0, Loss: -1211.0


1it [00:00, 17.81it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1253])
Epoch: 722, Step: 1, Reward: 803.0, Loss: -803.0


1it [00:00, 20.38it/s]
1it [00:00, 19.33it/s]

step length:  tensor([10.8223])
Epoch: 723, Step: 1, Reward: 856.0, Loss: -856.0



1it [00:00, 21.37it/s]

step length:  tensor([10.2776])
Epoch: 724, Step: 1, Reward: 927.0, Loss: -927.0



0it [00:00, ?it/s]

step length:  tensor([10.4705])


1it [00:00, 17.45it/s]


Epoch: 725, Step: 1, Reward: 993.0, Loss: -993.0


0it [00:00, ?it/s]

step length:  tensor([10.1471])
Epoch: 726, Step: 1, Reward: 946.0, Loss: -946.0


1it [00:00, 19.78it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7533])
Epoch: 727, Step: 1, Reward: 1003.0, Loss: -1003.0


1it [00:00, 19.05it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9311])
Epoch: 728, Step: 1, Reward: 949.0, Loss: -949.0


1it [00:00, 20.28it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 18.20it/s]

 tensor([9.6641])
Epoch: 729, Step: 1, Reward: 1046.0, Loss: -1046.0



0it [00:00, ?it/s]

step length: 

1it [00:00, 20.39it/s]


 tensor([10.3926])
Epoch: 730, Step: 1, Reward: 912.0, Loss: -912.0


0it [00:00, ?it/s]

step length:  tensor([10.4173])


1it [00:00, 19.37it/s]


Epoch: 731, Step: 1, Reward: 1106.0, Loss: -1106.0


0it [00:00, ?it/s]

step length:  tensor([10.5947])
Epoch: 732, Step: 1, Reward: 1107.0, Loss: -1107.0


1it [00:00, 23.01it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 20.54it/s]


 tensor([10.5440])
Epoch: 733, Step: 1, Reward: 655.0, Loss: -655.0


0it [00:00, ?it/s]

step length:  tensor([10.0074])
Epoch: 734, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 15.16it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8089])
Epoch: 735, Step: 1, Reward: 1288.0, Loss: -1288.0


1it [00:00, 16.74it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 18.88it/s]


tensor([9.9994])
Epoch: 736, Step: 1, Reward: 853.0, Loss: -853.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 19.53it/s]


tensor([10.1984])
Epoch: 737, Step: 1, Reward: 1140.0, Loss: -1140.0


0it [00:00, ?it/s]

step length:  tensor([10.6766])
Epoch: 738, Step: 1, Reward: 1122.0, Loss: -1122.0


1it [00:00, 14.75it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0240])
Epoch: 739, Step: 1, Reward: 1029.0, Loss: -1029.0


1it [00:00, 23.37it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2051])
Epoch: 740, Step: 1, Reward: 1262.0, Loss: -1262.0


1it [00:00, 19.39it/s]
0it [00:00, ?it/s]

step length:  tensor([9.6779])
Epoch: 741, Step: 1, Reward: 1096.0, Loss: -1096.0


1it [00:00, 20.00it/s]
1it [00:00, 25.82it/s]


step length:  tensor([9.8605])
Epoch: 742, Step: 1, Reward: 926.0, Loss: -926.0


1it [00:00, 19.47it/s]

step length:  tensor([10.5421])
Epoch: 743, Step: 1, Reward: 1010.0, Loss: -1010.0



1it [00:00, 20.21it/s]


step length:  tensor([10.3512])
Epoch: 744, Step: 1, Reward: 980.0, Loss: -980.0


0it [00:00, ?it/s]

step length:  tensor([11.1971])
Epoch: 745, Step: 1, Reward: 830.0, Loss: -830.0


1it [00:00, 19.03it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2874])
Epoch: 746, Step: 1, Reward: 959.0, Loss: -959.0


1it [00:00, 19.59it/s]
1it [00:00, 21.31it/s]


step length:  tensor([11.3848])
Epoch: 747, Step: 1, Reward: 969.0, Loss: -969.0


1it [00:00, 19.23it/s]


step length:  tensor([10.3706])
Epoch: 748, Step: 1, Reward: 943.0, Loss: -943.0


1it [00:00, 20.46it/s]


step length:  tensor([10.0513])
Epoch: 749, Step: 1, Reward: 1154.0, Loss: -1154.0


0it [00:00, ?it/s]

step length:  tensor([10.3854])
Epoch: 750, Step: 1, Reward: 914.0, Loss: -914.0


1it [00:00, 19.85it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9474])
Epoch: 751, Step: 1, Reward: 1224.0, Loss: -1224.0


1it [00:00, 19.57it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5494])


1it [00:00, 19.49it/s]


Epoch: 752, Step: 1, Reward: 999.0, Loss: -999.0


0it [00:00, ?it/s]

step length:  tensor([11.0119])
Epoch: 753, Step: 1, Reward: 867.0, Loss: -867.0


1it [00:00, 20.25it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5459])


1it [00:00, 19.73it/s]


Epoch: 754, Step: 1, Reward: 772.0, Loss: -772.0


0it [00:00, ?it/s]

step length:  tensor([10.4788])


1it [00:00, 20.67it/s]


Epoch: 755, Step: 1, Reward: 831.0, Loss: -831.0


0it [00:00, ?it/s]

step length:  tensor([10.0406])
Epoch: 756, Step: 1, Reward: 833.0, Loss: -833.0


1it [00:00, 15.17it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 17.69it/s]


 tensor([10.7618])
Epoch: 757, Step: 1, Reward: 953.0, Loss: -953.0


0it [00:00, ?it/s]

step length:  tensor([10.0025])
Epoch: 758, Step: 1, Reward: 1177.0, Loss: -1177.0


1it [00:00, 20.11it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5423])
Epoch: 759, Step: 1, Reward: 1047.0, Loss: -1047.0


1it [00:00, 18.84it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5323])
Epoch: 760, Step: 1, Reward: 1047.0, Loss: -1047.0


1it [00:00, 15.79it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1852])
Epoch: 761, Step: 1, Reward: 1045.0, Loss: -1045.0


1it [00:00, 23.59it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4388])
Epoch: 762, Step: 1, Reward: 918.0, Loss: -918.0


1it [00:00, 14.83it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2370])
Epoch: 763, Step: 1, Reward: 1076.0, Loss: -1076.0


1it [00:00, 26.10it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0573])
Epoch: 764, Step: 1, Reward: 1177.0, Loss: -1177.0


1it [00:00, 16.34it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1945])
Epoch: 765, Step: 1, Reward: 988.0, Loss: -988.0


1it [00:00, 20.35it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4863])


1it [00:00, 20.05it/s]


Epoch: 766, Step: 1, Reward: 1028.0, Loss: -1028.0


1it [00:00, 19.85it/s]


step length:  tensor([10.4803])
Epoch: 767, Step: 1, Reward: 859.0, Loss: -859.0


0it [00:00, ?it/s]

step length:  tensor([10.2333])
Epoch: 768, Step: 1, Reward: 679.0, Loss: -679.0


1it [00:00, 18.11it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7396])
Epoch: 769, Step: 1, Reward: 831.0, Loss: -831.0


1it [00:00, 20.72it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7299])


1it [00:00, 15.29it/s]


Epoch: 770, Step: 1, Reward: 992.0, Loss: -992.0


0it [00:00, ?it/s]

step length:  tensor([10.2403])
Epoch: 771, Step: 1, Reward: 870.0, Loss: -870.0


1it [00:00, 22.07it/s]
1it [00:00, 19.79it/s]


step length:  tensor([10.4286])
Epoch: 772, Step: 1, Reward: 921.0, Loss: -921.0


0it [00:00, ?it/s]

step length:  tensor([10.5725])
Epoch: 773, Step: 1, Reward: 1051.0, Loss: -1051.0


1it [00:00, 20.58it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8207])
Epoch: 774, Step: 1, Reward: 1017.0, Loss: -1017.0


1it [00:00, 16.73it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 17.53it/s]


 tensor([10.4915])
Epoch: 775, Step: 1, Reward: 1171.0, Loss: -1171.0


0it [00:00, ?it/s]

step length:  tensor([9.5776])
Epoch: 776, Step: 1, Reward: 1155.0, Loss: -1155.0


1it [00:00, 18.73it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5989])
Epoch: 777, Step: 1, Reward: 1123.0, Loss: -1123.0


1it [00:00, 18.94it/s]
1it [00:00, 21.57it/s]


step length:  tensor([10.3923])
Epoch: 778, Step: 1, Reward: 856.0, Loss: -856.0


0it [00:00, ?it/s]

step length:  tensor([10.3080])
Epoch: 779, Step: 1, Reward: 974.0, Loss: -974.0


1it [00:00, 19.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3900])
Epoch: 780, Step: 1, Reward: 1061.0, Loss: -1061.0


1it [00:00, 16.10it/s]
1it [00:00, 20.45it/s]


step length:  tensor([10.8527])
Epoch: 781, Step: 1, Reward: 1089.0, Loss: -1089.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 19.98it/s]


 tensor([10.2246])
Epoch: 782, Step: 1, Reward: 1079.0, Loss: -1079.0


0it [00:00, ?it/s]

step length:  tensor([10.1646])
Epoch: 783, Step: 1, Reward: 783.0, Loss: -783.0


1it [00:00, 18.47it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2415])


1it [00:00, 15.34it/s]


Epoch: 784, Step: 1, Reward: 982.0, Loss: -982.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 17.29it/s]


tensor([10.1926])
Epoch: 785, Step: 1, Reward: 994.0, Loss: -994.0


0it [00:00, ?it/s]

step length:  tensor([10.0367])
Epoch: 786, Step: 1, Reward: 1098.0, Loss: -1098.0


1it [00:00, 20.04it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 27.98it/s]


 tensor([10.3984])
Epoch: 787, Step: 1, Reward: 1038.0, Loss: -1038.0


0it [00:00, ?it/s]

step length:  tensor([11.1886])
Epoch: 788, Step: 1, Reward: 980.0, Loss: -980.0


1it [00:00, 15.03it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4430])
Epoch: 789, Step: 1, Reward: 1037.0, Loss: -1037.0


1it [00:00, 17.70it/s]
1it [00:00, 21.16it/s]


step length:  tensor([10.1858])
Epoch: 790, Step: 1, Reward: 971.0, Loss: -971.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 21.51it/s]


tensor([11.4045])
Epoch: 791, Step: 1, Reward: 1249.0, Loss: -1249.0


1it [00:00, 17.22it/s]


step length:  tensor([9.8885])
Epoch: 792, Step: 1, Reward: 931.0, Loss: -931.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 23.89it/s]


 tensor([10.2004])
Epoch: 793, Step: 1, Reward: 953.0, Loss: -953.0


0it [00:00, ?it/s]

step length:  tensor([10.8994])


1it [00:00, 13.93it/s]


Epoch: 794, Step: 1, Reward: 892.0, Loss: -892.0


0it [00:00, ?it/s]

step length:  tensor([9.7906])
Epoch: 795, Step: 1, Reward: 955.0, Loss: -955.0


1it [00:00, 19.11it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7273])
Epoch: 796, Step: 1, Reward: 880.0, Loss: -880.0


1it [00:00, 24.05it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7446])
Epoch: 797, Step: 1, Reward: 950.0, Loss: -950.0


1it [00:00, 18.40it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5192])
Epoch: 798, Step: 1, Reward: 1039.0, Loss: -1039.0


1it [00:00, 19.53it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0594])


1it [00:00, 19.14it/s]


Epoch: 799, Step: 1, Reward: 1187.0, Loss: -1187.0


1it [00:00, 20.85it/s]


step length:  tensor([10.6526])
Epoch: 800, Step: 1, Reward: 978.0, Loss: -978.0


0it [00:00, ?it/s]

step length:  tensor([10.3260])
Epoch: 801, Step: 1, Reward: 1264.0, Loss: -1264.0


1it [00:00, 19.25it/s]
0it [00:00, ?it/s]

step length:  tensor([11.4998])
Epoch: 802, Step: 1, Reward: 945.0, Loss: -945.0


1it [00:00, 20.92it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4548])
Epoch: 803, Step: 1, Reward: 582.0, Loss: -582.0


1it [00:00, 19.15it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3029])


1it [00:00, 15.57it/s]


Epoch: 804, Step: 1, Reward: 782.0, Loss: -782.0


0it [00:00, ?it/s]

step length:  tensor([10.1134])


1it [00:00, 14.88it/s]


Epoch: 805, Step: 1, Reward: 934.0, Loss: -934.0


0it [00:00, ?it/s]

step length:  tensor([9.7312])


1it [00:00, 19.34it/s]


Epoch: 806, Step: 1, Reward: 819.0, Loss: -819.0


0it [00:00, ?it/s]

step length:  tensor([9.9846])


1it [00:00, 21.52it/s]


Epoch: 807, Step: 1, Reward: 905.0, Loss: -905.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 18.29it/s]


 tensor([9.6018])
Epoch: 808, Step: 1, Reward: 990.0, Loss: -990.0


0it [00:00, ?it/s]

step length:  tensor([10.0603])
Epoch: 809, Step: 1, Reward: 1051.0, Loss: -1051.0


1it [00:00, 19.79it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2820])


1it [00:00, 17.19it/s]


Epoch: 810, Step: 1, Reward: 1113.0, Loss: -1113.0


0it [00:00, ?it/s]

step length:  tensor([9.9246])
Epoch: 811, Step: 1, Reward: 1225.0, Loss: -1225.0


1it [00:00, 18.75it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1670])
Epoch: 812, Step: 1, Reward: 736.0, Loss: -736.0


1it [00:00, 24.09it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9934])


1it [00:00, 12.39it/s]


Epoch: 813, Step: 1, Reward: 1303.0, Loss: -1303.0


0it [00:00, ?it/s]

step length:  tensor([10.9060])


1it [00:00, 18.47it/s]


Epoch: 814, Step: 1, Reward: 1034.0, Loss: -1034.0


0it [00:00, ?it/s]

step length:  tensor([10.7738])
Epoch: 815, Step: 1, Reward: 936.0, Loss: -936.0


1it [00:00, 21.70it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 16.99it/s]


 tensor([10.2428])
Epoch: 816, Step: 1, Reward: 1035.0, Loss: -1035.0


0it [00:00, ?it/s]

step length:  tensor([10.6678])


1it [00:00, 17.20it/s]


Epoch: 817, Step: 1, Reward: 858.0, Loss: -858.0


0it [00:00, ?it/s]

step length:  tensor([10.6762])
Epoch: 818, Step: 1, Reward: 1104.0, Loss: -1104.0


1it [00:00, 16.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9491])
Epoch: 819, Step: 1, Reward: 904.0, Loss: -904.0


1it [00:00, 17.79it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 21.61it/s]


tensor([10.1492])
Epoch: 820, Step: 1, Reward: 1176.0, Loss: -1176.0


0it [00:00, ?it/s]

step length:  tensor([10.4227])
Epoch: 821, Step: 1, Reward: 1025.0, Loss: -1025.0


1it [00:00, 14.92it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9387])
Epoch: 822, Step: 1, Reward: 1033.0, Loss: -1033.0


1it [00:00, 20.22it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0448])
Epoch: 823, Step: 1, Reward: 897.0, Loss: -897.0


1it [00:00, 22.76it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3159])
Epoch: 824, Step: 1, Reward: 1004.0, Loss: -1004.0


1it [00:00, 22.84it/s]
1it [00:00, 19.29it/s]


step length:  tensor([10.9732])
Epoch: 825, Step: 1, Reward: 1094.0, Loss: -1094.0


1it [00:00, 20.03it/s]


step length:  tensor([9.7318])
Epoch: 826, Step: 1, Reward: 1319.0, Loss: -1319.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.80it/s]


 tensor([10.4119])
Epoch: 827, Step: 1, Reward: 1049.0, Loss: -1049.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 17.48it/s]


 tensor([10.2653])
Epoch: 828, Step: 1, Reward: 925.0, Loss: -925.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 22.29it/s]


 tensor([10.6338])
Epoch: 829, Step: 1, Reward: 713.0, Loss: -713.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 21.83it/s]

 tensor([10.4038])
Epoch: 830, Step: 1, Reward: 989.0, Loss: -989.0



0it [00:00, ?it/s]

step length:  tensor([10.6615])
Epoch: 831, Step: 1, Reward: 1095.0, Loss: -1095.0


1it [00:00, 19.26it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5049])


1it [00:00, 14.96it/s]


Epoch: 832, Step: 1, Reward: 1243.0, Loss: -1243.0


0it [00:00, ?it/s]

step length:  tensor([10.2255])
Epoch: 833, Step: 1, Reward: 901.0, Loss: -901.0


1it [00:00, 18.10it/s]
0it [00:00, ?it/s]

step length: 

1it [00:00, 20.19it/s]


 tensor([10.1396])
Epoch: 834, Step: 1, Reward: 942.0, Loss: -942.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 18.22it/s]


 tensor([10.4119])
Epoch: 835, Step: 1, Reward: 1006.0, Loss: -1006.0


0it [00:00, ?it/s]

step length:  tensor([9.5913])
Epoch: 836, Step: 1, Reward: 799.0, Loss: -799.0


1it [00:00, 19.14it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5408])
Epoch: 837, Step: 1, Reward: 1144.0, Loss: -1144.0


1it [00:00, 17.85it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7689])
Epoch: 838, Step: 1, Reward: 976.0, Loss: -976.0


1it [00:00, 23.84it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4052])
Epoch: 839, Step: 1, Reward: 914.0, Loss: -914.0


1it [00:00, 19.57it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9806])
Epoch: 840, Step: 1, Reward: 704.0, Loss: -704.0


1it [00:00, 20.28it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4381])
Epoch: 841, Step: 1, Reward: 1008.0, Loss: -1008.0


1it [00:00, 18.60it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1640])
Epoch: 842, Step: 1, Reward: 916.0, Loss: -916.0


1it [00:00, 19.29it/s]
1it [00:00, 19.20it/s]


step length:  tensor([10.2573])
Epoch: 843, Step: 1, Reward: 889.0, Loss: -889.0


0it [00:00, ?it/s]

step length:  tensor([10.6029])
Epoch: 844, Step: 1, Reward: 994.0, Loss: -994.0


1it [00:00, 24.49it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7283])
Epoch: 845, Step: 1, Reward: 1000.0, Loss: -1000.0


1it [00:00, 18.67it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2117])
Epoch: 846, Step: 1, Reward: 845.0, Loss: -845.0


1it [00:00, 19.94it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4874])
Epoch: 847, Step: 1, Reward: 944.0, Loss: -944.0


1it [00:00, 17.47it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9372])
Epoch: 848, Step: 1, Reward: 1005.0, Loss: -1005.0


1it [00:00, 21.00it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8113])
Epoch: 849, Step: 1, Reward: 918.0, Loss: -918.0


1it [00:00, 21.68it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8872])


1it [00:00, 18.42it/s]


Epoch: 850, Step: 1, Reward: 965.0, Loss: -965.0


0it [00:00, ?it/s]

step length:  tensor([10.2297])
Epoch: 851, Step: 1, Reward: 907.0, Loss: -907.0


1it [00:00, 18.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3080])
Epoch: 852, Step: 1, Reward: 795.0, Loss: -795.0


1it [00:00, 17.52it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0180])
Epoch: 853, Step: 1, Reward: 957.0, Loss: -957.0


1it [00:00, 23.89it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3815])
Epoch: 854, Step: 1, Reward: 857.0, Loss: -857.0


1it [00:00, 20.41it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7838])
Epoch: 855, Step: 1, Reward: 1128.0, Loss: -1128.0


1it [00:00, 19.75it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7052])
Epoch: 856, Step: 1, Reward: 680.0, Loss: -680.0


1it [00:00, 18.71it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8896])
Epoch: 857, Step: 1, Reward: 1089.0, Loss: -1089.0


1it [00:00, 20.42it/s]
1it [00:00, 22.15it/s]


step length:  tensor([9.9215])
Epoch: 858, Step: 1, Reward: 943.0, Loss: -943.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 18.70it/s]


 tensor([10.5730])
Epoch: 859, Step: 1, Reward: 897.0, Loss: -897.0


0it [00:00, ?it/s]

step length:  tensor([10.0369])
Epoch: 860, Step: 1, Reward: 1338.0, Loss: -1338.0


1it [00:00, 15.91it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7530])
Epoch: 861, Step: 1, Reward: 860.0, Loss: -860.0


1it [00:00, 20.19it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3501])
Epoch: 862, Step: 1, Reward: 1001.0, Loss: -1001.0


1it [00:00, 15.16it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4304])


1it [00:00, 18.90it/s]


Epoch: 863, Step: 1, Reward: 1063.0, Loss: -1063.0


0it [00:00, ?it/s]

step length:  tensor([10.3770])
Epoch: 864, Step: 1, Reward: 1247.0, Loss: -1247.0


1it [00:00, 21.79it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6824])
Epoch: 865, Step: 1, Reward: 899.0, Loss: -899.0


1it [00:00, 18.86it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9086])


1it [00:00, 19.09it/s]


Epoch: 866, Step: 1, Reward: 1095.0, Loss: -1095.0


0it [00:00, ?it/s]

step length:  tensor([10.8250])


1it [00:00, 20.20it/s]


Epoch: 867, Step: 1, Reward: 725.0, Loss: -725.0


0it [00:00, ?it/s]

step length:  tensor([10.4111])


1it [00:00, 15.91it/s]


Epoch: 868, Step: 1, Reward: 1020.0, Loss: -1020.0


0it [00:00, ?it/s]

step length:  tensor([10.2592])


1it [00:00, 21.83it/s]


Epoch: 869, Step: 1, Reward: 1011.0, Loss: -1011.0


0it [00:00, ?it/s]

step length:  tensor([10.9734])


1it [00:00, 17.54it/s]


Epoch: 870, Step: 1, Reward: 866.0, Loss: -866.0


0it [00:00, ?it/s]

step length:  tensor([10.1691])
Epoch: 871, Step: 1, Reward: 796.0, Loss: -796.0


1it [00:00, 19.57it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9679])
Epoch: 872, Step: 1, Reward: 1059.0, Loss: -1059.0


1it [00:00, 23.42it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1467])
Epoch: 873, Step: 1, Reward: 876.0, Loss: -876.0


1it [00:00, 15.14it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5045])
Epoch: 874, Step: 1, Reward: 1033.0, Loss: -1033.0


1it [00:00, 19.66it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1179])
Epoch: 875, Step: 1, Reward: 1047.0, Loss: -1047.0


1it [00:00, 13.72it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0137])


1it [00:00, 21.24it/s]


Epoch: 876, Step: 1, Reward: 884.0, Loss: -884.0


0it [00:00, ?it/s]

step length:  tensor([9.9038])
Epoch: 877, Step: 1, Reward: 806.0, Loss: -806.0


1it [00:00, 23.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4157])
Epoch: 878, Step: 1, Reward: 990.0, Loss: -990.0


1it [00:00, 17.54it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7957])
Epoch: 879, Step: 1, Reward: 870.0, Loss: -870.0


1it [00:00, 21.52it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4924])


1it [00:00, 20.32it/s]


Epoch: 880, Step: 1, Reward: 1052.0, Loss: -1052.0


0it [00:00, ?it/s]

step length:  tensor([9.6539])
Epoch: 881, Step: 1, Reward: 914.0, Loss: -914.0


1it [00:00, 15.69it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1415])
Epoch: 882, Step: 1, Reward: 1252.0, Loss: -1252.0


1it [00:00, 15.44it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8937])


1it [00:00, 20.23it/s]


Epoch: 883, Step: 1, Reward: 1056.0, Loss: -1056.0


0it [00:00, ?it/s]

step length:  tensor([11.0245])
Epoch: 884, Step: 1, Reward: 1043.0, Loss: -1043.0


1it [00:00, 18.90it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8460])
Epoch: 885, Step: 1, Reward: 1143.0, Loss: -1143.0


1it [00:00, 20.38it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0792])


1it [00:00, 20.07it/s]


Epoch: 886, Step: 1, Reward: 1065.0, Loss: -1065.0


0it [00:00, ?it/s]

step length:  tensor([10.6580])


1it [00:00, 21.42it/s]


Epoch: 887, Step: 1, Reward: 943.0, Loss: -943.0


0it [00:00, ?it/s]

step length:  tensor([10.0859])
Epoch: 888, Step: 1, Reward: 1044.0, Loss: -1044.0


1it [00:00, 14.68it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6723])


1it [00:00, 21.00it/s]


Epoch: 889, Step: 1, Reward: 931.0, Loss: -931.0


0it [00:00, ?it/s]

step length:  tensor([10.9459])


1it [00:00, 17.41it/s]


Epoch: 890, Step: 1, Reward: 866.0, Loss: -866.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 18.00it/s]

 tensor([10.6617])
Epoch: 891, Step: 1, Reward: 1013.0, Loss: -1013.0



0it [00:00, ?it/s]

step length:  tensor([10.7202])
Epoch: 892, Step: 1, Reward: 1175.0, Loss: -1175.0


1it [00:00, 19.76it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7772])


1it [00:00, 17.07it/s]


Epoch: 893, Step: 1, Reward: 981.0, Loss: -981.0


0it [00:00, ?it/s]

step length:  tensor([9.9715])
Epoch: 894, Step: 1, Reward: 977.0, Loss: -977.0


1it [00:00, 18.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5625])
Epoch: 895, Step: 1, Reward: 1235.0, Loss: -1235.0


1it [00:00, 19.06it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1595])
Epoch: 896, Step: 1, Reward: 1059.0, Loss: -1059.0


1it [00:00, 20.79it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7900])
Epoch: 897, Step: 1, Reward: 1100.0, Loss: -1100.0


1it [00:00, 15.12it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3018])
Epoch: 898, Step: 1, Reward: 1002.0, Loss: -1002.0


1it [00:00, 20.15it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2602])


1it [00:00, 19.68it/s]


Epoch: 899, Step: 1, Reward: 820.0, Loss: -820.0


0it [00:00, ?it/s]

step length:  tensor([10.1523])


1it [00:00, 14.96it/s]


Epoch: 900, Step: 1, Reward: 829.0, Loss: -829.0


0it [00:00, ?it/s]

step length:  tensor([10.7598])


1it [00:00, 18.33it/s]


Epoch: 901, Step: 1, Reward: 1364.0, Loss: -1364.0


0it [00:00, ?it/s]

step length:  tensor([10.0038])
Epoch: 902, Step: 1, Reward: 914.0, Loss: -914.0


1it [00:00, 18.46it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0650])
Epoch: 903, Step: 1, Reward: 1143.0, Loss: -1143.0


1it [00:00, 18.35it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5615])


1it [00:00, 18.91it/s]


Epoch: 904, Step: 1, Reward: 1226.0, Loss: -1226.0


0it [00:00, ?it/s]

step length:  tensor([10.2522])
Epoch: 905, Step: 1, Reward: 1205.0, Loss: -1205.0


1it [00:00, 15.47it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7129])
Epoch: 906, Step: 1, Reward: 1093.0, Loss: -1093.0


1it [00:00, 18.85it/s]
0it [00:00, ?it/s]

step length:  tensor([9.5374])
Epoch: 907, Step: 1, Reward: 817.0, Loss: -817.0


1it [00:00, 17.56it/s]
0it [00:00, ?it/s]

step length:  tensor([9.5705])
Epoch: 908, Step: 1, Reward: 1226.0, Loss: -1226.0


1it [00:00, 19.03it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3259])


1it [00:00, 20.31it/s]


Epoch: 909, Step: 1, Reward: 749.0, Loss: -749.0


0it [00:00, ?it/s]

step length:  tensor([10.5905])
Epoch: 910, Step: 1, Reward: 899.0, Loss: -899.0


1it [00:00, 18.46it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5936])


1it [00:00, 15.93it/s]


Epoch: 911, Step: 1, Reward: 1065.0, Loss: -1065.0


0it [00:00, ?it/s]

step length:  tensor([10.3394])


1it [00:00, 19.42it/s]


Epoch: 912, Step: 1, Reward: 757.0, Loss: -757.0


0it [00:00, ?it/s]

step length:  tensor([10.0638])
Epoch: 913, Step: 1, Reward: 1133.0, Loss: -1133.0


1it [00:00, 18.44it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9981])
Epoch: 914, Step: 1, Reward: 952.0, Loss: -952.0


1it [00:00, 20.43it/s]
0it [00:00, ?it/s]

step length:  tensor([10.3937])


1it [00:00, 15.47it/s]


Epoch: 915, Step: 1, Reward: 746.0, Loss: -746.0


0it [00:00, ?it/s]

step length:  tensor([10.4377])
Epoch: 916, Step: 1, Reward: 858.0, Loss: -858.0


1it [00:00, 21.44it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8084])
Epoch: 917, Step: 1, Reward: 1009.0, Loss: -1009.0


1it [00:00, 21.27it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8489])


1it [00:00, 15.26it/s]


Epoch: 918, Step: 1, Reward: 984.0, Loss: -984.0


0it [00:00, ?it/s]

step length:  tensor([9.8101])
Epoch: 919, Step: 1, Reward: 919.0, Loss: -919.0


1it [00:00, 20.75it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5735])


1it [00:00, 16.31it/s]


Epoch: 920, Step: 1, Reward: 801.0, Loss: -801.0


0it [00:00, ?it/s]

step length:  tensor([10.1221])
Epoch: 921, Step: 1, Reward: 843.0, Loss: -843.0


1it [00:00, 27.17it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9503])
Epoch: 922, Step: 1, Reward: 883.0, Loss: -883.0


1it [00:00, 20.02it/s]
0it [00:00, ?it/s]

step length:  tensor([10.9583])
Epoch: 923, Step: 1, Reward: 966.0, Loss: -966.0


1it [00:00, 15.77it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0324])
Epoch: 924, Step: 1, Reward: 915.0, Loss: -915.0


1it [00:00, 19.04it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0761])
Epoch: 925, Step: 1, Reward: 784.0, Loss: -784.0


1it [00:00, 31.88it/s]
1it [00:00, 19.98it/s]


step length:  tensor([9.6771])
Epoch: 926, Step: 1, Reward: 956.0, Loss: -956.0


0it [00:00, ?it/s]

step length:  tensor([10.3791])
Epoch: 927, Step: 1, Reward: 993.0, Loss: -993.0


1it [00:00, 19.53it/s]
0it [00:00, ?it/s]

step length:  tensor([10.0821])


1it [00:00, 18.23it/s]


Epoch: 928, Step: 1, Reward: 1083.0, Loss: -1083.0


1it [00:00, 22.93it/s]


step length:  tensor([10.9449])
Epoch: 929, Step: 1, Reward: 1014.0, Loss: -1014.0


0it [00:00, ?it/s]

step length:  tensor([10.6414])
Epoch: 930, Step: 1, Reward: 990.0, Loss: -990.0


1it [00:00, 14.94it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4721])


1it [00:00, 19.51it/s]


Epoch: 931, Step: 1, Reward: 732.0, Loss: -732.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 17.58it/s]


tensor([10.5233])
Epoch: 932, Step: 1, Reward: 863.0, Loss: -863.0


0it [00:00, ?it/s]

step length:  tensor([11.2105])


1it [00:00, 13.74it/s]


Epoch: 933, Step: 1, Reward: 938.0, Loss: -938.0


0it [00:00, ?it/s]

step length:  tensor([10.4031])
Epoch: 934, Step: 1, Reward: 856.0, Loss: -856.0


1it [00:00, 19.97it/s]
0it [00:00, ?it/s]

step length:  tensor([10.6983])
Epoch: 935, Step: 1, Reward: 1344.0, Loss: -1344.0


1it [00:00, 18.63it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9647])
Epoch: 936, Step: 1, Reward: 1505.0, Loss: -1505.0


1it [00:00, 18.19it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7937])
Epoch: 937, Step: 1, Reward: 1043.0, Loss: -1043.0


1it [00:00, 20.74it/s]
1it [00:00, 18.56it/s]

step length:  tensor([10.4471])
Epoch: 938, Step: 1, Reward: 1181.0, Loss: -1181.0



0it [00:00, ?it/s]

step length:  tensor([10.5955])
Epoch: 939, Step: 1, Reward: 1021.0, Loss: -1021.0


1it [00:00, 21.20it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0249])
Epoch: 940, Step: 1, Reward: 823.0, Loss: -823.0


1it [00:00, 25.87it/s]
1it [00:00, 20.55it/s]

step length:  tensor([10.7071])
Epoch: 941, Step: 1, Reward: 1177.0, Loss: -1177.0



0it [00:00, ?it/s]

step length:  tensor([10.6345])
Epoch: 942, Step: 1, Reward: 1043.0, Loss: -1043.0


1it [00:00, 26.46it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7102])
Epoch: 943, Step: 1, Reward: 1083.0, Loss: -1083.0


1it [00:00, 19.71it/s]
0it [00:00, ?it/s]

step length:  tensor([11.1094])
Epoch: 944, Step: 1, Reward: 1228.0, Loss: -1228.0


1it [00:00, 19.11it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7353])
Epoch: 945, Step: 1, Reward: 876.0, Loss: -876.0


1it [00:00, 20.87it/s]
1it [00:00, 18.45it/s]


step length:  tensor([9.9715])
Epoch: 946, Step: 1, Reward: 1390.0, Loss: -1390.0


0it [00:00, ?it/s]

step length:  tensor([10.2201])


1it [00:00, 19.87it/s]


Epoch: 947, Step: 1, Reward: 959.0, Loss: -959.0


0it [00:00, ?it/s]

step length:  tensor([10.9456])
Epoch: 948, Step: 1, Reward: 1061.0, Loss: -1061.0


1it [00:00, 19.68it/s]
0it [00:00, ?it/s]

step length:  tensor([9.6666])
Epoch: 949, Step: 1, Reward: 683.0, Loss: -683.0


1it [00:00, 20.21it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1042])


1it [00:00, 19.78it/s]


Epoch: 950, Step: 1, Reward: 1217.0, Loss: -1217.0


0it [00:00, ?it/s]

step length:  tensor([10.5467])
Epoch: 951, Step: 1, Reward: 1082.0, Loss: -1082.0


1it [00:00, 15.19it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9934])


1it [00:00, 18.30it/s]


Epoch: 952, Step: 1, Reward: 936.0, Loss: -936.0


0it [00:00, ?it/s]

step length:  tensor([10.2488])


1it [00:00, 22.14it/s]


Epoch: 953, Step: 1, Reward: 1030.0, Loss: -1030.0


0it [00:00, ?it/s]

step length:  tensor([10.3674])
Epoch: 954, Step: 1, Reward: 1045.0, Loss: -1045.0


1it [00:00, 19.74it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2498])
Epoch: 955, Step: 1, Reward: 896.0, Loss: -896.0


1it [00:00, 20.34it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1355])
Epoch: 956, Step: 1, Reward: 1072.0, Loss: -1072.0


1it [00:00, 20.14it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4809])
Epoch: 957, Step: 1, Reward: 794.0, Loss: -794.0


1it [00:00, 16.92it/s]
0it [00:00, ?it/s]

step length:  

1it [00:00, 20.96it/s]


tensor([10.3887])
Epoch: 958, Step: 1, Reward: 983.0, Loss: -983.0


0it [00:00, ?it/s]

step length:  tensor([10.4163])
Epoch: 959, Step: 1, Reward: 650.0, Loss: -650.0


1it [00:00, 19.93it/s]
1it [00:00, 20.60it/s]

step length:  tensor([9.6653])
Epoch: 960, Step: 1, Reward: 694.0, Loss: -694.0



0it [00:00, ?it/s]

step length:  tensor([10.5942])
Epoch: 961, Step: 1, Reward: 753.0, Loss: -753.0


1it [00:00, 18.36it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7582])
Epoch: 962, Step: 1, Reward: 960.0, Loss: -960.0


1it [00:00, 20.45it/s]
0it [00:00, ?it/s]

step length:  tensor([9.5476])


1it [00:00, 20.88it/s]


Epoch: 963, Step: 1, Reward: 985.0, Loss: -985.0


1it [00:00, 21.49it/s]


step length:  tensor([11.5095])
Epoch: 964, Step: 1, Reward: 1282.0, Loss: -1282.0


0it [00:00, ?it/s]

step length:  tensor([10.8278])
Epoch: 965, Step: 1, Reward: 895.0, Loss: -895.0


1it [00:00, 14.44it/s]
0it [00:00, ?it/s]

step length:  tensor([9.8521])


1it [00:00, 22.21it/s]


Epoch: 966, Step: 1, Reward: 854.0, Loss: -854.0


0it [00:00, ?it/s]

step length:  tensor([10.9313])


1it [00:00, 18.20it/s]


Epoch: 967, Step: 1, Reward: 1178.0, Loss: -1178.0


0it [00:00, ?it/s]

step length:  tensor([10.4682])
Epoch: 968, Step: 1, Reward: 950.0, Loss: -950.0


1it [00:00, 24.35it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7839])
Epoch: 969, Step: 1, Reward: 1094.0, Loss: -1094.0


1it [00:00, 15.11it/s]
1it [00:00, 18.16it/s]


step length:  tensor([10.4980])
Epoch: 970, Step: 1, Reward: 1069.0, Loss: -1069.0


0it [00:00, ?it/s]

step length:  tensor([10.3831])
Epoch: 971, Step: 1, Reward: 1003.0, Loss: -1003.0


1it [00:00, 15.47it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4620])


1it [00:00, 21.14it/s]


Epoch: 972, Step: 1, Reward: 765.0, Loss: -765.0


0it [00:00, ?it/s]

step length:  tensor([10.0438])


1it [00:00, 14.42it/s]


Epoch: 973, Step: 1, Reward: 1418.0, Loss: -1418.0


0it [00:00, ?it/s]

step length:  tensor([10.5387])
Epoch: 974, Step: 1, Reward: 925.0, Loss: -925.0


1it [00:00, 20.31it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7778])
Epoch: 975, Step: 1, Reward: 951.0, Loss: -951.0


1it [00:00, 21.08it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9869])
Epoch: 976, Step: 1, Reward: 832.0, Loss: -832.0


1it [00:00, 18.62it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4960])
Epoch: 977, Step: 1, Reward: 961.0, Loss: -961.0


1it [00:00, 17.86it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5276])
Epoch: 978, Step: 1, Reward: 1014.0, Loss: -1014.0


1it [00:00, 19.56it/s]
0it [00:00, ?it/s]

step length:  tensor([10.1187])
Epoch: 979, Step: 1, Reward: 1118.0, Loss: -1118.0


1it [00:00, 15.95it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5537])
Epoch: 980, Step: 1, Reward: 864.0, Loss: -864.0


1it [00:00, 23.61it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5488])
Epoch: 981, Step: 1, Reward: 716.0, Loss: -716.0


1it [00:00, 20.43it/s]
0it [00:00, ?it/s]

step length:  tensor([9.4919])
Epoch: 982, Step: 1, Reward: 1079.0, Loss: -1079.0


1it [00:00, 18.95it/s]
1it [00:00, 23.74it/s]


step length:  tensor([10.0996])
Epoch: 983, Step: 1, Reward: 1058.0, Loss: -1058.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 24.13it/s]


 tensor([10.4936])
Epoch: 984, Step: 1, Reward: 861.0, Loss: -861.0


0it [00:00, ?it/s]

step length:  tensor([10.2735])
Epoch: 985, Step: 1, Reward: 1131.0, Loss: -1131.0


1it [00:00, 16.58it/s]
0it [00:00, ?it/s]

step length:  tensor([9.7232])
Epoch: 986, Step: 1, Reward: 1248.0, Loss: -1248.0


1it [00:00, 21.24it/s]
0it [00:00, ?it/s]

step length:  tensor([10.5657])


1it [00:00, 13.58it/s]


Epoch: 987, Step: 1, Reward: 1003.0, Loss: -1003.0


0it [00:00, ?it/s]

step length:  tensor([10.9553])
Epoch: 988, Step: 1, Reward: 970.0, Loss: -970.0


1it [00:00, 17.91it/s]
0it [00:00, ?it/s]

step length:  tensor([11.0676])
Epoch: 989, Step: 1, Reward: 975.0, Loss: -975.0


1it [00:00, 19.87it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4227])


1it [00:00, 20.38it/s]


Epoch: 990, Step: 1, Reward: 1015.0, Loss: -1015.0


0it [00:00, ?it/s]

step length:  tensor([9.9266])
Epoch: 991, Step: 1, Reward: 982.0, Loss: -982.0


1it [00:00, 15.98it/s]
0it [00:00, ?it/s]

step length:  tensor([10.4330])
Epoch: 992, Step: 1, Reward: 614.0, Loss: -614.0


1it [00:00, 26.82it/s]
0it [00:00, ?it/s]

step length:  tensor([9.9120])


1it [00:00, 14.12it/s]


Epoch: 993, Step: 1, Reward: 776.0, Loss: -776.0


0it [00:00, ?it/s]

step length:  tensor([10.1332])
Epoch: 994, Step: 1, Reward: 1143.0, Loss: -1143.0


1it [00:00, 17.37it/s]
0it [00:00, ?it/s]

step length:  tensor([10.7365])


1it [00:00, 18.35it/s]


Epoch: 995, Step: 1, Reward: 986.0, Loss: -986.0


0it [00:00, ?it/s]

step length:  tensor([10.1757])
Epoch: 996, Step: 1, Reward: 831.0, Loss: -831.0


1it [00:00, 20.64it/s]
0it [00:00, ?it/s]

step length:  tensor([10.2003])
Epoch: 997, Step: 1, Reward: 945.0, Loss: -945.0


1it [00:00, 21.17it/s]
0it [00:00, ?it/s]

step length:  tensor([10.8824])


1it [00:00, 14.97it/s]


Epoch: 998, Step: 1, Reward: 1166.0, Loss: -1166.0


0it [00:00, ?it/s]

step length:  

1it [00:00, 20.09it/s]


tensor([10.3775])
Epoch: 999, Step: 1, Reward: 946.0, Loss: -946.0


0it [00:00, ?it/s]

step length: 

1it [00:00, 14.17it/s]

 tensor([10.1065])
Epoch: 1000, Step: 1, Reward: 1031.0, Loss: -1031.0
Training completed.





In [None]:
# 테스트
import time

num_cities_test = 100  # 테스트할 도시 수
max_reward_test = 100  # 최대 보상 값
max_distance_test = 10  # 최대 이동 가능 거리
hidden_dim_test = 256  # 히든 디멘션

# 새로운 환경 생성
env_test = PCTSPEnvironment(num_cities_test, max_reward_test, max_distance_test)

# 랜덤 좌표 생성
test_coordinates = torch.rand((num_cities_test, 2))

# 모델과 데이터셋 준비
test_dataset = TensorDataset(test_coordinates.unsqueeze(0))
test_dataloader = DataLoader(test_dataset, batch_size=1)

# 모델과 크리틱 생성
test_model = PtrNet(hidden_dim=hidden_dim_test)
test_critic = Critic(hidden_dim=hidden_dim_test)

# 모델과 크리틱의 상태를 CPU로 이동
test_model.cpu()
test_critic.cpu()

# 모델 및 크리틱의 파라미터 불러오기
test_model.load_state_dict(model.state_dict())
test_critic.load_state_dict(critic.state_dict())

# 모델 및 크리틱을 평가 모드로 설정
test_model.eval()
test_critic.eval()

# 테스트 루프
total_distances = []
total_rewards = []
start_time = time.time()  # 시작 시간 기록

for i, s_i_test in tqdm(enumerate(test_dataloader)):
    s_i_test = s_i_test[0]

    # 모델을 통한 예측
    total_reward_test, pi_test = test_model(s_i_test, env_test.city_rewards, max_distance_test)
    total_distance_test = test_model.get_length(s_i_test, pi_test)

    # 결과 기록
    total_distances.append(total_distance_test.item())
    total_rewards.append(total_reward_test.item())

end_time = time.time()  # 종료 시간 기록
calculation_time = end_time - start_time  # 계산에 걸린 시간 계산

# 결과 출력
print(f"Total distance of the path: {sum(total_distances)}")
print(f"Total reward collected: {sum(total_rewards)}")
print(f"Calculation Time: {calculation_time} seconds")

Random Data for Current Episode:
City Coordinates:
 tensor([[4.1113e-02, 2.7530e-01],
        [3.3297e-01, 9.0903e-01],
        [1.3806e-01, 8.5710e-01],
        [6.0656e-01, 3.1280e-01],
        [1.2300e-01, 8.9359e-01],
        [9.7994e-01, 3.3151e-01],
        [6.0642e-01, 3.6744e-02],
        [6.8014e-01, 6.8255e-01],
        [3.1067e-01, 7.9592e-01],
        [1.7952e-01, 4.6431e-01],
        [3.5741e-01, 8.0906e-01],
        [9.1592e-01, 2.8179e-01],
        [1.3437e-02, 7.5215e-01],
        [4.4005e-01, 2.4348e-01],
        [2.2654e-02, 4.2046e-02],
        [9.5371e-01, 6.5710e-01],
        [2.8128e-01, 3.2654e-01],
        [4.5201e-01, 9.5240e-01],
        [9.2409e-01, 5.5540e-01],
        [6.6416e-01, 6.7016e-01],
        [9.0091e-01, 8.6161e-01],
        [5.2090e-01, 2.2549e-01],
        [6.2680e-01, 5.2165e-01],
        [5.1865e-01, 3.6116e-01],
        [3.1807e-01, 8.7791e-01],
        [5.4913e-01, 1.6708e-02],
        [9.4732e-01, 6.6145e-01],
        [6.2318e-01, 8.5188e-0

0it [00:00, ?it/s]

1it [00:00, 66.63it/s]

step length:  tensor([9.8192])
Total distance of the path: 9.819174766540527
Total reward collected: 1176.0
Calculation Time: 0.017009258270263672 seconds





In [None]:
values = [
    40, 55, 60, 76, 42, 36, 9, 2, 51, 23, 33, 78, 27, 41, 84, 51, 6, 76,
    38, 57, 71, 32, 98, 16, 99, 31, 46, 52, 56, 10, 2, 82, 37, 44, 23, 12,
    33, 86, 57, 58, 22, 85, 38, 34, 64, 42, 89, 50, 54, 77, 95, 12, 90, 11,
    52, 87, 47, 93, 71, 69, 3, 86, 21, 9, 86, 29, 65, 80, 11, 80, 61, 22,
    38, 93, 77, 91, 26, 7, 39, 2, 46, 16, 66, 54, 92, 4, 32, 84, 80, 73,
    41, 56, 45, 84, 20, 52, 61, 30, 33, 3
]

total_sum = sum(values)
print("리스트의 총합:", total_sum)


리스트의 총합: 4910
