In [2]:
!pip3 install highway_env


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [3]:
import gymnasium
import highway_env
import numpy as np
import pprint

import torch
import torch.nn as nn
import torch.nn.functional as F
import copy



In [None]:


env = gymnasium.make("highway-v0", render_mode='rgb_array')
# pprint.pprint(env.unwrapped.config)
env.unwrapped.config["lanes_count"] = 3
env.unwrapped.config["duration"] = 10
env.unwrapped.config["vehicles_density"] = 2 # 3
env.unwrapped.config["vehicles_count"] = 10

env.unwrapped.config["action"]["type"] = "DiscreteAction"
ACTION_SIZE = 9
# ACTION_SIZE = 5
observation = {
        "type": "Kinematics",
        "vehicles_count": 5,
        "features": ["x", "y", "vx", "vy", "cos_h"],
        "features_range": {
            "x": [-100, 100],
            "y": [-100, 100],
            "vx": [-20, 20],
            "vy": [-20, 20]
        },
        "absolute": True,
        # "absolute": True,
        "order": "sorted"
    }
env.unwrapped.config["observation"] = observation



In [29]:

class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_size=50):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.layer_1 = nn.Linear(input_size, hidden_size)
        self.norm_1 = nn.BatchNorm1d(hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.norm_2 = nn.BatchNorm1d(hidden_size)
        self.layer_3 = nn.Linear(hidden_size, output_size)
        self.norm_3 = nn.BatchNorm1d(output_size)

    def forward(self, obs, batch_size=1):
        batch_norm_on = batch_size != 1
        if obs is None:
            retval = torch.tensor([[1/self.output_size] * self.output_size] * batch_size)
            return retval
        
       
        x = torch.tensor(obs)
        x = x.view(-1, self.input_size)


        x = self.layer_1(x)
        if batch_norm_on:
            x = self.norm_1(x)
        x = F.sigmoid(x)
        x = self.layer_2(x)
        if batch_norm_on:
            x = self.norm_2(x)
        x = F.sigmoid(x)
        x = self.layer_3(x)
        if batch_norm_on:
            x = self.norm_3(x)
        x = F.softmax(x, dim=1)
        
        return x
    


In [30]:
import random


class ReplayMemory(object):
    def __init__(self, capacity=1000):
        self.capacity = capacity
        self.memory = []

    def push(self, *args):
        """Save a transition"""
        self.memory.append(args)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        batch = random.sample(self.memory, batch_size)
        states = []
        actions = []
        rewards = []
        for i, memory in enumerate(batch):
            states.append(torch.tensor(memory[0]))
            actions.append(torch.tensor(memory[1]))
            rewards.append(torch.tensor(memory[2]))
        states = torch.stack(states)
        actions = torch.stack(actions)
        rewards = torch.stack(rewards)
            
        return states, actions, rewards

    def __len__(self):
        return len(self.memory)

In [46]:
def select_action(obs, policy_net):
    if obs is None:
        return 0
    policy = policy_net(obs)
    policy = policy.detach().numpy().flatten()
    return np.random.choice(len(policy), p=policy)
    # policy.numpy()
    # return torch.argmax(policy, dim=1).item()

def optimize_model(policy_net, value_net, memory, BATCH_SIZE, GAMMA, optimizer):

    if len(memory) < BATCH_SIZE:
        return
    # print('learn')
    state_batch, action_batch, reward_batch = memory.sample(BATCH_SIZE)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net

    state_action_values = policy_net(state_batch, batch_size=BATCH_SIZE).gather(1, action_batch.unsqueeze(1))
    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1).values
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.

    next_state_values = torch.max(value_net(state_batch, batch_size=BATCH_SIZE), dim=1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    print(f'Loss: {loss:.2f}')
    # [batch_size, 1, 1]

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
    


In [48]:
import time
OBS_SIZE = env.unwrapped.config["observation"]["vehicles_count"] * len(env.unwrapped.config["observation"]["features"])
env.unwrapped.config["duration"] = 20

EPOCHS = 100
batch_size = 16
GAMMA = 0.99
TAU = 0.005
LR = 1e-2

policy_net = Model(OBS_SIZE, ACTION_SIZE)
value_net = Model(OBS_SIZE, ACTION_SIZE)
optimizer = torch.optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)


all_parameters = list(policy_net.parameters()) + list(value_net.parameters())
optimizer_random = torch.optim.Adam(all_parameters, lr=0.1)

env.reset()
# while(True):
#     env.render()

memory = ReplayMemory()
for epoch in range(EPOCHS):
    print(f'epoch: {epoch}')
    env.reset()
    obs = None
    done = truncated = False
    
    total_reward = 0
    while not (done or truncated):
        action = select_action(obs, policy_net)
        next_obs, reward, done, truncated, info = env.step(action)

        if info["rewards"]["on_road_reward"] == 0:
            pass# reward -= 10
        # "features": ["x", "y", "vx", "vy", "cos_h"],
        # reward += next_obs[0][1] * 10
        if done or truncated:
            break
        if obs is not None:
            # print(f'sub: {abs(next_obs[0][4] - obs[0][4])}')
            # reward -= abs(next_obs[0][4] - obs[0][4]) * 0.2
            memory.push(obs, action, reward)
        obs = next_obs
        
        total_reward += reward
        # print(total_reward)
        
        if epoch < 5: # 5
            target = torch.Tensor([[1 / ACTION_SIZE] * ACTION_SIZE])
            policy = policy_net(obs)
            values = value_net(obs)
            criterion = torch.nn.CrossEntropyLoss()
            loss_policy = criterion(policy, target)
            
            loss_values = criterion(policy, values)
            loss = loss_policy + loss_values
            optimizer_random.zero_grad()
            loss.backward()
            optimizer.step()

            

        else:
            print(f'policy: {policy_net(obs)}')
            print(f'value: {value_net(obs)}')
            optimize_model(policy_net, value_net, memory, batch_size, GAMMA, optimizer)

            value_net_state_dict = value_net.state_dict()
            policy_net_state_dict = policy_net.state_dict()
            for key in policy_net_state_dict:
                value_net_state_dict[key] = value_net_state_dict[key]*TAU + value_net_state_dict[key]*(1-TAU)
            value_net.load_state_dict(value_net_state_dict)
            env.render()




epoch: 0
epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
policy: tensor([[0.1246, 0.1493, 0.4039, 0.2568, 0.0653]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
policy: tensor([[0.1245, 0.1493, 0.4040, 0.2568, 0.0653]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
epoch: 6
policy: tensor([[0.1247, 0.1493, 0.4038, 0.2569, 0.0653]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
policy: tensor([[0.1246, 0.1492, 0.4041, 0.2568, 0.0653]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
epoch: 7
policy: tensor([[0.1247, 0.1493, 0.4038, 0.2568, 0.0653]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
policy: tensor([[0.1246, 0.1493, 0.4041, 0.2567, 0.0653]], grad

  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1181, 0.1459, 0.3884, 0.2802, 0.0674]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
epoch: 8
policy: tensor([[0.1133, 0.1429, 0.3921, 0.2849, 0.0667]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2960, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1122, 0.1406, 0.3965, 0.2844, 0.0663]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
policy: tensor([[0.1123, 0.1389, 0.4005, 0.2833, 0.0650]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1858, 0.2967, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 9
policy: tensor([[0.1112, 0.1357, 0.4044, 0.2857, 0.0630]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1592, 0.1862, 0.2964, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
epoch: 10
policy: tensor([[0.1093, 0.1336, 0.4064, 0.2897, 0.0609]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.24


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 11
policy: tensor([[0.1071, 0.1327, 0.4063, 0.2945, 0.0594]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
policy: tensor([[0.1053, 0.1322, 0.4055, 0.2986, 0.0584]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1044, 0.1321, 0.4053, 0.3002, 0.0580]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1858, 0.2964, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31
policy: tensor([[0.1036, 0.1311, 0.4038, 0.3038, 0.0577]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 12
policy: tensor([[0.1028, 0.1306, 0.4042, 0.3049, 0.0575]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
epoch: 13
policy: tensor([[0.1030, 0.1315, 0.4009, 0.3074, 0.0571]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2963, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1032, 0.1329, 0.3967, 0.3097, 0.0574]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
policy: tensor([[0.1024, 0.1326, 0.3923, 0.3155, 0.0571]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.27


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 14
policy: tensor([[0.1024, 0.1335, 0.3895, 0.3175, 0.0571]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
policy: tensor([[0.1019, 0.1343, 0.3859, 0.3211, 0.0568]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1017, 0.1357, 0.3835, 0.3228, 0.0563]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2958, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
policy: tensor([[0.1015, 0.1363, 0.3837, 0.3225, 0.0559]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2960, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.23


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1010, 0.1375, 0.3831, 0.3230, 0.0555]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2960, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
epoch: 15
policy: tensor([[0.1005, 0.1387, 0.3823, 0.3237, 0.0548]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2964, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 16
policy: tensor([[0.0994, 0.1390, 0.3795, 0.3280, 0.0541]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.24
policy: tensor([[0.0990, 0.1390, 0.3775, 0.3305, 0.0540]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2962, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0986, 0.1402, 0.3765, 0.3315, 0.0532]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
policy: tensor([[0.0983, 0.1409, 0.3727, 0.3354, 0.0526]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 17
policy: tensor([[0.0984, 0.1395, 0.3736, 0.3365, 0.0520]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.23
policy: tensor([[0.0978, 0.1390, 0.3710, 0.3409, 0.0512]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2959, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 18
policy: tensor([[0.0990, 0.1370, 0.3678, 0.3456, 0.0506]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2962, 0.2116, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.26
epoch: 19
policy: tensor([[0.0997, 0.1379, 0.3672, 0.3453, 0.0500]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.23


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1000, 0.1388, 0.3667, 0.3449, 0.0497]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.26
policy: tensor([[0.1000, 0.1387, 0.3694, 0.3426, 0.0494]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.1004, 0.1386, 0.3676, 0.3447, 0.0487]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
epoch: 20
policy: tensor([[0.1005, 0.1382, 0.3696, 0.3435, 0.0481]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0996, 0.1393, 0.3731, 0.3403, 0.0477]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
policy: tensor([[0.0986, 0.1395, 0.3783, 0.3366, 0.0470]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 21
policy: tensor([[0.0978, 0.1367, 0.3829, 0.3366, 0.0459]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.22
policy: tensor([[0.0983, 0.1393, 0.3818, 0.3342, 0.0464]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2958, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 22
policy: tensor([[0.0964, 0.1377, 0.3841, 0.3369, 0.0449]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
policy: tensor([[0.0951, 0.1392, 0.3836, 0.3374, 0.0447]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2960, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.26


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0939, 0.1406, 0.3858, 0.3356, 0.0442]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
policy: tensor([[0.0936, 0.1396, 0.3875, 0.3353, 0.0440]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0923, 0.1401, 0.3885, 0.3356, 0.0435]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
policy: tensor([[0.0920, 0.1429, 0.3873, 0.3340, 0.0438]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.21


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0913, 0.1424, 0.3882, 0.3347, 0.0434]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.27
policy: tensor([[0.0906, 0.1423, 0.3882, 0.3357, 0.0431]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0899, 0.1424, 0.3867, 0.3382, 0.0428]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2119, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
policy: tensor([[0.0906, 0.1449, 0.3813, 0.3397, 0.0435]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 23
policy: tensor([[0.0913, 0.1457, 0.3727, 0.3466, 0.0436]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
policy: tensor([[0.0922, 0.1470, 0.3667, 0.3502, 0.0439]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2959, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0922, 0.1478, 0.3621, 0.3531, 0.0449]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2955, 0.2120, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
policy: tensor([[0.0940, 0.1522, 0.3480, 0.3611, 0.0448]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 24
policy: tensor([[0.0929, 0.1508, 0.3579, 0.3543, 0.0441]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2122, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
policy: tensor([[0.0928, 0.1523, 0.3545, 0.3560, 0.0444]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 25
policy: tensor([[0.0938, 0.1533, 0.3502, 0.3582, 0.0445]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2964, 0.2118, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31
epoch: 26
policy: tensor([[0.0930, 0.1544, 0.3546, 0.3532, 0.0447]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0927, 0.1572, 0.3480, 0.3570, 0.0451]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
epoch: 27
policy: tensor([[0.0932, 0.1556, 0.3494, 0.3579, 0.0439]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2961, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.41


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 28
policy: tensor([[0.0930, 0.1561, 0.3551, 0.3520, 0.0437]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
epoch: 29
policy: tensor([[0.0945, 0.1588, 0.3504, 0.3519, 0.0444]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.27


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 30
policy: tensor([[0.0953, 0.1596, 0.3409, 0.3601, 0.0440]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.45
epoch: 31
policy: tensor([[0.0952, 0.1579, 0.3553, 0.3483, 0.0434]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2119, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0955, 0.1586, 0.3557, 0.3464, 0.0438]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
policy: tensor([[0.0961, 0.1565, 0.3569, 0.3468, 0.0437]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2964, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 32
policy: tensor([[0.0953, 0.1546, 0.3639, 0.3426, 0.0437]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
epoch: 33
policy: tensor([[0.0965, 0.1506, 0.3685, 0.3406, 0.0438]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0960, 0.1507, 0.3705, 0.3388, 0.0441]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2958, 0.2118, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
epoch: 34
policy: tensor([[0.0961, 0.1489, 0.3725, 0.3392, 0.0433]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0943, 0.1493, 0.3693, 0.3439, 0.0432]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
policy: tensor([[0.0930, 0.1480, 0.3687, 0.3471, 0.0432]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 35
policy: tensor([[0.0941, 0.1436, 0.3785, 0.3413, 0.0425]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
policy: tensor([[0.0936, 0.1428, 0.3792, 0.3417, 0.0428]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0927, 0.1400, 0.3803, 0.3447, 0.0422]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
epoch: 36
policy: tensor([[0.0940, 0.1391, 0.3761, 0.3485, 0.0423]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2960, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 37
policy: tensor([[0.0938, 0.1354, 0.3827, 0.3467, 0.0415]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2961, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
epoch: 38
policy: tensor([[0.0938, 0.1337, 0.3890, 0.3426, 0.0409]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0947, 0.1331, 0.3905, 0.3410, 0.0407]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
policy: tensor([[0.0957, 0.1323, 0.3909, 0.3408, 0.0404]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 39
policy: tensor([[0.0970, 0.1286, 0.4015, 0.3331, 0.0398]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1592, 0.1862, 0.2965, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
policy: tensor([[0.0962, 0.1296, 0.3909, 0.3442, 0.0391]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1858, 0.2965, 0.2121, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0959, 0.1295, 0.3920, 0.3437, 0.0389]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1858, 0.2966, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.25
epoch: 40
policy: tensor([[0.0969, 0.1255, 0.4013, 0.3387, 0.0377]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0962, 0.1256, 0.4032, 0.3377, 0.0373]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2962, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
epoch: 41
policy: tensor([[0.0965, 0.1243, 0.4093, 0.3328, 0.0371]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0957, 0.1257, 0.4058, 0.3358, 0.0371]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2958, 0.2119, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
policy: tensor([[0.0948, 0.1246, 0.4066, 0.3375, 0.0365]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2958, 0.2119, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 42
policy: tensor([[0.0942, 0.1233, 0.4123, 0.3343, 0.0359]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
policy: tensor([[0.0919, 0.1244, 0.4139, 0.3336, 0.0361]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 43
policy: tensor([[0.0901, 0.1232, 0.4222, 0.3290, 0.0355]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
policy: tensor([[0.0866, 0.1225, 0.4272, 0.3286, 0.0352]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.43


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0846, 0.1217, 0.4388, 0.3196, 0.0353]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.25
epoch: 44
policy: tensor([[0.0816, 0.1204, 0.4394, 0.3240, 0.0346]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 45
policy: tensor([[0.0794, 0.1195, 0.4461, 0.3210, 0.0341]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
epoch: 46
epoch: 47
policy: tensor([[0.0768, 0.1201, 0.4509, 0.3184, 0.0338]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0748, 0.1222, 0.4538, 0.3152, 0.0340]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.43
policy: tensor([[0.0741, 0.1239, 0.4508, 0.3169, 0.0343]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1859, 0.2965, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 48
policy: tensor([[0.0723, 0.1216, 0.4585, 0.3143, 0.0333]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
policy: tensor([[0.0709, 0.1236, 0.4517, 0.3202, 0.0336]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0705, 0.1219, 0.4622, 0.3120, 0.0335]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1596, 0.1865, 0.2955, 0.2118, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42
epoch: 49
policy: tensor([[0.0676, 0.1215, 0.4617, 0.3167, 0.0325]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 50
policy: tensor([[0.0665, 0.1209, 0.4627, 0.3177, 0.0321]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
epoch: 51
policy: tensor([[0.0659, 0.1218, 0.4684, 0.3118, 0.0321]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2960, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 52
policy: tensor([[0.0659, 0.1228, 0.4711, 0.3082, 0.0322]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36
epoch: 53
policy: tensor([[0.0634, 0.1211, 0.4760, 0.3087, 0.0309]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2962, 0.2121, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.45


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0637, 0.1216, 0.4778, 0.3058, 0.0311]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.45
epoch: 54
policy: tensor([[0.0650, 0.1196, 0.4947, 0.2893, 0.0314]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1864, 0.2962, 0.2117, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0647, 0.1204, 0.4944, 0.2892, 0.0314]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36
epoch: 55
policy: tensor([[0.0637, 0.1177, 0.4960, 0.2921, 0.0305]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.41


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0660, 0.1221, 0.4713, 0.3087, 0.0320]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29
epoch: 56
policy: tensor([[0.0632, 0.1188, 0.4977, 0.2898, 0.0304]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0639, 0.1205, 0.4898, 0.2948, 0.0310]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2960, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
epoch: 57
policy: tensor([[0.0639, 0.1158, 0.5117, 0.2782, 0.0304]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.47


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0640, 0.1170, 0.5161, 0.2722, 0.0308]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.41
epoch: 58
policy: tensor([[0.0620, 0.1136, 0.5433, 0.2516, 0.0296]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2960, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 59
policy: tensor([[0.0639, 0.1152, 0.5293, 0.2609, 0.0307]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
policy: tensor([[0.0642, 0.1151, 0.5316, 0.2583, 0.0307]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2965, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 60
policy: tensor([[0.0652, 0.1130, 0.5420, 0.2495, 0.0303]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.48
policy: tensor([[0.0656, 0.1108, 0.5546, 0.2388, 0.0303]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2960, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 61
policy: tensor([[0.0658, 0.1093, 0.5592, 0.2360, 0.0297]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2964, 0.2118, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40
epoch: 62
policy: tensor([[0.0671, 0.1093, 0.5598, 0.2340, 0.0298]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0699, 0.1089, 0.5592, 0.2319, 0.0301]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2959, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42
epoch: 63
policy: tensor([[0.0699, 0.1038, 0.5818, 0.2154, 0.0292]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1863, 0.2962, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0705, 0.1066, 0.5761, 0.2175, 0.0292]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36
policy: tensor([[0.0684, 0.1070, 0.5754, 0.2209, 0.0283]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0701, 0.1060, 0.5824, 0.2133, 0.0283]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2961, 0.2121, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31
policy: tensor([[0.0696, 0.1045, 0.5874, 0.2108, 0.0277]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0721, 0.1033, 0.5908, 0.2059, 0.0279]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36
policy: tensor([[0.0726, 0.1018, 0.5921, 0.2059, 0.0276]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0730, 0.0994, 0.5972, 0.2030, 0.0273]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1863, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31
epoch: 64
policy: tensor([[0.0725, 0.0983, 0.5971, 0.2050, 0.0271]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 65
policy: tensor([[0.0710, 0.0960, 0.5987, 0.2077, 0.0266]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
policy: tensor([[0.0710, 0.0970, 0.5910, 0.2142, 0.0268]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2962, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0715, 0.0960, 0.5930, 0.2130, 0.0265]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1858, 0.2965, 0.2121, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
policy: tensor([[0.0715, 0.0959, 0.5910, 0.2153, 0.0263]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2964, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0708, 0.0969, 0.5869, 0.2192, 0.0262]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
epoch: 66
policy: tensor([[0.0688, 0.0967, 0.5868, 0.2220, 0.0257]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0674, 0.0977, 0.5812, 0.2279, 0.0257]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1859, 0.2965, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.25
policy: tensor([[0.0658, 0.0957, 0.5870, 0.2264, 0.0252]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1858, 0.2965, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 67
policy: tensor([[0.0686, 0.0965, 0.5780, 0.2311, 0.0258]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1864, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42
epoch: 68
policy: tensor([[0.0683, 0.0977, 0.5705, 0.2378, 0.0257]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0682, 0.1001, 0.5662, 0.2394, 0.0260]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2959, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
epoch: 69
policy: tensor([[0.0713, 0.0983, 0.5627, 0.2414, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0685, 0.0985, 0.5668, 0.2402, 0.0260]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34
epoch: 70
policy: tensor([[0.0693, 0.0969, 0.5669, 0.2407, 0.0262]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 71
policy: tensor([[0.0675, 0.0959, 0.5732, 0.2375, 0.0259]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
epoch: 72
policy: tensor([[0.0672, 0.0964, 0.5703, 0.2399, 0.0262]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2963, 0.2119, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 73
policy: tensor([[0.0683, 0.0962, 0.5681, 0.2406, 0.0267]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2960, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
policy: tensor([[0.0676, 0.0976, 0.5677, 0.2402, 0.0269]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2958, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.29


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 74
policy: tensor([[0.0644, 0.0944, 0.5830, 0.2325, 0.0257]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
policy: tensor([[0.0653, 0.0940, 0.5678, 0.2463, 0.0266]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1859, 0.2964, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0645, 0.0925, 0.5664, 0.2498, 0.0267]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32
policy: tensor([[0.0647, 0.0887, 0.5691, 0.2504, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2959, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.43


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 75
policy: tensor([[0.0641, 0.0877, 0.5674, 0.2538, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
policy: tensor([[0.0639, 0.0868, 0.5625, 0.2595, 0.0273]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1596, 0.1862, 0.2960, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 76
policy: tensor([[0.0635, 0.0844, 0.5709, 0.2541, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
epoch: 77
policy: tensor([[0.0633, 0.0838, 0.5704, 0.2553, 0.0271]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0623, 0.0831, 0.5762, 0.2514, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2958, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
policy: tensor([[0.0607, 0.0828, 0.5871, 0.2431, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1858, 0.2966, 0.2121, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.43


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0601, 0.0811, 0.5883, 0.2442, 0.0262]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1859, 0.2966, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
policy: tensor([[0.0600, 0.0816, 0.5992, 0.2328, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0598, 0.0810, 0.6048, 0.2280, 0.0263]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2962, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.45
policy: tensor([[0.0603, 0.0800, 0.6101, 0.2232, 0.0263]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0614, 0.0792, 0.6172, 0.2157, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2962, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
policy: tensor([[0.0619, 0.0793, 0.6203, 0.2122, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2962, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0621, 0.0794, 0.6216, 0.2108, 0.0261]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.27
policy: tensor([[0.0628, 0.0796, 0.6215, 0.2100, 0.0261]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.49


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 78
policy: tensor([[0.0629, 0.0817, 0.6137, 0.2153, 0.0264]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1860, 0.2960, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
epoch: 79
policy: tensor([[0.0647, 0.0820, 0.6065, 0.2202, 0.0266]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2120, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0658, 0.0849, 0.6017, 0.2205, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2962, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.25
policy: tensor([[0.0658, 0.0857, 0.6063, 0.2152, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1865, 0.2955, 0.2120, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.49


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0660, 0.0869, 0.6075, 0.2125, 0.0270]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1865, 0.2955, 0.2120, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.36
epoch: 80
policy: tensor([[0.0641, 0.0838, 0.6164, 0.2098, 0.0258]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2964, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.34


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 81
policy: tensor([[0.0635, 0.0842, 0.6345, 0.1928, 0.0251]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38
policy: tensor([[0.0670, 0.0869, 0.6093, 0.2101, 0.0267]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 82
policy: tensor([[0.0658, 0.0840, 0.6283, 0.1964, 0.0254]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2960, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.24
epoch: 83
policy: tensor([[0.0701, 0.0867, 0.5896, 0.2262, 0.0273]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 84
policy: tensor([[0.0699, 0.0879, 0.5878, 0.2271, 0.0273]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2964, 0.2119, 0.1462]], grad_fn=<SoftmaxBackward0>)
Loss: 0.27
policy: tensor([[0.0717, 0.0903, 0.5765, 0.2337, 0.0277]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0715, 0.0916, 0.5742, 0.2345, 0.0281]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2959, 0.2121, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
policy: tensor([[0.0721, 0.0900, 0.5746, 0.2353, 0.0281]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1596, 0.1865, 0.2955, 0.2119, 0.1465]], grad_fn=<SoftmaxBackward0>)
Loss: 0.17


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0721, 0.0925, 0.5622, 0.2452, 0.0280]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1859, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.31
policy: tensor([[0.0722, 0.0889, 0.5675, 0.2440, 0.0275]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.25


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0737, 0.0886, 0.5646, 0.2454, 0.0278]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2962, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.33
epoch: 85
policy: tensor([[0.0734, 0.0886, 0.5576, 0.2527, 0.0278]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0729, 0.0902, 0.5561, 0.2529, 0.0279]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1860, 0.2963, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
policy: tensor([[0.0721, 0.0882, 0.5497, 0.2623, 0.0277]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 86
policy: tensor([[0.0727, 0.0863, 0.5587, 0.2547, 0.0275]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.28
policy: tensor([[0.0724, 0.0868, 0.5540, 0.2591, 0.0276]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 87
policy: tensor([[0.0704, 0.0835, 0.5677, 0.2517, 0.0266]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30
epoch: 88
policy: tensor([[0.0707, 0.0845, 0.5614, 0.2566, 0.0268]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2962, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.40


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 89
policy: tensor([[0.0726, 0.0847, 0.5592, 0.2563, 0.0272]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1860, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.41
epoch: 90
policy: tensor([[0.0709, 0.0824, 0.5623, 0.2579, 0.0265]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 91
policy: tensor([[0.0738, 0.0831, 0.5596, 0.2562, 0.0273]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2964, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
epoch: 92
policy: tensor([[0.0742, 0.0814, 0.5688, 0.2486, 0.0269]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1864, 0.2960, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.32


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 93
policy: tensor([[0.0737, 0.0833, 0.5587, 0.2571, 0.0272]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2963, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
policy: tensor([[0.0739, 0.0858, 0.5450, 0.2675, 0.0278]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1859, 0.2964, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.30


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 94
policy: tensor([[0.0738, 0.0834, 0.5513, 0.2638, 0.0277]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1862, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.42
policy: tensor([[0.0742, 0.0843, 0.5572, 0.2564, 0.0278]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1863, 0.2961, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 95
policy: tensor([[0.0730, 0.0832, 0.5663, 0.2501, 0.0275]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1862, 0.2962, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
policy: tensor([[0.0725, 0.0836, 0.5577, 0.2586, 0.0277]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2959, 0.2119, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 96
policy: tensor([[0.0736, 0.0829, 0.5579, 0.2578, 0.0279]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2962, 0.2118, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.35
policy: tensor([[0.0732, 0.0830, 0.5583, 0.2576, 0.0280]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1863, 0.2961, 0.2118, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.38


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 97
policy: tensor([[0.0742, 0.0858, 0.5428, 0.2684, 0.0288]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2961, 0.2121, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39
policy: tensor([[0.0735, 0.0875, 0.5365, 0.2734, 0.0291]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1861, 0.2959, 0.2121, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.48


  x = torch.tensor(obs)
  x = torch.tensor(obs)


policy: tensor([[0.0746, 0.0885, 0.5259, 0.2811, 0.0298]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2963, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.37
policy: tensor([[0.0715, 0.0861, 0.5499, 0.2640, 0.0285]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1595, 0.1862, 0.2960, 0.2120, 0.1464]], grad_fn=<SoftmaxBackward0>)
Loss: 0.49


  x = torch.tensor(obs)
  x = torch.tensor(obs)


epoch: 98
policy: tensor([[0.0721, 0.0857, 0.5550, 0.2586, 0.0287]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1593, 0.1861, 0.2964, 0.2119, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.21
epoch: 99
policy: tensor([[0.0704, 0.0875, 0.5537, 0.2598, 0.0286]], grad_fn=<SoftmaxBackward0>)
value: tensor([[0.1594, 0.1861, 0.2962, 0.2120, 0.1463]], grad_fn=<SoftmaxBackward0>)
Loss: 0.39


  x = torch.tensor(obs)
  x = torch.tensor(obs)


In [50]:

GENERATIONS = 1000
SEEDS_PER_GEN = 2
MODELS = 10
OBS_SIZE = env.unwrapped.config["observation"]["vehicles_count"] * len(env.unwrapped.config["observation"]["features"])

models = [Model(OBS_SIZE, ACTION_SIZE) for i in range(MODELS)]

for generation in range(GENERATIONS):
    env.unwrapped.config["duration"] = 8 + 0.5 * generation
    print(f'Generation: {generation}')
    # create models

    scores = [0 for i in range(MODELS)]

    for i in range(SEEDS_PER_GEN):
        seed = np.random.randint(1_000_000)
        for model_num in range(MODELS):
            print(f'{model_num} ', end='')
            render = False
            if model_num % 1 == 0:
                render = True
            model = models[model_num]

            env.reset(seed=seed)
            done = truncated = False
            obs = None
            score = 0
            while not (done or truncated):
                probabilities = model(obs).view(-1).detach().numpy()
                action = np.argmax(probabilities)
                # action = np.random.choice(ACTION_SIZE, p=probabilities)
                obs, reward, done, truncated, info = env.step(action)
                if info["rewards"]["on_road_reward"] == 0:
                    # reward -= 10
                    done = True
                if render:
                    env.render()
                # score += reward
                score += obs[0][2]
            print(score)
            # print(obs)

            # if done:
            #     score -= 10

            scores[model_num] += score
            # scores[model_num] += score
        print()
    best_score = max(scores)
    best_model_num = scores.index(best_score)
    best_model = models[best_model_num]
    print(f'Best model_num: {best_model_num} | score: {best_score}\n')
    models = [copy.deepcopy(best_model) for i in range(MODELS)]
    change = 0.01 * (0.999 ** generation)
    for model in models:
        for param in model.parameters():
            param.data += change * torch.randn_like(param)
    torch.save(best_model.state_dict(), f'model_gen{generation}')





Generation: 0
0 8.0
1 1.7637636065483093
2 6.509786248207092
3 1.7637636065483093
4 6.509786248207092
5 1.7637636065483093
6 4.863429844379425
7 6.509786248207092
8 8.0
9 1.7637636065483093

0 1.4644529819488525
1 1.4755589962005615
2 1.4750087559223175
3 1.4755589962005615
4 1.4750087559223175
5 1.4755589962005615
6 1.48556450009346
7 1.4750087559223175
8 1.4644529819488525
9 1.4755589962005615

Best model_num: 0 | score: 9.464452981948853

Generation: 1
0 1.6037171483039856
1 1.6037171483039856
2 1.6037171483039856
3 1.6037171483039856
4 1.6037171483039856
5 1.6037171483039856
6 1.6037171483039856
7 1.6037171483039856
8 1.6037171483039856
9 1.6037171483039856

0 4.617260575294495
1 4.617260575294495
2 4.617260575294495
3 4.617260575294495
4 4.617260575294495
5 4.617260575294495
6 4.617260575294495
7 4.617260575294495
8 4.617260575294495
9 4.617260575294495

Best model_num: 0 | score: 6.22097772359848

Generation: 2
0 9.0
1 9.0
2 9.0
3 9.0
4 9.0
5 9.0
6 9.0
7 9.0
8 9.0
9 9.0

0 9.0
1 

AttributeError: 'NoneType' object has no attribute 'get_image'