In [25]:
import copy
import glob
import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [26]:
class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, action_space, state_size):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.states = torch.zeros(num_steps + 1, num_processes, state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        
        action_shape = 1

        self.actions = torch.zeros(num_steps, num_processes, action_shape)
            
        self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

    def cuda(self):
        self.observations = self.observations.cuda()
        self.states = self.states.cuda()
        self.rewards = self.rewards.cuda()
        self.value_preds = self.value_preds.cuda()
        self.returns = self.returns.cuda()
        self.action_log_probs = self.action_log_probs.cuda()
        self.actions = self.actions.cuda()
        self.masks = self.masks.cuda()

    def insert(self, step, current_obs, state, action, action_log_prob, value_pred, reward, mask):
        self.observations[step + 1].copy_(current_obs)
        self.states[step + 1].copy_(state)
        self.actions[step].copy_(action)
        self.action_log_probs[step].copy_(action_log_prob)
        self.value_preds[step].copy_(value_pred)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)

    def after_update(self):
        self.observations[0].copy_(self.observations[-1])
        self.states[0].copy_(self.states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = self.returns[step + 1] * gamma * self.masks[step + 1] + self.rewards[step]
                
class Categorical(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Categorical, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):
        x = self.linear(x)
        return x

    def sample(self, x, deterministic):
        x = self(x)

        probs = F.softmax(x)
        if deterministic is False:
            action = probs.multinomial()
        else:
            action = probs.max(1)[1]
        return action

    def logprobs_and_entropy(self, x, actions):
        x = self(x)

        log_probs = F.log_softmax(x)
        probs = F.softmax(x)

        action_log_probs = log_probs.gather(1, actions)

        dist_entropy = -(log_probs * probs).sum(-1).mean()
        return action_log_probs, dist_entropy
    

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        orthogonal(m.weight.data)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CNNPolicy(nn.Module):
    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        self.critic_linear = nn.Linear(512, 1)

        num_outputs = action_space.n
        self.dist = Categorical(512, num_outputs)

        self.train() # training mode. Only affects dropout, batchnorm etc
        self.reset_parameters()
        
    def act(self, inputs, states, masks, deterministic=False):
        value, x, states = self(inputs, states, masks)
        action = self.dist.sample(x, deterministic=deterministic)
        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, action)
        return value, action, action_log_probs, states

    def evaluate_actions(self, inputs, states, masks, actions):
        value, x, states = self(inputs, states, masks)
        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, actions)
        return value, action_log_probs, dist_entropy, states

    @property
    def state_size(self):
        return 1

    def reset_parameters(self):
        self.apply(weights_init)

        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)

    def forward(self, inputs, states, masks):
        x = self.conv1(inputs / 255.0)
        x = F.relu(x)

        x = self.conv2(x)
        x = F.relu(x)

        x = self.conv3(x)
        x = F.relu(x)

        x = x.view(-1, 32 * 7 * 7)
        x = self.linear1(x)
        x = F.relu(x)

        return self.critic_linear(x), x, states
    

# Necessary for my KFAC implementation.
class AddBias(nn.Module):
    def __init__(self, bias):
        super(AddBias, self).__init__()
        self._bias = nn.Parameter(bias.unsqueeze(1))

    def forward(self, x):
        if x.dim() == 2:
            bias = self._bias.t().view(1, -1)
        else:
            bias = self._bias.t().view(1, -1, 1, 1)

        return x + bias

# A temporary solution from the master branch.
# https://github.com/pytorch/pytorch/blob/7752fe5d4e50052b3b0bbc9109e599f8157febc0/torch/nn/init.py#L312
# Remove after the next version of PyTorch gets release.
def orthogonal(tensor, gain=1):
    if tensor.ndimension() < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported")

    rows = tensor.size(0)
    cols = tensor[0].numel()
    flattened = torch.Tensor(rows, cols).normal_(0, 1)

    if rows < cols:
        flattened.t_()

    # Compute the qr factorization
    q, r = torch.qr(flattened)
    # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
    d = torch.diag(r, 0)
    ph = d.sign()
    q *= ph.expand_as(q)

    if rows < cols:
        q.t_()

    tensor.view_as(q).copy_(q)
    tensor.mul_(gain)
    return tensor


In [30]:
import os
import sys
import numpy as np
import torch
import random
import datetime
from util import flatten_state, flatten_state_no_board
from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility, characters
from pommerman.constants import Action
from collections import deque
from torch.nn.parameter import Parameter
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import time

use_cuda = torch.cuda.is_available()
print('CUDA IS ON: {}'.format(use_cuda))
if use_cuda:
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

def get_cuda(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

torch.cuda.is_available()

# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])


#n_inputs = env.observation_space.shape[0]
n_inputs = 372
n_hidden = 500
n_outputs = env.action_space.n

print('state shape:', n_inputs)
print('action shape:', n_outputs)

class TrainingAgent(BaseAgent):

    def __init__(self, character=characters.Bomber):
        super().__init__(character)


    def act(self, obs, action_space):
        return 0

'''
Agent class that does nothing
'''
class StopAgent(BaseAgent):

    def __init__(self, character=characters.Bomber, *args, **kwargs):
        super(StopAgent,self).__init__(character,*args, **kwargs)
    
    def act(self, obs, action_space):
        return 0


CUDA IS ON: True
state shape: 372
action shape: 6


In [31]:
class args:
    def __init__(self):
        self.env_name='PongNoFrameskip-v4'
        self.seed=1
        self.log_dir=''
        self.save_dir='saved_models'
        self.cuda=False
        self.num_stack=4
        self.num_steps=5
        self.num_processes=1
        self.lr=7e-4
        self.eps=1e-5
        self.alpha=.99
        self.max_grad_norm=.5
        self.value_loss_coef=.5
        self.entropy_coef=.1
        self.num_frames=1000
        self.use_gae=False
        self.gamma=.99
        self.tau=.95
        self.save_interval=1000
        self.log_interval=100
        self.vis_interval=100
        self.load_model=False
        self.save_model=False
        
args = args()

SAVE_PATH = "saved_models/a2c_121717.pt"
LOAD_PATH = "saved_models/a2c_121717.pt"

num_updates = 100

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [36]:
def main():
    #os.environ['OMP_NUM_THREADS'] = '1'

    #envs = make_env(args.env_name, args.seed, 0, args.log_dir)

    #if args.num_processes > 1:
    #    envs = SubprocVecEnv(envs)
    #else:
    #    envs = DummyVecEnv(envs)

    #obs_shape = 372
    #obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    print(env)
    print(env.action_space.n)
    actor_critic = CNNPolicy(372, env.action_space)
  
    #if args.load_model:
    #    actor_critic.load_state_dict(torch.load(LOAD_PATH))

    #action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size)
    
    #current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)
    
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                       Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                       Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                       Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()
 
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_model:
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                
            torch.save(actor_critic.state_dict(), SAVE_PATH)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
            
main()

<Pomme instance>
6
<generator object Module.parameters at 0x0000021B346CCCA8>


NameError: name 'sdadadad' is not defined

# TUTORIAL

In [117]:
import os
import sys
import numpy as np
import torch
import random
import datetime
from util import flatten_state, flatten_state_no_board
from pommerman.agents import SimpleAgent, RandomAgent, PlayerAgent, BaseAgent
from pommerman.configs import ffa_v0_fast_env
from pommerman.envs.v0 import Pomme
from pommerman.characters import Bomber
from pommerman import utility, characters
from pommerman.constants import Action
from collections import deque
from torch.nn.parameter import Parameter
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn
import time

use_cuda = False#torch.cuda.is_available()
print('CUDA IS ON: {}'.format(use_cuda))
#if use_cuda:
#    torch.set_default_tensor_type('torch.cuda.FloatTensor')

def get_cuda(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

torch.cuda.is_available()

# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])

#n_inputs = env.observation_space.shape[0]
n_inputs = 372
n_hidden = 500
n_outputs = env.action_space.n

print('state shape:', n_inputs)
print('action shape:', n_outputs)

class TrainingAgent(BaseAgent):

    def __init__(self, character=characters.Bomber):
        super().__init__(character)


    def act(self, obs, action_space):
        return 0

'''
Agent class that does nothing
'''
class StopAgent(BaseAgent):

    def __init__(self, character=characters.Bomber, *args, **kwargs):
        super(StopAgent,self).__init__(character,*args, **kwargs)
    
    def act(self, obs, action_space):
        return 0
    
# Add four random agents
agents = []
#for agent_id in range(4):
#    agents[agent_id] = RandomAgent(config["agent"](agent_id, config["game_type"]))
agents = {
    '0' : SimpleAgent(config["agent"](0, config["game_type"])),
    '1' : SimpleAgent(config["agent"](1, config["game_type"])),
    '2' : SimpleAgent(config["agent"](2, config["game_type"])),
    '3' : TrainingAgent(config["agent"](3, config["game_type"]))
}
env.set_agents(list(agents.values()))
env.set_training_agent(3)
env.set_init_game_state(None)

CUDA IS ON: False
state shape: 372
action shape: 6


In [118]:
# Flattens a state s on the form list<dict> where each dict contains information of a state
def flatten_state(s):
    # Usage Example:
	# def forward(self, x):
	#     x = flatten_state(x)
	# where x is np.atleast1d(S[0])
	return torch.from_numpy(np.array([flatten_state_aux(x) for x in s])).float()


def flatten_state_aux(s):
    # Lists
    #print ("---------------------------")
    #print (s)
    #print ("---------------------------")
    alive = [1 if x in s['alive'] else 0 for x in range(10,14)]
    board = s['board']
    bomb_blast_strength = s['bomb_blast_strength']
    bomb_life = s['bomb_life']
    # Tuples
    position = s['position']
    # Ints
    blast_strength = s['blast_strength']
    can_kick = s['can_kick']
    ammo = s['ammo']
    # Enums
    teammate = s['teammate'] #9 for FFA
    enemies = s['enemies'] #11,12,13 for FFA and training agent id = 0

    a = np.append(np.array(alive),np.array(board).flatten())
    a = np.append(a,np.array(bomb_blast_strength).flatten())
    a = np.append(a,np.array(bomb_life).flatten())
    a = np.append(a,position[0])
    a = np.append(a,position[1])
    a = np.append(a,blast_strength)
    a = np.append(a,can_kick)
    a = np.append(a,ammo)
    # Commented out as we get size 376 but expected 372. I assume we calculated wrong.
    # Makes sense to ignore these imo
    #a = np.append(a,teammate.value)
    #a = np.append(a,[e.value for e in enemies])
    return a.astype(float)

In [131]:
import gym
import os
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable


def calc_actual_state_values(rewards, dones):
    R = []
    rewards.reverse()

    #print(rewards)
    #print(dones)
    # If we happen to end the set on a terminal state, set next return to zero
    if dones[-1] == True: next_return = 0
        
    # If not terminal state, bootstrap v(s) using our critic
    # TODO: don't need to estimate again, just take from last value of v(s) estimates
    else: 
        print("no")
        s = torch.from_numpy(states[-1]).float().unsqueeze(0)
        next_return = model.get_state_value(Variable(s)).data[0][0] 
    
    # Backup from last state to calculate "true" returns for each state in the set
    R.append(next_return)
    dones.reverse()
    for r in range(1, len(rewards)):
        if not dones[r]: this_return = rewards[r] + next_return * GAMMA
        else: this_return = 0
        R.append(this_return)
        next_return = this_return

    R.reverse()
    state_values_true = Variable(torch.FloatTensor(R)).unsqueeze(1)
    
    return state_values_true

def reflect(states, actions, rewards, dones):
    
    # Calculating the ground truth "labels" as described above
    state_values_true = calc_actual_state_values(rewards,dones)
    #print(states)
    states = np.asarray(states)
    action_probs, state_values_est = model.evaluate_actions(states)
    action_log_probs = action_probs.log() 
    
    a = Variable(torch.LongTensor(actions).view(-1,1))
    chosen_action_log_probs = action_log_probs.gather(1, a)

    # This is also the TD error
    advantages = state_values_true - state_values_est

    entropy = (action_probs * action_log_probs).sum(1).mean()
    action_gain = (chosen_action_log_probs * advantages).mean()
    value_loss = advantages.pow(2).mean()
    total_loss = value_loss - action_gain - 0.0001*entropy
    print("Total Loss:",total_loss)
    print("Reward:",rewards[0])

    optimizer.zero_grad()
    total_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

In [140]:
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.linear1 = nn.Linear(N_INPUTS, 64)
        self.linear2 = nn.Linear(64, 128)
        self.linear3 = nn.Linear(128, 64)
        
        self.actor = nn.Linear(64, N_ACTIONS)
        self.critic = nn.Linear(64, 1)
    
    # In a PyTorch model, you only have to define the forward pass. PyTorch computes the backwards pass for you!
    def forward(self, x):
        x = flatten_state(x)
        #print(x)
        x = self.linear1(x)
        x = F.relu(x)
        x = self.linear2(x)
        x = F.relu(x)
        x = self.linear3(x)
        x = F.relu(x) 
        return x
    
    # Only the Actor head
    def get_action_probs(self, x):
        x = self(x)
        action_probs = F.softmax(self.actor(x),dim=1)
        return action_probs
    
    # Only the Critic head
    def get_state_value(self, x):
        x = self(x)
        state_value = self.critic(x)
        return state_value
    
    # Both heads
    def evaluate_actions(self, x):
        #print("y")
        x = self(x)
        #print("no")
        #print(x)
        action_probs = F.softmax(self.actor(x),dim=1)
        state_values = self.critic(x)
        return action_probs, state_values

In [146]:
# Discount factor. Model is not very sensitive to this value.
GAMMA = .95

# LR of 3e-2 explodes the gradients, LR of 3e-4 trains slower
LR = 3e-3
N_GAMES = 1000

# OpenAI baselines uses nstep of 5.
#N_STEPS = 20

N_ACTIONS = 6 # get from env
N_INPUTS = 372 # get from env

model = ActorCritic()

if use_cuda:
    print("wat")
    model.cuda()
    
optimizer = optim.Adam(model.parameters(), lr=LR)

state = env.reset()
finished_games = 0

print(env)
action_probs = []
for i in range(N_GAMES):
    states, actions, rewards, dones = [], [], [], []
    state = env.reset()
    done = False
    if(i%100 == 0):
        print("Game:",i)
    # Gather training data
    try:
        while not done:
            
            s = np.atleast_1d(state[3])
            #print(s)

            action_probs = model.get_action_probs(s)
            a = action_probs.multinomial(1)[0].item()

            acts = env.act(state)
            acts.append(a)

            next_state, reward, done, _ = env.step(acts)

            states.append(state[3]); actions.append(a); rewards.append(reward[3]); dones.append(done)

            state = next_state
    except RuntimeError:
        print("Error, action probs:",action_probs)

    # Reflect on training data
    reflect(states, actions, rewards, dones)

print("Done")

<Pomme instance>
Game: 0
Total Loss: tensor(0.0179, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.2065, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5127, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6577, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.7552, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6603, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.7193, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5798, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.8145, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.8524, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6953, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6884, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.8831, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6980, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.7590, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tens

Total Loss: tensor(-0.4892, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.4313, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6242, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5971, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5594, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5618, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5469, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5848, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6030, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.3790, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5728, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6771, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5757, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.4147, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5001, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.4851, grad_fn=<ThS

Total Loss: tensor(-0.5215, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5408, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.5260, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6530, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.8279, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6059, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6312, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-1.2781, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-1.4548, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.2541, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.2380, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-1.6250, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.6779, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.2156, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.3138, grad_fn=<ThSubBackward>)
Reward: -1
Total Loss: tensor(-0.2536, grad_fn=<ThS

IndexError: list index out of range