# MC REINFORCE

In [1]:
'''
A bunch of imports, you don't have to worry about these
'''

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Environment walkthrough: build the Gym environment used by the rest of this
# notebook and run one reset/step cycle so the state, action, reward and `done`
# values are visible before any learning code runs.
#
# Please refer to the first tutorial for more details on the specifics of
# environments. Only the commands that are useful for the experiments are
# shown here.
#
# Example environment ids
# (Source - https://gym.openai.com/envs/#classic_control):
#   'Acrobot-v1'
#   'CartPole-v1'
#   'MountainCar-v0'

env = gym.make('CartPole-v1')
env.seed(0)
# The action space draws from its own sampler, so seed it too; otherwise the
# "Sampled Action" lines printed below differ between runs.
# NOTE(review): relies on gym's Space.seed() — confirm for the installed gym version.
env.action_space.seed(0)

state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

print("State shape:", state_shape)
print("Number of Actions: ",no_of_actions)
print("Sampled Action",env.action_space.sample())
print("----")

# Understanding State, Action, Reward Dynamics
#
# The agent decides which action to take depending on the state.
#
# The environment keeps a variable specifically for the current state.
# - Every time an action is passed to the environment, it calculates the new
#   state and updates the current-state variable.
# - It returns the new current state and the reward so the agent can choose
#   its next action.

# reset() returns the initial state of a fresh episode.
state = env.reset()

print("Current_State: ",state)
print("----")

# Take a random action.
action = env.action_space.sample()

print("Sampled Action2: ", action)
print("----")

# step() computes the new state and the reward from the old state and the
# action that was taken.
next_state, reward, done, info = env.step(action)

print("Next_State: ",next_state)
print("Reward: ",reward)
print("Done: ", done)
print("Info: ", info)
print("----")


State shape: 4
Number of Actions:  2
Sampled Action 0
----
Current_State:  [-0.04456399  0.04653909  0.01326909 -0.02099827]
----
Sampled Action2:  1
----
Next_State:  [-0.04363321  0.24146826  0.01284913 -0.30946528]
Reward:  1.0
Done:  False
Info:  {}
----


# MC REINFORCE W/O BASELINE

In [3]:

import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import torch
import numpy as np
from collections import deque, namedtuple

from scipy.special import softmax
from torch.distributions import Categorical

# Use the first CUDA GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

'''
Bunch of Hyper parameters (Which you might have to tune later)
'''
'''BUFFER_SIZE = int(1e5)  # replay buffer size
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
LR = 5e-4               # learning rate
UPDATE_EVERY = 20       # how often to update the network (When Q target is present)'''

class Policy(nn.Module):
    """Feed-forward stochastic policy that outputs a probability per discrete action.

    The network is ``Linear -> LeakyReLU`` (input layer), followed by
    ``num_policy_layers - 1`` further ``Linear -> LeakyReLU`` blocks of the same
    width, followed by a final ``Linear`` whose outputs are turned into action
    probabilities with a softmax.
    """

    def __init__(self, state_size, action_size, seed=0, num_policy_layers=1, num_policy_layer_units=64):
        """Initialize parameters and build model.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Number of discrete actions (size of the output layer)
            seed (int): Random seed passed to ``torch.manual_seed``
            num_policy_layers (int): Number of hidden ``Linear -> LeakyReLU`` blocks,
                counting the input block; values below 1 behave like 1
            num_policy_layer_units (int): Width of every hidden layer
        """
        super(Policy, self).__init__()
        # torch.manual_seed seeds PyTorch's global generator, so the layer
        # initialisation below (and any later torch sampling) follows this seed.
        self.seed = torch.manual_seed(seed)
        activation = nn.LeakyReLU
        self.fc_start = nn.Sequential(nn.Linear(state_size, num_policy_layer_units), activation())
        # Each extra hidden block is its own Sequential so parameter names stay
        # "fc_hidden.<block>.<layer>".
        self.fc_hidden = nn.Sequential(*[
            nn.Sequential(nn.Linear(num_policy_layer_units, num_policy_layer_units), activation())
            for _ in range(num_policy_layers - 1)
        ])
        self.fc_end = nn.Linear(num_policy_layer_units, action_size)

    def forward(self, state):
        """Map a state tensor to action probabilities.

        ``state`` has the state features on its last axis; any leading axes
        (none, or a batch axis) are preserved. The returned tensor has the same
        leading axes and ``action_size`` probabilities on the last axis.
        """
        xs = self.fc_start(state)
        x_hidden = self.fc_hidden(xs)
        x_out = self.fc_end(x_hidden)
        # Normalise over the action axis (always the last one). For the usual
        # (batch, features) input this is the same axis as dim=1, and it also
        # works for a single unbatched state.
        return F.softmax(x_out, dim=-1)

    def act(self, state):
        """Sample one action for a single state.

        ``state`` is one observation (NumPy array or anything
        ``torch.as_tensor`` accepts). Returns ``(action, log_prob)`` where
        ``action`` is a Python int and ``log_prob`` is a tensor of shape ``(1,)``
        that stays attached to the graph so it can be used in a loss.
        """
        # Run on whichever device the network's parameters are on, rather than
        # relying on a module-level device variable.
        param_device = next(self.parameters()).device
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(param_device)
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)


In [4]:
def reinforce(policy, optimizer, n_episodes=1000, max_t=1000, gamma=0.99, print_every=100,
              environment=None, reward_to_go=False):
    """Train ``policy`` with Monte-Carlo REINFORCE (no baseline).

    One gradient step is taken per episode, after the whole trajectory has been
    collected.

    Params
    ======
        policy: object with ``act(state) -> (action, log_prob)``; ``log_prob`` must
            be a 1-element tensor attached to the parameters being optimised
        optimizer: torch optimizer over the policy's parameters
        n_episodes (int): number of episodes to run
        max_t (int): maximum number of steps per episode
        gamma (float): discount factor
        print_every (int): print the running average score every this many episodes
        environment: object with ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``; when ``None`` the
            notebook-level ``env`` is used
        reward_to_go (bool): when ``False`` (default) every log-probability is
            weighted by the discounted return of the whole episode; when ``True``
            step ``t`` is weighted by the discounted return from ``t`` onward

    Returns
    =======
        list with the undiscounted total reward of every episode, in order
    """
    if environment is None:
        # Backward-compatible default: fall back to the notebook's global environment.
        environment = env

    scores_deque = deque(maxlen=100)  # window for the printed running average
    scores = []
    for e in range(n_episodes):
        saved_log_probs = []
        rewards = []
        state = environment.reset()
        # Collect one trajectory with the current policy.
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = environment.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Weight applied to each step's log-probability.
        if reward_to_go:
            weights = []
            running_return = 0.0
            # Accumulate backwards: G_t = r_t + gamma * G_{t+1}.
            for step_reward in reversed(rewards):
                running_return = step_reward + gamma * running_return
                weights.append(running_return)
            weights.reverse()
        else:
            R = sum(gamma ** i * step_reward for i, step_reward in enumerate(rewards))
            weights = [R] * len(rewards)

        # An empty trajectory (max_t == 0) has nothing to learn from, and
        # torch.cat would raise on an empty list, so skip the update.
        if saved_log_probs:
            # The optimizer minimises, so negate to perform gradient ascent on the return.
            policy_loss = torch.cat(
                [-log_prob * weight for log_prob, weight in zip(saved_log_probs, weights)]
            ).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
    return scores

# Final Draft

In [5]:
#!pip install wandb

In [6]:
import wandb

# Authenticate with Weights & Biases without embedding a credential in the
# notebook. With no explicit key, wandb.login() uses the WANDB_API_KEY
# environment variable or a previously stored login, and otherwise prompts.
# NOTE(review): an API key was previously hard-coded in this cell; treat it as
# exposed and revoke/rotate it in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mnayinisriharsh-iitm[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\nayin\.netrc


True

In [7]:
def train_and_tune(config=None):
    """Run one sweep trial: train REINFORCE with the trial's hyperparameters and log its regret.

    Reads ``policy_lr``, ``num_policy_layers`` and ``num_policy_layer_units``
    from ``wandb.config``, trains a fresh ``Policy`` on the notebook-level
    ``env`` for 1000 episodes, and logs ``avg_regret``: the sum over episodes of
    ``500 - episode_score``, averaged over the experiments (currently one).
    NOTE(review): 500 is presumably the CartPole-v1 episode cap — confirm if the
    environment changes.
    """
    with wandb.init(config=config):
        # When launched through wandb.agent, the sweep controller supplies this config.
        config = wandb.config
        wandb.run.name = ''.join([
            'plr', str(config.policy_lr),
            '-nlr', str(config.num_policy_layers),
            '-uts', str(config.num_policy_layer_units),
        ])

        num_exp = 1
        max_episodes = 1000
        # One row of per-episode scores for each experiment.
        total_rewards = np.zeros([num_exp, max_episodes])

        for i in range(num_exp):
            state_shape = env.observation_space.shape[0]
            action_shape = env.action_space.n
            # The experiment index doubles as the policy's random seed.
            policy = Policy(
                state_size=state_shape,
                action_size=action_shape,
                seed=i,
                num_policy_layers=config.num_policy_layers,
                num_policy_layer_units=config.num_policy_layer_units,
            ).to(device)
            optimizer = optim.Adam(policy.parameters(), lr=config.policy_lr)
            total_rewards[i] = reinforce(policy, optimizer, n_episodes=max_episodes)

        regret_per_experiment = np.sum(500 - total_rewards, axis=1)
        Regret = np.mean(regret_per_experiment, axis=0)
        wandb.log({"avg_regret": Regret})

In [8]:
import pprint

# Objective the sweep optimises: the "avg_regret" value logged by train_and_tune.
_sweep_metric = {'name': 'avg_regret', 'goal': 'minimize'}

# Discrete search space for the policy network and its optimiser.
_sweep_parameters = {
    'policy_lr': {'values': [1e-2, 1e-3, 1e-4]},
    'num_policy_layers': {'values': [1, 2, 3]},
    'num_policy_layer_units': {'values': [64, 128, 256, 512]},
}

sweep_config = {
    'method': 'bayes',
    'metric': _sweep_metric,
    'parameters': _sweep_parameters,
}

pprint.pprint(sweep_config)
# Register the sweep with W&B; the returned id is what wandb.agent needs.
sweep_id = wandb.sweep(sweep_config, project="CS6700_PROGRAMMING_ASSIGNMENT_2")

{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'avg_regret'},
 'parameters': {'num_policy_layer_units': {'values': [64, 128, 256]},
                'num_policy_layers': {'values': [1, 2, 3]},
                'policy_lr': {'values': [0.01, 0.001, 0.0001]}}}
Create sweep with ID: obqrbz6p
Sweep URL: https://wandb.ai/nayinisriharsh-iitm/CS6700_PROGRAMMING_ASSIGNMENT_2/sweeps/obqrbz6p


{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'avg_regret'},
 'parameters': {'act_algorithm': {'values': ['softmax', 'epsilon']},
                'batch_size': {'values': [32, 64, 128]},
                'buffer_size': {'values': [100, 1000, 10000, 100000]},
                'epsilon_tau_decay': {'values': [0.995, 0.95, 0.9, 0.85]},
                'epsilon_tau_start': {'values': [1, 0.5]},
                'lr': {'values': [0.01, 0.001, 0.0001]},
                'num_common_layer_units': {'values': [128, 256, 512]},
                'num_common_layers': {'values': [1, 2, 3]},
                'type': {'values': [1]},
                'update_every': {'values': [10, 20, 30]}}}
Create sweep with ID: 29klzxlf
Sweep URL: https://wandb.ai/nayinisriharsh-iitm/CS6700_PROGRAMMING_ASSIGNMENT_2/sweeps/obqrbz6p

In [9]:
# Start a sweep agent in this kernel: it calls train_and_tune once per
# configuration handed out by the sweep, for at most 40 runs.
wandb.agent(sweep_id, function=train_and_tune, count=40)

[34m[1mwandb[0m: Agent Starting Run: dgo8mw7t with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 19.00
Episode 100	Average Score: 35.14
Episode 200	Average Score: 33.53
Episode 300	Average Score: 42.81
Episode 400	Average Score: 43.30
Episode 500	Average Score: 35.73
Episode 600	Average Score: 59.78
Episode 700	Average Score: 22.50
Episode 800	Average Score: 43.24
Episode 900	Average Score: 98.51


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,449590.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0siuec18 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 15.00
Episode 100	Average Score: 20.96
Episode 200	Average Score: 14.89
Episode 300	Average Score: 10.26
Episode 400	Average Score: 9.62
Episode 500	Average Score: 13.74
Episode 600	Average Score: 9.24
Episode 700	Average Score: 9.33
Episode 800	Average Score: 9.27
Episode 900	Average Score: 9.30


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04207345420734542, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,488399.0


[34m[1mwandb[0m: Agent Starting Run: dw4gkwdv with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 21.65
Episode 200	Average Score: 22.11
Episode 300	Average Score: 22.34
Episode 400	Average Score: 24.08
Episode 500	Average Score: 22.54
Episode 600	Average Score: 24.41
Episode 700	Average Score: 26.90
Episode 800	Average Score: 30.27
Episode 900	Average Score: 28.14


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,474631.0


[34m[1mwandb[0m: Agent Starting Run: xs5d6rq5 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 22.00
Episode 100	Average Score: 9.50
Episode 200	Average Score: 9.44
Episode 300	Average Score: 9.29
Episode 400	Average Score: 9.47
Episode 500	Average Score: 9.41
Episode 600	Average Score: 9.31
Episode 700	Average Score: 9.42
Episode 800	Average Score: 9.34
Episode 900	Average Score: 9.40


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,490598.0


[34m[1mwandb[0m: Agent Starting Run: 2n5dz3ug with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 15.00
Episode 100	Average Score: 32.29
Episode 200	Average Score: 9.32
Episode 300	Average Score: 9.46
Episode 400	Average Score: 9.43
Episode 500	Average Score: 9.46
Episode 600	Average Score: 9.33
Episode 700	Average Score: 9.42
Episode 800	Average Score: 9.26
Episode 900	Average Score: 9.27


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041932603591388604, max=1…

0,1
avg_regret,▁

0,1
avg_regret,488335.0


[34m[1mwandb[0m: Agent Starting Run: f9mnfybg with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 15.00
Episode 100	Average Score: 10.31
Episode 200	Average Score: 9.35
Episode 300	Average Score: 9.29
Episode 400	Average Score: 9.30
Episode 500	Average Score: 9.37
Episode 600	Average Score: 9.46
Episode 700	Average Score: 9.37
Episode 800	Average Score: 9.26
Episode 900	Average Score: 9.19


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,490563.0


[34m[1mwandb[0m: Agent Starting Run: 84nxsuqp with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011277777777932999, max=1.0…

Episode 0	Average Score: 47.00
Episode 100	Average Score: 28.15
Episode 200	Average Score: 30.04
Episode 300	Average Score: 48.08
Episode 400	Average Score: 55.90
Episode 500	Average Score: 69.99
Episode 600	Average Score: 60.52
Episode 700	Average Score: 79.86
Episode 800	Average Score: 83.60
Episode 900	Average Score: 82.50


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,430314.0


[34m[1mwandb[0m: Agent Starting Run: 9kuvd84b with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 23.47
Episode 200	Average Score: 22.10
Episode 300	Average Score: 24.90
Episode 400	Average Score: 25.62
Episode 500	Average Score: 29.81
Episode 600	Average Score: 36.74
Episode 700	Average Score: 38.07
Episode 800	Average Score: 39.45
Episode 900	Average Score: 39.35


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041888451747280724, max=1…

0,1
avg_regret,▁

0,1
avg_regret,467848.0


[34m[1mwandb[0m: Agent Starting Run: cz1x2yvh with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 25.00
Episode 100	Average Score: 18.92
Episode 200	Average Score: 51.24
Episode 300	Average Score: 22.76
Episode 400	Average Score: 41.86
Episode 500	Average Score: 16.85
Episode 600	Average Score: 15.40
Episode 700	Average Score: 35.95
Episode 800	Average Score: 28.80
Episode 900	Average Score: 36.62


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,465908.0


[34m[1mwandb[0m: Agent Starting Run: cgxa8irn with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 31.23
Episode 200	Average Score: 44.25
Episode 300	Average Score: 61.00
Episode 400	Average Score: 52.75
Episode 500	Average Score: 63.18
Episode 600	Average Score: 47.90
Episode 700	Average Score: 57.18
Episode 800	Average Score: 62.37
Episode 900	Average Score: 61.84


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04185815837328484, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,444457.0


[34m[1mwandb[0m: Agent Starting Run: aaebzyyq with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 30.00
Episode 100	Average Score: 25.95
Episode 200	Average Score: 31.48
Episode 300	Average Score: 42.95
Episode 400	Average Score: 55.11
Episode 500	Average Score: 55.76
Episode 600	Average Score: 60.08
Episode 700	Average Score: 65.52
Episode 800	Average Score: 72.42
Episode 900	Average Score: 80.75


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,441597.0


[34m[1mwandb[0m: Agent Starting Run: ngbd92en with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 32.63
Episode 200	Average Score: 45.38
Episode 300	Average Score: 58.20
Episode 400	Average Score: 55.91
Episode 500	Average Score: 68.82
Episode 600	Average Score: 80.10
Episode 700	Average Score: 151.57
Episode 800	Average Score: 114.82
Episode 900	Average Score: 128.85


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,413436.0


[34m[1mwandb[0m: Agent Starting Run: 2wq8qy0k with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 24.43
Episode 200	Average Score: 40.23
Episode 300	Average Score: 71.13
Episode 400	Average Score: 65.67
Episode 500	Average Score: 62.12
Episode 600	Average Score: 49.24
Episode 700	Average Score: 66.32
Episode 800	Average Score: 62.33
Episode 900	Average Score: 66.77


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04209827357237716, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,441354.0


[34m[1mwandb[0m: Agent Starting Run: daljflmh with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 13.00
Episode 100	Average Score: 23.50
Episode 200	Average Score: 48.36
Episode 300	Average Score: 56.75
Episode 400	Average Score: 63.59
Episode 500	Average Score: 77.71
Episode 600	Average Score: 82.98
Episode 700	Average Score: 71.53
Episode 800	Average Score: 74.84
Episode 900	Average Score: 81.26


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,431475.0


[34m[1mwandb[0m: Agent Starting Run: zhqzax04 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 30.54
Episode 200	Average Score: 35.14
Episode 300	Average Score: 42.11
Episode 400	Average Score: 38.57
Episode 500	Average Score: 53.04
Episode 600	Average Score: 59.43
Episode 700	Average Score: 40.44
Episode 800	Average Score: 67.76
Episode 900	Average Score: 76.50


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04206646967030778, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,451521.0


[34m[1mwandb[0m: Agent Starting Run: avim8a97 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 27.79
Episode 200	Average Score: 42.39
Episode 300	Average Score: 54.09
Episode 400	Average Score: 41.13
Episode 500	Average Score: 79.16
Episode 600	Average Score: 75.72
Episode 700	Average Score: 100.18
Episode 800	Average Score: 94.19
Episode 900	Average Score: 109.58


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,420654.0


[34m[1mwandb[0m: Agent Starting Run: hifks7sa with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 28.65
Episode 200	Average Score: 48.39
Episode 300	Average Score: 55.81
Episode 400	Average Score: 48.34
Episode 500	Average Score: 62.24
Episode 600	Average Score: 44.52
Episode 700	Average Score: 55.75
Episode 800	Average Score: 67.48
Episode 900	Average Score: 68.81


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04209827357237716, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,441554.0


[34m[1mwandb[0m: Agent Starting Run: caj0lqj0 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 23.95
Episode 200	Average Score: 36.38
Episode 300	Average Score: 51.78
Episode 400	Average Score: 45.57
Episode 500	Average Score: 70.33
Episode 600	Average Score: 64.45
Episode 700	Average Score: 81.26
Episode 800	Average Score: 78.28
Episode 900	Average Score: 55.71


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041888451747280724, max=1…

0,1
avg_regret,▁

0,1
avg_regret,439752.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3lbonem9 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 10.00
Episode 100	Average Score: 28.27
Episode 200	Average Score: 40.91
Episode 300	Average Score: 59.04
Episode 400	Average Score: 67.73
Episode 500	Average Score: 69.15
Episode 600	Average Score: 54.50
Episode 700	Average Score: 91.41
Episode 800	Average Score: 91.04
Episode 900	Average Score: 135.61


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041884297520661154, max=1…

0,1
avg_regret,▁

0,1
avg_regret,425303.0


[34m[1mwandb[0m: Agent Starting Run: odzptctz with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 26.89
Episode 200	Average Score: 40.34
Episode 300	Average Score: 59.23
Episode 400	Average Score: 30.09
Episode 500	Average Score: 24.86
Episode 600	Average Score: 48.78
Episode 700	Average Score: 44.91
Episode 800	Average Score: 61.59
Episode 900	Average Score: 49.13


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04209827357237716, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,456281.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0zi4t41s with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 13.00
Episode 100	Average Score: 22.86
Episode 200	Average Score: 31.99
Episode 300	Average Score: 47.84
Episode 400	Average Score: 58.19
Episode 500	Average Score: 59.54
Episode 600	Average Score: 54.10
Episode 700	Average Score: 78.65
Episode 800	Average Score: 67.96
Episode 900	Average Score: 85.81


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,439377.0


[34m[1mwandb[0m: Agent Starting Run: qenioj1j with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 13.00
Episode 100	Average Score: 31.22
Episode 200	Average Score: 41.00
Episode 300	Average Score: 50.03
Episode 400	Average Score: 62.32
Episode 500	Average Score: 66.79
Episode 600	Average Score: 56.14
Episode 700	Average Score: 73.84
Episode 800	Average Score: 72.82
Episode 900	Average Score: 80.82


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,430631.0


[34m[1mwandb[0m: Agent Starting Run: g8yzabwq with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 12.00
Episode 100	Average Score: 31.62
Episode 200	Average Score: 62.21
Episode 300	Average Score: 46.43
Episode 400	Average Score: 61.69
Episode 500	Average Score: 45.97
Episode 600	Average Score: 53.35
Episode 700	Average Score: 10.39
Episode 800	Average Score: 9.37
Episode 900	Average Score: 9.36


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,466029.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: d7t559b7 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 30.00
Episode 100	Average Score: 19.60
Episode 200	Average Score: 22.10
Episode 300	Average Score: 21.00
Episode 400	Average Score: 21.71
Episode 500	Average Score: 20.91
Episode 600	Average Score: 25.66
Episode 700	Average Score: 26.05
Episode 800	Average Score: 27.37
Episode 900	Average Score: 26.41


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,476264.0


[34m[1mwandb[0m: Agent Starting Run: guywscln with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 33.46
Episode 200	Average Score: 39.33
Episode 300	Average Score: 46.89
Episode 400	Average Score: 47.69
Episode 500	Average Score: 51.15
Episode 600	Average Score: 44.44
Episode 700	Average Score: 46.51
Episode 800	Average Score: 64.58
Episode 900	Average Score: 73.99


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04209827357237716, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,447061.0


[34m[1mwandb[0m: Agent Starting Run: d1pocqsu with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 25.92
Episode 200	Average Score: 44.66
Episode 300	Average Score: 58.35
Episode 400	Average Score: 49.58
Episode 500	Average Score: 83.43
Episode 600	Average Score: 94.33
Episode 700	Average Score: 114.97
Episode 800	Average Score: 99.91
Episode 900	Average Score: 147.54


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,416890.0


[34m[1mwandb[0m: Agent Starting Run: 4a6nzggo with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 11.00
Episode 100	Average Score: 29.07
Episode 200	Average Score: 46.64
Episode 300	Average Score: 46.21
Episode 400	Average Score: 53.29
Episode 500	Average Score: 62.19
Episode 600	Average Score: 54.95
Episode 700	Average Score: 67.25
Episode 800	Average Score: 69.36
Episode 900	Average Score: 58.84


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04206646967030778, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,442985.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: asjobnf7 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 25.73
Episode 200	Average Score: 40.00
Episode 300	Average Score: 53.27
Episode 400	Average Score: 48.10
Episode 500	Average Score: 77.39
Episode 600	Average Score: 42.34
Episode 700	Average Score: 76.15
Episode 800	Average Score: 69.73
Episode 900	Average Score: 104.04


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,432273.0


[34m[1mwandb[0m: Agent Starting Run: ge0e103j with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 27.00
Episode 100	Average Score: 25.76
Episode 200	Average Score: 33.01
Episode 300	Average Score: 43.38
Episode 400	Average Score: 49.99
Episode 500	Average Score: 61.99
Episode 600	Average Score: 60.07
Episode 700	Average Score: 55.71
Episode 800	Average Score: 70.49
Episode 900	Average Score: 70.37


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041888451747280724, max=1…

0,1
avg_regret,▁

0,1
avg_regret,443291.0


[34m[1mwandb[0m: Agent Starting Run: honvnsgw with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 46.00
Episode 100	Average Score: 27.55
Episode 200	Average Score: 37.12
Episode 300	Average Score: 56.14
Episode 400	Average Score: 52.01
Episode 500	Average Score: 63.81
Episode 600	Average Score: 85.42
Episode 700	Average Score: 135.72
Episode 800	Average Score: 122.26
Episode 900	Average Score: 110.92


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,419983.0


[34m[1mwandb[0m: Agent Starting Run: yl2dspwe with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 13.00
Episode 100	Average Score: 26.49
Episode 200	Average Score: 29.88
Episode 300	Average Score: 42.18
Episode 400	Average Score: 41.54
Episode 500	Average Score: 43.61
Episode 600	Average Score: 54.66
Episode 700	Average Score: 46.90
Episode 800	Average Score: 58.09
Episode 900	Average Score: 61.28


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,453739.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: s91nzvk4 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 30.22
Episode 200	Average Score: 53.10
Episode 300	Average Score: 53.79
Episode 400	Average Score: 49.57
Episode 500	Average Score: 55.69
Episode 600	Average Score: 40.03
Episode 700	Average Score: 56.73
Episode 800	Average Score: 61.50
Episode 900	Average Score: 63.72


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,449031.0


[34m[1mwandb[0m: Agent Starting Run: 27lwv7q4 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 27.00
Episode 100	Average Score: 26.69
Episode 200	Average Score: 32.59
Episode 300	Average Score: 46.96
Episode 400	Average Score: 57.06
Episode 500	Average Score: 77.93
Episode 600	Average Score: 92.79
Episode 700	Average Score: 120.42
Episode 800	Average Score: 93.30
Episode 900	Average Score: 119.94


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,418320.0


[34m[1mwandb[0m: Agent Starting Run: pqw15o1y with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 86.00
Episode 100	Average Score: 24.68
Episode 200	Average Score: 26.89
Episode 300	Average Score: 45.87
Episode 400	Average Score: 56.85
Episode 500	Average Score: 74.18
Episode 600	Average Score: 87.65
Episode 700	Average Score: 135.35
Episode 800	Average Score: 147.85
Episode 900	Average Score: 134.62


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.041914584159724974, max=1…

0,1
avg_regret,▁

0,1
avg_regret,412042.0


[34m[1mwandb[0m: Agent Starting Run: zejd4yrc with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 44.00
Episode 100	Average Score: 22.96
Episode 200	Average Score: 28.98
Episode 300	Average Score: 47.87
Episode 400	Average Score: 52.16
Episode 500	Average Score: 73.89
Episode 600	Average Score: 56.07
Episode 700	Average Score: 76.98
Episode 800	Average Score: 77.70
Episode 900	Average Score: 133.91


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,429933.0


[34m[1mwandb[0m: Agent Starting Run: 9sp265nb with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 49.00
Episode 100	Average Score: 24.48
Episode 200	Average Score: 31.49
Episode 300	Average Score: 40.89
Episode 400	Average Score: 56.63
Episode 500	Average Score: 62.53
Episode 600	Average Score: 65.57
Episode 700	Average Score: 76.63
Episode 800	Average Score: 72.51
Episode 900	Average Score: 71.93


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,440268.0


[34m[1mwandb[0m: Agent Starting Run: abgyga87 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 60.00
Episode 100	Average Score: 24.42
Episode 200	Average Score: 36.55
Episode 300	Average Score: 45.73
Episode 400	Average Score: 54.16
Episode 500	Average Score: 79.48
Episode 600	Average Score: 88.61
Episode 700	Average Score: 90.42
Episode 800	Average Score: 96.26
Episode 900	Average Score: 135.18


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,424380.0


[34m[1mwandb[0m: Agent Starting Run: ov8xlnfe with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 27.00
Episode 100	Average Score: 30.20
Episode 200	Average Score: 36.45
Episode 300	Average Score: 53.53
Episode 400	Average Score: 51.09
Episode 500	Average Score: 68.04
Episode 600	Average Score: 57.04
Episode 700	Average Score: 82.15
Episode 800	Average Score: 82.85
Episode 900	Average Score: 94.49


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,428737.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1ui008b6 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 27.00
Episode 100	Average Score: 22.80
Episode 200	Average Score: 34.50
Episode 300	Average Score: 50.05
Episode 400	Average Score: 51.30
Episode 500	Average Score: 83.62
Episode 600	Average Score: 58.25
Episode 700	Average Score: 95.39
Episode 800	Average Score: 112.54
Episode 900	Average Score: 102.89


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,429439.0


[34m[1mwandb[0m: Agent Starting Run: dnwkp75o with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.001


Episode 0	Average Score: 25.00
Episode 100	Average Score: 24.94
Episode 200	Average Score: 34.94
Episode 300	Average Score: 50.12
Episode 400	Average Score: 55.90
Episode 500	Average Score: 90.17
Episode 600	Average Score: 53.26
Episode 700	Average Score: 99.90
Episode 800	Average Score: 129.85
Episode 900	Average Score: 107.96


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,424466.0
