# MC REINFORCE

In [10]:
'''
A bunch of imports, you don't have to worry about these
'''

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [11]:
'''
Please refer to the first tutorial for more details on the specifics of environments
We've only added important commands you might find useful for experiments.
'''

'''
List of example environments
(Source - https://gym.openai.com/envs/#classic_control)

'Acrobot-v1'
'CartPole-v1'
'MountainCar-v0'
'''

# Build the CartPole environment and seed it so rollouts are repeatable.
env = gym.make('CartPole-v1')
env.seed(0)
# The action space draws from its own random generator, which env.seed() does
# not touch; seed it too so the random actions sampled below are reproducible
# after a kernel restart.
env.action_space.seed(0)

state_shape = env.observation_space.shape[0]
no_of_actions = env.action_space.n

print("State shape:", state_shape)
print("Number of Actions: ",no_of_actions)
print("Sampled Action",env.action_space.sample())
print("----")

# Understanding State, Action, Reward Dynamics
#
# The agent decides an action to take depending on the state.
#
# The environment keeps a variable specifically for the current state.
# - Every time an action is passed to the environment, it calculates the new
#   state and updates the current state variable.
# - It returns the new current state and the reward, so the agent can choose
#   its next action.

# env.reset() returns the initial state of a fresh episode.
state = env.reset()

print("Current_State: ",state)
print("----")

# Take a random action.
action = env.action_space.sample()

print("Sampled Action2: ", action)
print("----")

# env.step() advances the environment by one step: from the current state and
# the chosen action it produces the next state, the reward, the episode-done
# flag and an info dict.
next_state, reward, done, info = env.step(action)

print("Next_State: ",next_state)
print("Reward: ",reward)
print("Done: ", done)
print("Info: ", info)
print("----")


State shape: 4
Number of Actions:  2
Sampled Action 0
----
Current_State:  [-0.04456399  0.04653909  0.01326909 -0.02099827]
----
Sampled Action2:  0
----
Next_State:  [-0.04363321 -0.14877061  0.01284913  0.2758415 ]
Reward:  1.0
Done:  False
Info:  {}
----


# MC REINFORCE W/O BASELINE

In [12]:

import torch
import torch.nn as nn
import torch.nn.functional as F

import random
import torch
import numpy as np
from collections import deque, namedtuple

from scipy.special import softmax
from torch.distributions import Categorical

# Run on the first CUDA device when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Bunch of Hyper parameters (Which you might have to tune later)
#
# The values below are kept for reference only: they are not assigned to any
# variable in this cell.
# BUFFER_SIZE = int(1e5)  # replay buffer size
# BATCH_SIZE = 64         # minibatch size
# GAMMA = 0.99            # discount factor
# LR = 5e-4               # learning rate
# UPDATE_EVERY = 20       # how often to update the network (When Q target is present)

class Policy(nn.Module):
    """Feed-forward stochastic policy mapping a state to action probabilities.

    Architecture: ``Linear -> LeakyReLU``, then ``num_policy_layers - 1``
    further ``Linear -> LeakyReLU`` blocks of the same width, then a final
    linear layer whose outputs are normalised with a softmax.
    """

    def __init__(self, state_size, action_size, seed=0, num_policy_layers=1, num_policy_layer_units=64):
        """Initialize parameters and build model.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Number of discrete actions (size of the output)
            seed (int): Value passed to ``torch.manual_seed``; note this
                reseeds torch's global RNG, and the returned generator is
                stored on ``self.seed``
            num_policy_layers (int): Number of hidden layers; values below 1
                produce the same network as 1 (no extra hidden blocks)
            num_policy_layer_units (int): Number of nodes in every hidden layer
        """
        super(Policy, self).__init__()
        self.seed = torch.manual_seed(seed)
        activation = nn.LeakyReLU
        self.fc_start = nn.Sequential(nn.Linear(state_size, num_policy_layer_units), activation())
        # An empty nn.Sequential (num_policy_layers <= 1) passes its input through unchanged.
        self.fc_hidden = nn.Sequential(*[
            nn.Sequential(nn.Linear(num_policy_layer_units, num_policy_layer_units), activation())
            for _ in range(num_policy_layers - 1)
        ])
        self.fc_end = nn.Linear(num_policy_layer_units, action_size)

    def forward(self, state):
        """Return action probabilities for ``state``.

        The softmax is taken over the last dimension, so both a batch of
        states ``(batch, state_size)`` and a single unbatched state
        ``(state_size,)`` are accepted.
        """
        xs = self.fc_start(state)
        x_hidden = self.fc_hidden(xs)
        x_out = self.fc_end(x_hidden)
        return F.softmax(x_out, dim=-1)

    def act(self, state):
        """Sample an action for one state given as a numpy array.

        Returns
        =======
            (int, Tensor): the sampled action index and its log-probability
            (a tensor of shape ``(1,)`` that carries the gradient graph).
        """
        # Use the device the weights actually live on rather than a notebook
        # global, so the policy works whether or not it was moved with .to(...).
        param_device = next(self.parameters()).device
        state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
        probs = self.forward(state).cpu()
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)


In [13]:
def reinforce(policy, optimizer, n_episodes=1000, max_t=1000, gamma=0.99, print_every=100, environment=None):
    """Train ``policy`` with Monte-Carlo REINFORCE (no baseline).

    One gradient step is taken per episode on the loss
    ``sum_t -log_prob_t * G_t``, where ``G_t`` is the discounted return from
    step ``t`` to the end of the episode.

    Params
    ======
        policy: object exposing ``act(state) -> (action, log_prob)``
        optimizer: torch optimizer over the policy's parameters
        n_episodes (int): number of episodes to run
        max_t (int): maximum number of steps per episode
        gamma (float): discount factor
        print_every (int): print the running average score every this many episodes
        environment: environment to interact with; it must follow the API used
            here (``reset()`` returns the state, ``step(action)`` returns a
            4-tuple ``(state, reward, done, info)``). When ``None`` the
            notebook-level ``env`` is used.

    Returns
    =======
        list: undiscounted total reward of every episode, in order.
    """
    # Fall back to the notebook-level environment when none is supplied.
    active_env = env if environment is None else environment

    scores_deque = deque(maxlen=100)  # window for the printed running average
    scores = []
    for e in range(n_episodes):
        saved_log_probs = []
        rewards = []
        state = active_env.reset()
        # Collect trajectory
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = active_env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted returns in a single backward pass:
        # G_t = r_t + gamma * G_{t+1}, with G_T = 0.
        returns = []
        running_return = 0.0
        for step_reward in reversed(rewards):
            running_return = step_reward + gamma * running_return
            returns.append(running_return)
        returns.reverse()
        returns = torch.FloatTensor(returns)

        # A zero-step episode (e.g. max_t == 0) has nothing to learn from;
        # skip the update instead of failing on an empty torch.stack.
        if saved_log_probs:
            policy_loss = torch.stack(
                [-log_prob * ret for log_prob, ret in zip(saved_log_probs, returns)]
            ).sum()

            # Backpropagation and optimization step
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))

    return scores

# Final Draft

In [14]:
#!pip install wandb

In [15]:
import os

import wandb

# Never commit an API key to a notebook. The key is read from the
# WANDB_API_KEY environment variable; when it is unset (or empty) `key` is None
# and wandb falls back to its own credential lookup / interactive prompt.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked and rotated in the W&B account settings.
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\nayin\.netrc


True

In [16]:
def train_and_tune(config=None):
    """Run one sweep trial: train REINFORCE and log the resulting regret.

    Intended to be passed to ``wandb.agent``. The hyper-parameters
    ``policy_lr``, ``num_policy_layers`` and ``num_policy_layer_units`` are
    read from ``wandb.config`` after ``wandb.init``. A single metric,
    ``avg_regret``, is logged: the sum over episodes of
    ``max_episode_return - episode_reward``, averaged over ``num_exp``
    independent training runs.

    Params
    ======
        config: optional configuration forwarded to ``wandb.init``
    """
    num_exp = 1          # independent training runs per trial (also used as seeds 0..num_exp-1)
    max_episodes = 1000  # episodes per training run
    # Reference return used for the regret; presumably the maximum episode
    # return of CartPole-v1 - confirm before reusing with another environment.
    max_episode_return = 500

    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, this config will be set by Sweep Controller
        config = wandb.config
        wandb.run.name='plr'+str(config.policy_lr)+'-nlr'+str(config.num_policy_layers)+'-uts'+str(config.num_policy_layer_units)

        state_shape = env.observation_space.shape[0]
        action_shape = env.action_space.n

        # One row of per-episode scores for each independent run.
        total_rewards = np.zeros([num_exp, max_episodes])
        for i in range(num_exp):
            policy = Policy(
                state_size=state_shape,
                action_size=action_shape,
                seed=i,
                num_policy_layers=config.num_policy_layers,
                num_policy_layer_units=config.num_policy_layer_units,
            ).to(device)
            optimizer = optim.Adam(policy.parameters(), lr=config.policy_lr)
            total_rewards[i] = reinforce(policy, optimizer, n_episodes=max_episodes)

        # Cumulative shortfall per run, averaged across runs.
        Regret = np.mean(np.sum(max_episode_return - total_rewards, axis=1), axis=0)
        wandb.log({"avg_regret": Regret})

In [17]:
# Bayesian hyper-parameter search that minimises the "avg_regret" metric
# logged by train_and_tune, over learning rate, depth and width of the policy.
sweep_config = dict(
    method='bayes',
    metric=dict(name='avg_regret', goal='minimize'),
    parameters=dict(
        policy_lr=dict(values=[1e-2, 1e-3, 1e-4]),
        num_policy_layers=dict(values=[1, 2, 3]),
        num_policy_layer_units=dict(values=[64, 128, 256, 512]),
    ),
)
import pprint
pprint.pprint(sweep_config)
# Register the sweep with W&B; the returned id is what the agent subscribes to.
sweep_id = wandb.sweep(sweep_config, project="CS6700_PROGRAMMING_ASSIGNMENT_2")

{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'avg_regret'},
 'parameters': {'num_policy_layer_units': {'values': [64, 128, 256, 512]},
                'num_policy_layers': {'values': [1, 2, 3]},
                'policy_lr': {'values': [0.01, 0.001, 0.0001]}}}
Create sweep with ID: 4v1fcu0q
Sweep URL: https://wandb.ai/nayinisriharsh-iitm/CS6700_PROGRAMMING_ASSIGNMENT_2/sweeps/4v1fcu0q


{'method': 'bayes',
 'metric': {'goal': 'minimize', 'name': 'avg_regret'},
 'parameters': {'act_algorithm': {'values': ['softmax', 'epsilon']},
                'batch_size': {'values': [32, 64, 128]},
                'buffer_size': {'values': [100, 1000, 10000, 100000]},
                'epsilon_tau_decay': {'values': [0.995, 0.95, 0.9, 0.85]},
                'epsilon_tau_start': {'values': [1, 0.5]},
                'lr': {'values': [0.01, 0.001, 0.0001]},
                'num_common_layer_units': {'values': [128, 256, 512]},
                'num_common_layers': {'values': [1, 2, 3]},
                'type': {'values': [1]},
                'update_every': {'values': [10, 20, 30]}}}
Create sweep with ID: 29klzxlf
Sweep URL: https://wandb.ai/nayinisriharsh-iitm/CS6700_PROGRAMMING_ASSIGNMENT_2/sweeps/4v1fcu0q

In [18]:
# Launch a sweep agent that runs train_and_tune for 20 trials of this sweep.
wandb.agent(sweep_id, function=train_and_tune, count=20)

[34m[1mwandb[0m: Agent Starting Run: 7b8humcr with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 14.00
Episode 100	Average Score: 25.43
Episode 200	Average Score: 30.18
Episode 300	Average Score: 40.62
Episode 400	Average Score: 55.93
Episode 500	Average Score: 107.21
Episode 600	Average Score: 164.07
Episode 700	Average Score: 180.56
Episode 800	Average Score: 224.55
Episode 900	Average Score: 246.88


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,364570.0


[34m[1mwandb[0m: Agent Starting Run: 8b27nqbl with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 11.00
Episode 100	Average Score: 37.87
Episode 200	Average Score: 9.39
Episode 300	Average Score: 9.09
Episode 400	Average Score: 9.39
Episode 500	Average Score: 9.33
Episode 600	Average Score: 9.50
Episode 700	Average Score: 9.38
Episode 800	Average Score: 9.45
Episode 900	Average Score: 9.43


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.0419009193729744, max=1.0…

0,1
avg_regret,▁

0,1
avg_regret,487781.0


[34m[1mwandb[0m: Agent Starting Run: 1q4ldsr6 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 11.00
Episode 100	Average Score: 15.89
Episode 200	Average Score: 9.26
Episode 300	Average Score: 9.23
Episode 400	Average Score: 9.40
Episode 500	Average Score: 9.51
Episode 600	Average Score: 9.40
Episode 700	Average Score: 9.41
Episode 800	Average Score: 9.42
Episode 900	Average Score: 9.39


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04190369096441328, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,489973.0


[34m[1mwandb[0m: Agent Starting Run: 4zv8mf20 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 13.00
Episode 100	Average Score: 27.70
Episode 200	Average Score: 34.81
Episode 300	Average Score: 44.89
Episode 400	Average Score: 70.55
Episode 500	Average Score: 129.13
Episode 600	Average Score: 180.30
Episode 700	Average Score: 245.40
Episode 800	Average Score: 245.45
Episode 900	Average Score: 291.07


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04205809128630705, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,343453.0


[34m[1mwandb[0m: Agent Starting Run: 91ssg4q4 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 128
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 15.00
Episode 100	Average Score: 25.30
Episode 200	Average Score: 32.60
Episode 300	Average Score: 44.77
Episode 400	Average Score: 61.57
Episode 500	Average Score: 98.35
Episode 600	Average Score: 160.78
Episode 700	Average Score: 204.21
Episode 800	Average Score: 221.22
Episode 900	Average Score: 261.66


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,360167.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: qi7pfwb0 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 2
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 19.00
Episode 100	Average Score: 26.12
Episode 200	Average Score: 39.98
Episode 300	Average Score: 73.70
Episode 400	Average Score: 108.63
Episode 500	Average Score: 161.00
Episode 600	Average Score: 234.88
Episode 700	Average Score: 194.36
Episode 800	Average Score: 189.39
Episode 900	Average Score: 232.89


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.0420566952134369, max=1.0…

0,1
avg_regret,▁

0,1
avg_regret,345513.0


[34m[1mwandb[0m: Agent Starting Run: p9qsy9mw with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 21.00
Episode 100	Average Score: 29.58
Episode 200	Average Score: 53.49
Episode 300	Average Score: 112.34
Episode 400	Average Score: 182.29
Episode 500	Average Score: 109.44
Episode 600	Average Score: 210.77
Episode 700	Average Score: 331.14
Episode 800	Average Score: 402.45
Episode 900	Average Score: 469.36


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,264261.0


[34m[1mwandb[0m: Agent Starting Run: h6dx9128 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 512
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 52.00
Episode 100	Average Score: 36.90
Episode 200	Average Score: 104.18
Episode 300	Average Score: 160.11
Episode 400	Average Score: 24.51
Episode 500	Average Score: 157.15
Episode 600	Average Score: 249.52
Episode 700	Average Score: 218.69
Episode 800	Average Score: 46.94
Episode 900	Average Score: 234.17


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,361542.0


[34m[1mwandb[0m: Agent Starting Run: nnuf7k2k with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Episode 0	Average Score: 14.00
Episode 100	Average Score: 25.68
Episode 200	Average Score: 44.24
Episode 300	Average Score: 110.59
Episode 400	Average Score: 104.60
Episode 500	Average Score: 153.78
Episode 600	Average Score: 184.50
Episode 700	Average Score: 243.18
Episode 800	Average Score: 202.77
Episode 900	Average Score: 288.33


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04205529923324593, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,341597.0


[34m[1mwandb[0m: Agent Starting Run: ojht34ai with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 19.00
Episode 100	Average Score: 21.71
Episode 200	Average Score: 24.16
Episode 300	Average Score: 24.33
Episode 400	Average Score: 25.03
Episode 500	Average Score: 26.65
Episode 600	Average Score: 27.49
Episode 700	Average Score: 27.12
Episode 800	Average Score: 32.78
Episode 900	Average Score: 35.10


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,472109.0


[34m[1mwandb[0m: Agent Starting Run: tgagfr4w with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 256
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 14.00
Episode 100	Average Score: 25.34
Episode 200	Average Score: 36.53
Episode 300	Average Score: 50.84
Episode 400	Average Score: 84.19
Episode 500	Average Score: 81.06
Episode 600	Average Score: 107.79
Episode 700	Average Score: 149.28
Episode 800	Average Score: 265.95
Episode 900	Average Score: 362.41


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,349371.0


[34m[1mwandb[0m: Agent Starting Run: ou9ospga with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 3
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 12.00
Episode 100	Average Score: 20.72
Episode 200	Average Score: 20.03
Episode 300	Average Score: 27.30
Episode 400	Average Score: 29.52
Episode 500	Average Score: 33.37
Episode 600	Average Score: 49.01
Episode 700	Average Score: 77.16
Episode 800	Average Score: 136.55
Episode 900	Average Score: 185.17


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,418048.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zvh6m15o with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 512
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.0001


Episode 0	Average Score: 15.00
Episode 100	Average Score: 26.06
Episode 200	Average Score: 34.81
Episode 300	Average Score: 38.33
Episode 400	Average Score: 56.40
Episode 500	Average Score: 71.10
Episode 600	Average Score: 106.29
Episode 700	Average Score: 150.50
Episode 800	Average Score: 158.97
Episode 900	Average Score: 183.31


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,396819.0


[34m[1mwandb[0m: Agent Starting Run: 1lieulha with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Episode 0	Average Score: 22.00
Episode 100	Average Score: 96.21
Episode 200	Average Score: 172.25
Episode 300	Average Score: 210.59
Episode 400	Average Score: 225.07
Episode 500	Average Score: 102.72
Episode 600	Average Score: 110.22
Episode 700	Average Score: 107.61
Episode 800	Average Score: 99.92
Episode 900	Average Score: 98.98


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,366467.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3ly3dftb with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 15.00
Episode 100	Average Score: 161.60
Episode 200	Average Score: 102.05
Episode 300	Average Score: 286.87
Episode 400	Average Score: 186.63
Episode 500	Average Score: 232.79
Episode 600	Average Score: 379.52
Episode 700	Average Score: 370.93
Episode 800	Average Score: 434.64
Episode 900	Average Score: 426.65


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,230866.0


[34m[1mwandb[0m: Agent Starting Run: 25eyqr4e with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 20.00
Episode 100	Average Score: 66.05
Episode 200	Average Score: 147.92
Episode 300	Average Score: 102.94
Episode 400	Average Score: 88.11
Episode 500	Average Score: 116.51
Episode 600	Average Score: 91.28
Episode 700	Average Score: 96.62
Episode 800	Average Score: 116.21
Episode 900	Average Score: 173.25


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,389108.0


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ui2m2yqk with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

Episode 0	Average Score: 14.00
Episode 100	Average Score: 90.34
Episode 200	Average Score: 155.83
Episode 300	Average Score: 110.55
Episode 400	Average Score: 132.18
Episode 500	Average Score: 88.32
Episode 600	Average Score: 116.39
Episode 700	Average Score: 146.03
Episode 800	Average Score: 203.75
Episode 900	Average Score: 201.85


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,362214.0


[34m[1mwandb[0m: Agent Starting Run: td0qyns8 with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 15.00
Episode 100	Average Score: 127.84
Episode 200	Average Score: 59.61
Episode 300	Average Score: 111.53
Episode 400	Average Score: 15.29
Episode 500	Average Score: 73.79
Episode 600	Average Score: 91.48
Episode 700	Average Score: 94.20
Episode 800	Average Score: 106.46
Episode 900	Average Score: 97.21


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,395159.0


[34m[1mwandb[0m: Agent Starting Run: lbsifqit with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 14.00
Episode 100	Average Score: 115.96
Episode 200	Average Score: 89.16
Episode 300	Average Score: 128.73
Episode 400	Average Score: 229.78
Episode 500	Average Score: 54.77
Episode 600	Average Score: 347.83
Episode 700	Average Score: 131.71
Episode 800	Average Score: 160.13
Episode 900	Average Score: 264.69


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_regret,▁

0,1
avg_regret,323049.0


[34m[1mwandb[0m: Agent Starting Run: kfgimv8f with config:
[34m[1mwandb[0m: 	num_policy_layer_units: 64
[34m[1mwandb[0m: 	num_policy_layers: 1
[34m[1mwandb[0m: 	policy_lr: 0.01


Episode 0	Average Score: 15.00
Episode 100	Average Score: 113.67
Episode 200	Average Score: 111.15
Episode 300	Average Score: 148.02
Episode 400	Average Score: 128.63
Episode 500	Average Score: 119.52
Episode 600	Average Score: 149.97
Episode 700	Average Score: 167.10
Episode 800	Average Score: 134.50
Episode 900	Average Score: 144.42


VBox(children=(Label(value='0.001 MB of 0.029 MB uploaded\r'), FloatProgress(value=0.04187737564039002, max=1.…

0,1
avg_regret,▁

0,1
avg_regret,358978.0
