In [6]:
#Import required libraries

import argparse
import wandb
import gym
import numpy as np
from itertools import count
from collections import namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [7]:
#Set constants for training
seed = 543 #constant
gamma = 0.99 #constant
lr_optim = 1e-3 #tunable

torch.manual_seed(seed)
log_interval = 10


In [8]:
class Network(nn.Module):
    def __init__(self,hidden_size,state_shape = 4,action_size = 2 ):
        super(Network, self).__init__()
        self.affine1 = nn.Linear(state_shape, int(hidden_size))
        self.action_head = nn.Linear(int(hidden_size), action_size)
        self.saved_actions = []
        self.episode_rewards = []
        self.episode_states = []

    def forward(self, x):
        x = F.relu(self.affine1(x))
        action_prob = F.softmax(self.action_head(x), dim=-1)
        return action_prob


In [9]:

class REINFORCE_MCB:

    def __init__(self,hidden,lr):
        env = gym.make('CartPole-v1')
        self.env = env
        self.episodes = 500
        env.reset()
        state_shape = env.observation_space.shape[0]
        no_of_actions = env.action_space.n
        model = Network(hidden,state_shape,no_of_actions)
        self.model = model
        self.optimizer = optim.Adam(model.parameters(), lr)
        self.max_len = 10000
        
    def select_action(self,state):
        state = torch.from_numpy(state).float()
        probs = self.model(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item()

    def update(self,rewards, states, actions):
        G = 0
        gamma = 0.99
        self.optimizer.zero_grad()
        for i in reversed(range(len(rewards))):  
            reward = rewards[i]
            state = torch.tensor(states[i].reshape(1, -1),
                                 dtype=torch.float)
            action = torch.tensor(actions[i]).view(-1, 1)
            log_prob = torch.log(self.model(state).gather(1, action))  
            G = gamma * G + reward  
            loss = -log_prob * G       
            loss.backward()             
        self.optimizer.step()
        del self.model.episode_rewards[:]
        del self.model.saved_actions[:]
        del self.model.episode_states[:]
     
    def train(self):
        total_reward = []
        avg_reward = []
        running_reward = 10
        # run infinitely many episodes
        for i_episode in range(self.episodes):
            # reset environment and episode reward
            state, _ = self.env.reset()
            self.model.episode_states.append(state)
            ep_reward = 0
            for t in range(1, self.max_len):
            # select action from policy
                action = self.select_action(state)
                state, reward, done, _, _ = self.env.step(action)
                self.model.episode_states.append(state)
                self.model.episode_rewards.append(reward)
                self.model.saved_actions.append(action)
                ep_reward += reward
                if done:
                   break 
            total_reward.append(ep_reward)
            self.update(self.model.episode_rewards, self.model.episode_states, self.model.saved_actions)   
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            avg_reward.append(running_reward)

            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
            if np.mean(avg_reward)>= self.env.spec.reward_threshold or i_episode >400:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(avg_reward)))
                break
        return avg_reward
    
    def PerformExpmt(self,num_expmt):
        reward_avgs = []
        for i in range(num_expmt):  
            print("Experiment: %d"%(i+1))
            rewards = self.train()   
            reward_avgs.append(np.asarray(rewards))
        reward_avgs_mean = np.mean(np.array(reward_avgs), axis=0)
        reward_avgs_std = np.std(reward_avgs, axis=0)
        plt.figure(figsize=(10, 6))
        plt.plot(range(self.episodes), reward_avgs_mean, label='Reward Avg', color='blue')
        plt.fill_between(range(self.episodes), reward_avgs_mean - reward_avgs_std, reward_avgs_mean + reward_avgs_std, alpha=0.3, color='blue')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.legend()
        plt.savefig('withoutbaseline.png')
        plt.show()
        return reward_avgs_mean, reward_avgs_std
       

Training

In [10]:
hidden = 128
lr = 1e-3
reinforce = REINFORCE_MCB(hidden,lr)
avg_reward = reinforce.train()

Episode 0	Last reward: 9.00	Average reward: 9.95
Episode 1	Last reward: 23.00	Average reward: 10.60
Episode 2	Last reward: 29.00	Average reward: 11.52
Episode 3	Last reward: 17.00	Average reward: 11.80
Episode 4	Last reward: 32.00	Average reward: 12.81
Episode 5	Last reward: 21.00	Average reward: 13.22
Episode 6	Last reward: 17.00	Average reward: 13.41
Episode 7	Last reward: 19.00	Average reward: 13.69
Episode 8	Last reward: 20.00	Average reward: 14.00
Episode 9	Last reward: 13.00	Average reward: 13.95
Episode 10	Last reward: 20.00	Average reward: 14.25
Episode 11	Last reward: 32.00	Average reward: 15.14
Episode 12	Last reward: 9.00	Average reward: 14.83
Episode 13	Last reward: 17.00	Average reward: 14.94
Episode 14	Last reward: 22.00	Average reward: 15.29
Episode 15	Last reward: 35.00	Average reward: 16.28
Episode 16	Last reward: 35.00	Average reward: 17.22
Episode 17	Last reward: 18.00	Average reward: 17.26
Episode 18	Last reward: 45.00	Average reward: 18.64
Episode 19	Last reward: 5

Running Expmt

In [11]:
avg_reward = reinforce.train(2)

TypeError: REINFORCE_MCB.train() takes 1 positional argument but 2 were given

Tuning

In [None]:
def tune(hidden,lr):
    reinforce = REINFORCE_MCB(hidden,lr)
    avg_reward = reinforce.train()
    return avg_reward


sweep_config = {
    'method': 'bayes'
    }

metric = {
    'name': 'loss',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric
parameters_dict = {
    'optimizer': {
        'values': ['adam']
        },
    'hidden': {
        # a flat distribution between 0 and 0.1
        "values": [64, 128, 256],
      },
      'lr': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0,
        'max': 0.01
      }
    }

sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo")

def tuner(config=sweep_config):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        avg_reward = tune(config.hidden,config.lr)
        for epoch in range(len(avg_reward)):
            wandb.log({"Score": avg_reward[epoch], "epoch": epoch})   

wandb.agent(sweep_id, tuner, count=5)