In [None]:

import numpy as np
import wandb
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import namedtuple, deque
import torch.optim as optim
import datetime
import gym
#from gym.wrappers.record_video import RecordVideo
import glob
import io
import base64
import matplotlib.pyplot as plt
from IPython.display import HTML
from IPython import display as ipythondisplay
from PIL import Image

In [None]:
# Authenticate with Weights & Biases; the sweep and runs created further down
# in this notebook (wandb.sweep / wandb.init / wandb.agent) go through this client.
wandb.login()

In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F


# Module-level hyperparameters (candidates for later tuning).
BUFFER_SIZE = 100_000   # replay buffer capacity (number of stored transitions)
GAMMA = 0.99            # discount factor applied to bootstrapped targets
LR = 5e-4               # learning rate



class QNetwork1(nn.Module):
    """Dueling Q-network: a shared two-layer trunk feeding separate value and
    advantage heads that are recombined into per-action Q-values."""

    def __init__(self,fc1_units, fc2_units,type,state_size, action_size):
        """Initialize parameters and build model.
        Params
        ======
            fc1_units (int): Number of nodes in first hidden layer
            fc2_units (int): Number of nodes in second hidden layer
            type (int): Aggregation rule for the two heads. ``1`` subtracts the
                mean advantage; any other value subtracts the max advantage.
            state_size (int): Dimension of each state
            action_size (int): Number of discrete actions
        """
        super(QNetwork1, self).__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.advantage_fc = nn.Linear(fc2_units, action_size)
        self.value_fc = nn.Linear(fc2_units, 1)
        self.type = type

    def forward(self, state):
        """Map a batch of states (one state per row) to per-action Q-values."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        advantage = self.advantage_fc(x)
        value = self.value_fc(x)
        if self.type == 1:
            baseline = advantage.mean(dim=1, keepdim=True)
        else:
            # The max must be taken per sample (over the action axis). A global
            # torch.max(advantage) would couple every row to the largest
            # advantage anywhere in the batch.
            baseline = advantage.max(dim=1, keepdim=True)[0]
        return value + (advantage - baseline)
    
    

In [None]:
import random
import torch
import numpy as np
from collections import deque, namedtuple

# Prefer the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

class ReplayBuffer:
    """Fixed-size FIFO buffer of experience tuples used for experience replay.

    Once ``buffer_size`` experiences are stored, each new one evicts the oldest.
    """

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Params
        ======
            action_size (int): Number of actions (stored, not used by the buffer itself)
            buffer_size (int): Maximum number of experiences kept
            batch_size (int): Number of experiences returned by ``sample``
            seed: Seed applied to Python's global ``random`` generator
        """
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed the global RNG first and then
        # remember the seed value itself instead of the call's return value.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample ``batch_size`` experiences without replacement.

        Returns a tuple ``(states, actions, rewards, next_states, dones)`` of
        tensors on ``device``; each field is stacked row-wise with ``np.vstack``.
        ``actions`` is a long tensor, all the others are float tensors.
        Raises ``ValueError`` (from ``random.sample``) if fewer than
        ``batch_size`` experiences are stored.
        """
        experiences = random.sample(self.memory, k=self.batch_size)

        def column(field):
            # One row per sampled experience for the requested field.
            return np.vstack([getattr(e, field) for e in experiences])

        states = torch.from_numpy(column("state")).float().to(device)
        actions = torch.from_numpy(column("action")).long().to(device)
        rewards = torch.from_numpy(column("reward")).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        # Booleans are converted to 0/1 before becoming floats.
        dones = torch.from_numpy(column("done").astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [None]:
from scipy.special import softmax

class TutorialAgent():
    """DQN-style agent: a local Q-network trained from replayed experience and
    a target Q-network that is hard-synced to it every ``update_rate`` steps."""

    def __init__(self,lr,Batch_Size,update_rate,type,state_size, action_size,fc1 =128, fc2= 64, seed = 0):
        """Build the networks, optimizer and replay buffer.

        Params
        ======
            lr (float): Adam learning rate
            Batch_Size (int or float): Replay batch size (truncated to int)
            update_rate (int or float): Steps between target-network syncs (truncated to int)
            type (int): Aggregation mode forwarded to ``QNetwork1``
            state_size (int): Dimension of each state
            action_size (int): Number of discrete actions
            fc1, fc2 (int): Hidden layer widths
            seed: Seed applied to Python's global ``random`` generator
        """
        self.state_size = state_size
        self.action_size = action_size
        # Use the same integer batch size for the "enough data?" check in
        # step() and for the buffer's sample size.
        self.batch_size = int(Batch_Size)
        # random.seed() returns None; seed first, then keep the seed itself.
        random.seed(seed)
        self.seed = seed
        self.qnetwork_local = QNetwork1(int(fc1),int(fc2),type,state_size, action_size).to(device)
        self.qnetwork_target = QNetwork1(int(fc1),int(fc2),type,state_size, action_size).to(device)
        # Start the target network from the local network's weights so that the
        # first bootstrapped targets do not come from an unrelated random net.
        self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, self.batch_size, seed)
        self.t_step = 0
        self.update_rate = int(update_rate)

    def step(self, state, action, reward, next_state, done):
        """Store one transition, learn from a sampled batch once enough
        transitions are stored, and sync the target network every
        ``update_rate`` calls."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) >= self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)
        self.t_step = (self.t_step + 1) % self.update_rate
        if self.t_step == 0:
            self.qnetwork_target.load_state_dict(self.qnetwork_local.state_dict())

    def act(self, state, policy, hyp=0.):
        """Choose an action for ``state`` (a NumPy array).

        ``policy`` is ``"eps greedy"`` (``hyp`` is the exploration probability)
        or ``"softmax"`` (``hyp`` is the temperature). A non-positive softmax
        temperature selects the greedy action instead of dividing by zero.
        Raises ``ValueError`` for any other policy name.
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        q_values = action_values.cpu().data.numpy().flatten()

        if policy == "eps greedy":
            if random.random() > hyp:
                return np.argmax(q_values)
            return random.choice(np.arange(self.action_size))

        if policy == "softmax":
            if hyp <= 0:
                # Zero-temperature limit of the softmax policy is the greedy action.
                return np.argmax(q_values)
            action_probs = softmax(q_values / hyp)
            return np.random.choice(np.arange(self.action_size), p=action_probs)

        raise ValueError(
            "Unknown policy {!r}; expected 'eps greedy' or 'softmax'".format(policy)
        )

    def learn(self, experiences, gamma):
        """Run one gradient step on a batch of replayed experiences.

        ``experiences`` is the ``(states, actions, rewards, next_states, dones)``
        tuple; ``gamma`` is the discount factor.
        """
        states, actions, rewards, next_states, dones = experiences
        # Bootstrapped target from the target network; (1 - dones) removes the
        # bootstrap term for transitions flagged as done.
        Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.qnetwork_local(states).gather(1, actions)
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.qnetwork_local.parameters():
            # Parameters that received no gradient have .grad == None; skip them.
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

class DuelingDQN:
    """Training/experiment driver that runs an agent in a gym-style environment
    whose ``reset()`` returns ``(obs, info)`` and whose ``step()`` returns
    ``(obs, reward, terminated, truncated, info)``."""

    def __init__(self, env, policy, agent, episodes=4000, max_t=10000, early_stop_after=350):
        """
        Params
        ======
            env: Environment to interact with
            policy (str): Policy name forwarded to ``agent.act``
            agent: Object providing ``act(state, policy, hyp=...)`` and
                ``step(state, action, reward, next_state, done)``
            episodes (int): Upper bound on the number of training episodes
            max_t (int): Upper bound on steps per episode
            early_stop_after (int): Training stops once the episode index
                exceeds this value
        """
        self.env = env
        self.episodes = episodes
        self.max_t = max_t
        self.early_stop_after = early_stop_after
        self.policy = policy
        self.agent = agent

    def train(self):
        """Train ``self.agent`` and return the running average score.

        Returns a list with one entry per completed episode: the mean score
        over the most recent (up to 100) episodes.
        """
        hyp_start = 1.0
        hyp_end = 0.01
        hyp_decay = 0.995
        hyp = hyp_start
        scores_window = deque(maxlen=100)
        avg_rewards = []
        for i_episode in range(1, self.episodes + 1):
            state, _ = self.env.reset()
            score = 0
            for t in range(self.max_t):
                action = self.agent.act(state, self.policy, hyp=hyp)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                # Only true termination is stored as `done`: a time-limit
                # truncation should not zero out the bootstrap term.
                self.agent.step(state, action, reward, next_state, terminated)
                state = next_state
                score += reward
                # The episode is over in both cases; ignoring `truncated`
                # would keep stepping an environment that needs a reset.
                if terminated or truncated:
                    break
            scores_window.append(score)
            # Decay the exploration parameter (epsilon or softmax temperature).
            hyp = max(hyp_end, hyp_decay * hyp)
            running_avg = np.mean(scores_window)
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, running_avg), end="")
            avg_rewards.append(running_avg)
            if i_episode % 10 == 0:
                print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, running_avg))
            #if np.mean(scores_window)>= self.env.spec.reward_threshold or i_episode >350:
            if i_episode > self.early_stop_after:
                # This is an episode cap, not a solved-criterion check.
                print('\nStopping after {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, running_avg))
                break
        return avg_rewards

    def regret(self, rewards):
        """Return the summed gap between ``env.spec.reward_threshold`` and each
        entry of ``rewards``."""
        threshold = self.env.spec.reward_threshold
        return sum(threshold - r for r in rewards)

    def PerformExpmt(self, num_expmt, type="reward_avgs"):
        """Run ``train`` ``num_expmt`` times and plot mean +/- std of the curves.

        ``self.agent`` is not re-created between experiments, so every run
        after the first continues from the previous run's weights and buffer.
        The figure is saved as ``"{type}.png"``. Returns ``(mean, std)`` arrays
        with one entry per trained episode.
        """
        reward_avgs = []
        for i in range(num_expmt):
            print("Experiment: %d"%(i+1))
            rewards = self.train()
            reward_avgs.append(np.asarray(rewards))
        reward_avgs = np.array(reward_avgs)
        reward_avgs_mean = np.mean(reward_avgs, axis=0)
        reward_avgs_std = np.std(reward_avgs, axis=0)
        # train() may stop well before self.episodes, so the x-axis has to
        # follow the number of episodes that were actually run.
        episodes_run = range(len(reward_avgs_mean))
        plt.figure(figsize=(10, 6))
        plt.plot(episodes_run, reward_avgs_mean, label='Reward Avg', color='blue')
        plt.fill_between(episodes_run, reward_avgs_mean - reward_avgs_std, reward_avgs_mean + reward_avgs_std, alpha=0.3, color='blue')
        plt.xlabel('Episode')
        plt.ylabel('Average score (last 100 episodes)')
        plt.legend()
        plt.savefig(f"{type}.png")
        plt.show()
        return reward_avgs_mean, reward_avgs_std


Training

In [None]:
env = gym.make("Acrobot-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
lr = 0.004891
batch = 64
update = 20
# Train one agent per aggregation mode. QNetwork1 uses the mean-advantage
# baseline when the mode is 1 and the max-advantage baseline otherwise, so
# 0 -> max and 1 -> mean. The mode is passed to the agent (so the two runs
# really differ) and reused as the label of the saved plot.
for dueling_type in range(2):
    agent = TutorialAgent(lr, batch, update, dueling_type, state_size, action_size, seed=0)
    DDQN = DuelingDQN(env, "softmax", agent)
    DDQN.PerformExpmt(2, dueling_type)


Running Experiments

In [None]:
# PerformExpmt needs a label for the saved figure ("<label>.png"); calling it
# with only the experiment count raises TypeError.
DDQN.PerformExpmt(2, "rerun")

Tuning

In [None]:
def Objective(lr,batch,update):
    """Train a fresh agent on Acrobot-v1 with the given hyperparameters.

    Params
    ======
        lr (float): Learning rate passed to the agent
        batch: Replay batch size passed to the agent
        update: Target-network update interval passed to the agent

    Returns the list of running-average scores produced by ``DuelingDQN.train``.
    """
    env = gym.make("Acrobot-v1")
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        dueling_type = 1  # mean-advantage aggregation (QNetwork1 uses max for any other value)
        agent = TutorialAgent(lr, batch, update, dueling_type, state_size, action_size, seed=0)
        DDQN = DuelingDQN(env, "softmax", agent)
        return DDQN.train()
    finally:
        # A sweep calls this once per run; release the environment each time.
        env.close()

sweep_config = {
    'method': 'bayes'
    }

# The runs log "Score" (running average episode score, higher is better) and
# never log a "loss" value, so the sweep has to optimize the metric that exists.
metric = {
    'name': 'Score',
    'goal': 'maximize'
    }

sweep_config['metric'] = metric

# Only hyperparameters that the training function actually reads are swept.
# (The agent always builds an Adam optimizer, so an 'optimizer' choice would
# only add an irrelevant search dimension.)
parameters_dict = {
    'lr': {
        # continuous uniform distribution between 1e-5 and 1e-2
        'distribution': 'uniform',
        'min': 0.00001,
        'max': 0.01
      },
    'batch': {
        # integer-valued: replay batch size between 40 and 80
        'distribution': 'int_uniform',
        'min': 40,
        'max': 80
      },
    'update': {
        # integer-valued: target-network update interval between 20 and 30 steps
        'distribution': 'int_uniform',
        'min': 20,
        'max': 30
      }
    }

sweep_config['parameters'] = parameters_dict
sweep_id = wandb.sweep(sweep_config, project="pytorch-sweeps-demo")

def train(config=sweep_config):
    """Execute one W&B run: train with the run's hyperparameters and log the
    resulting score curve.

    When launched through ``wandb.agent`` the effective configuration is
    supplied by the sweep controller and read back from ``wandb.config``.
    """
    with wandb.init(config=config):
        run_config = wandb.config
        scores = Objective(run_config.lr, run_config.batch, run_config.update)

        # One log entry per training episode, in order.
        for epoch, score in enumerate(scores):
            wandb.log({"Score": score, "epoch": epoch})

# Start a sweep agent for `sweep_id` that invokes `train` once per run;
# `count=3` caps the number of runs this agent executes.
wandb.agent(sweep_id, train, count=3)