<a href="https://colab.research.google.com/github/MoustHolmes/AMAS_Project/blob/main/DDPG_Wandb_sweep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [2]:
!pip3 install box2d-py
!pip install wandb



In [3]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Define the DDPG Agent (actor/critic networks, exploration noise, replay buffer)

In [4]:
%%writefile DDPG_Agent.py
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os

class CriticNetwork(nn.Module):
    """Critic of the DDPG agent: maps a (state, action) batch to one value per row.

    The state goes through two linear layers, each followed by LayerNorm; the
    action goes through a single linear layer of width ``fc2_dims``.  The two
    branches are summed, passed through ReLU and projected to a scalar.

    Args:
        beta: learning rate of the Adam optimizer owned by this network.
        input_dims: shape tuple of one observation; unpacked into ``nn.Linear``.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer (and of the action branch).
        n_actions: number of action components.
        name: identifier used to build the checkpoint file names.
        chkpt_dir: directory the checkpoints are written to / read from.
    """

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, n_actions, name,
                 chkpt_dir='/content/gdrive/My Drive'):
        super(CriticNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name

        # checkpoint_dir is a real directory (honouring chkpt_dir) and
        # checkpoint_file is the file inside it.  With the default chkpt_dir the
        # file path is the same '<dir>/<name>_2.pt' used by earlier runs.
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name + '_2.pt')

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)

        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.action_value = nn.Linear(self.n_actions, self.fc2_dims)

        self.q = nn.Linear(self.fc2_dims, 1)

        # Uniform initialisation bounds.
        # NOTE(review): weight.size()[0] is the layer's *output* width; the DDPG
        # paper scales by fan-in.  Kept as-is so training behaviour is unchanged.
        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        # Small fixed bound for the output layer.
        f3 = 0.003
        self.q.weight.data.uniform_(-f3, f3)
        self.q.bias.data.uniform_(-f3, f3)

        f4 = 1./np.sqrt(self.action_value.weight.data.size()[0])
        self.action_value.weight.data.uniform_(-f4, f4)
        self.action_value.bias.data.uniform_(-f4, f4)

        self.optimizer = optim.Adam(self.parameters(), lr=beta,
                                    weight_decay=0.01)

        # Fall back to the CPU when CUDA is unavailable; the previous 'cuda:1'
        # fallback made construction fail on machines without a GPU.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state, action):
        """Return the critic's value for each (state, action) row."""
        state_value = self.fc1(state)
        state_value = self.bn1(state_value)
        state_value = F.relu(state_value)
        state_value = self.fc2(state_value)
        state_value = self.bn2(state_value)
        action_value = self.action_value(action)
        # The non-linearity is applied after merging the two branches.
        state_action_value = F.relu(T.add(state_value, action_value))
        state_action_value = self.q(state_action_value)

        return state_action_value

    def save_checkpoint(self):
        """Write the current weights to ``self.checkpoint_file``."""
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from ``self.checkpoint_file`` onto this network's device."""
        print('... loading checkpoint ...')
        # map_location lets a checkpoint saved on a GPU be restored on a CPU host.
        self.load_state_dict(T.load(self.checkpoint_file,
                                    map_location=self.device))

    def save_best(self):
        """Write the current weights to '<checkpoint_dir>/<name>_best'."""
        print('... saving best checkpoint ...')
        checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
        T.save(self.state_dict(), checkpoint_file)

class ActorNetwork(nn.Module):
    """Actor of the DDPG agent: maps a state batch to actions in [-1, 1].

    Two linear layers, each followed by LayerNorm and ReLU, feed a final linear
    layer of width ``n_actions`` whose output is squashed by ``tanh``.

    Args:
        alpha: learning rate of the Adam optimizer owned by this network.
        input_dims: shape tuple of one observation; unpacked into ``nn.Linear``.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        n_actions: number of action components produced.
        name: identifier used to build the checkpoint file names.
        chkpt_dir: directory the checkpoints are written to / read from.
    """

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, n_actions, name,
                 chkpt_dir='/content/gdrive/My Drive'):
        super(ActorNetwork, self).__init__()
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.name = name

        # checkpoint_dir is a real directory (honouring chkpt_dir) and
        # checkpoint_file is the file inside it.  With the default chkpt_dir the
        # file path is the same '<dir>/<name>_2.pt' used by earlier runs.
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, name + '_2.pt')

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)

        self.bn1 = nn.LayerNorm(self.fc1_dims)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        self.mu = nn.Linear(self.fc2_dims, self.n_actions)

        # Uniform initialisation bounds (fc2 is drawn before fc1, as before, so
        # the random stream is consumed in the same order).
        # NOTE(review): weight.size()[0] is the layer's *output* width; the DDPG
        # paper scales by fan-in.  Kept as-is so training behaviour is unchanged.
        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-f2, f2)
        self.fc2.bias.data.uniform_(-f2, f2)

        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-f1, f1)
        self.fc1.bias.data.uniform_(-f1, f1)

        # Small fixed bound for the output layer.
        f3 = 0.003
        self.mu.weight.data.uniform_(-f3, f3)
        self.mu.bias.data.uniform_(-f3, f3)

        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

        # Fall back to the CPU when CUDA is unavailable; the previous 'cuda:1'
        # fallback made construction fail on machines without a GPU.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')

        self.to(self.device)

    def forward(self, state):
        """Return the tanh-squashed action for each state row."""
        x = self.fc1(state)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = T.tanh(self.mu(x))

        return x

    def save_checkpoint(self):
        """Write the current weights to ``self.checkpoint_file``."""
        print('... saving checkpoint ...')
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Load weights from ``self.checkpoint_file`` onto this network's device."""
        print('... loading checkpoint ...')
        # map_location lets a checkpoint saved on a GPU be restored on a CPU host.
        self.load_state_dict(T.load(self.checkpoint_file,
                                    map_location=self.device))

    def save_best(self):
        """Write the current weights to '<checkpoint_dir>/<name>_best'."""
        print('... saving best checkpoint ...')
        checkpoint_file = os.path.join(self.checkpoint_dir, self.name+'_best')
        T.save(self.state_dict(), checkpoint_file)


class OUActionNoise:
    """Temporally correlated exploration noise (discretised Ornstein-Uhlenbeck process).

    Each call advances the internal state by one step of size ``dt``: the state
    is pulled towards ``mu`` at rate ``theta`` and perturbed by Gaussian noise
    scaled by ``sigma * sqrt(dt)``.  ``reset`` returns the state to ``x0`` (or
    to zeros shaped like ``mu`` when no ``x0`` was given).
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        previous = self.x_prev
        # Mean-reverting pull towards mu.
        drift = self.theta * (self.mu - previous) * self.dt
        # Random kick; one normal draw per component of mu.
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        current = previous + drift + diffusion
        self.x_prev = current
        return current

    def reset(self):
        """Restart the process from ``x0``, or from zeros if ``x0`` is None."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

class ReplayBuffer():
    """Fixed-capacity ring buffer of (state, action, reward, next state, done) transitions.

    Args:
        max_size: number of transitions kept; older entries are overwritten.
        input_shape: shape tuple of a single observation.
        n_actions: number of action components stored per transition.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        # Total number of transitions ever stored (not capped at mem_size).
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape))
        self.new_state_memory = np.zeros((self.mem_size, *input_shape))
        self.action_memory = np.zeros((self.mem_size, n_actions))
        self.reward_memory = np.zeros(self.mem_size)
        # Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20
        # and removed in 1.24, where it raises AttributeError.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Store one transition, overwriting the oldest slot once the buffer is full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` randomly chosen transitions.

        Indices are drawn with ``np.random.choice`` (with replacement) from the
        filled part of the buffer.

        Returns:
            Tuple ``(states, actions, rewards, states_, dones)`` of arrays whose
            first dimension is ``batch_size``.
        """
        max_mem = min(self.mem_cntr, self.mem_size)

        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones


class Agent():
    """DDPG agent bundling actor/critic networks, their targets, noise and replay memory.

    Args:
        input_dims: shape tuple of one observation.
        n_actions: number of action components.
        alpha: actor learning rate.
        beta: critic learning rate.
        tau: interpolation factor used when moving the target networks towards
            the online networks (1 copies the online weights outright).
        gamma: discount factor applied to the target critic's value.
        output_bounds: factor the actor's tanh output is multiplied by.
        fc1_dims: width of the first hidden layer of every network.
        fc2_dims: width of the second hidden layer of every network.
        noise_sigma: sigma of the exploration noise before scaling.
        batch_size: number of transitions sampled per learning step.
        max_size: capacity of the replay buffer.
    """

    def __init__(self, input_dims, n_actions,
                 alpha, beta, tau, gamma=0.99,
                 output_bounds=1,
                 fc1_dims=400, fc2_dims=300,
                 noise_sigma = 0.15,
                 batch_size=256,
                 max_size=1000000):
        # output bounds are my addition
        self.output_bounds = output_bounds
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.gamma = gamma
        self.noise_sigma = noise_sigma
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.batch_size = batch_size

        self.memory = ReplayBuffer(max_size, input_dims, n_actions)

        # Hyper-parameter tag appended to every network name (and therefore to
        # the checkpoint file names).
        var_names = str(alpha) +'_'+ str(beta) +'_'+ str(gamma) +'_' +str(fc1_dims) +'_'+ str(fc2_dims) +'_'+ str(noise_sigma)

        # The noise is scaled to match the output bounds.
        # NOTE(review): choose_action multiplies (mu + noise) by output_bounds
        # again, so the noise is scaled twice when output_bounds != 1 — confirm
        # this is intended.
        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma = noise_sigma *output_bounds)

        self.actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='actor' +var_names)
        self.critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions=n_actions, name='critic' +var_names)

        self.target_actor = ActorNetwork(alpha, input_dims, fc1_dims, fc2_dims,
                                n_actions = n_actions, name = 'target_actor' +var_names)

        self.target_critic = CriticNetwork(beta, input_dims, fc1_dims, fc2_dims,
                                n_actions = n_actions, name = 'target_critic' +var_names)

        # tau=1 makes the targets start as exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def _actor_output(self, state):
        """Run the actor in eval mode without autograd, then restore train mode."""
        self.actor.eval()
        with T.no_grad():
            mu = self.actor.forward(state)
        self.actor.train()
        return mu

    def _single_state(self, observation):
        """Turn one observation into a batch of size 1 on the actor's device."""
        obs = np.asarray(observation, dtype=np.float32)[np.newaxis, ...]
        return T.as_tensor(obs, dtype=T.float).to(self.actor.device)

    def choose_action(self, observation):
        """Return the actor's action for one observation with exploration noise added."""
        mu = self._actor_output(self._single_state(observation))
        mu_prime = mu + T.tensor(self.noise(),
                                 dtype=T.float).to(self.actor.device)

        return mu_prime.cpu().detach().numpy()[0]*self.output_bounds

    def choose_action_no_noise(self, observation):#my addition
        """Return the actor's action for one observation without exploration noise."""
        mu = self._actor_output(self._single_state(observation))
        return mu.cpu().detach().numpy()[0]*self.output_bounds

    def choose_action_no_noise_batch(self, observation):#my addition
        """Return noise-free actions for an already batched array of observations."""
        state = T.tensor(observation, dtype=T.float).to(self.actor.device)
        mu = self._actor_output(state)
        return mu.cpu().detach().numpy()*self.output_bounds

    def remember(self, state, action, reward, state_, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def save_models(self):
        """Checkpoint all four networks."""
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        """Restore all four networks from their checkpoints."""
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        """Run one critic update, one actor update and one target-network update.

        Does nothing until the replay buffer has seen at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        states, actions, rewards, states_, done = \
                self.memory.sample_buffer(self.batch_size)

        device = self.actor.device
        states = T.tensor(states, dtype=T.float).to(device)
        states_ = T.tensor(states_, dtype=T.float).to(device)
        actions = T.tensor(actions, dtype=T.float).to(device)
        rewards = T.tensor(rewards, dtype=T.float).to(device)
        done = T.tensor(done).to(device)

        # The TD target is a constant for the critic update, so it is built
        # without autograd; otherwise gradients are pushed into the target
        # networks on every step for no purpose.
        with T.no_grad():
            target_actions = self.target_actor.forward(states_)
            critic_value_ = self.target_critic.forward(states_, target_actions)
            # No bootstrapping from terminal next-states.
            critic_value_[done] = 0.0
            critic_value_ = critic_value_.view(-1)

            target = rewards + self.gamma*critic_value_
            target = target.view(self.batch_size, 1)

        critic_value = self.critic.forward(states, actions)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        # The actor is trained to maximise the critic's value of its own actions.
        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        """Move the target networks towards the online networks.

        Every target parameter becomes ``tau * online + (1 - tau) * target``.
        Previously the blended values were computed but never written back, so
        the targets kept their random initial weights for the whole run.

        Args:
            tau: interpolation factor; defaults to ``self.tau``.
        """
        if tau is None:
            tau = self.tau

        pairs = ((self.actor, self.target_actor),
                 (self.critic, self.target_critic))

        with T.no_grad():
            for online_net, target_net in pairs:
                # Both networks are built with identical arguments, so their
                # parameters line up one-to-one.
                for online_param, target_param in zip(online_net.parameters(),
                                                      target_net.parameters()):
                    target_param.copy_(tau*online_param + (1-tau)*target_param)

    def print_args(self):
        """Print the agent's hyper-parameters."""
        print('alpha    : ' +str(self.alpha))
        print('beta     : ' +str(self.beta))
        print('gamma    : ' +str(self.gamma))
        print('tau      : ' +str(self.tau))
        print('noise_sig: ' +str(self.noise_sigma))
        print('fc1_dims : ' +str(self.fc1_dims))
        print('fc2_dims : ' +str(self.fc2_dims))

Writing DDPG_Agent.py


## Train and Test

In [8]:
%%writefile main.py

import numpy as np
import wandb
from DDPG_Agent import Agent
import argparse
import gym 
import pprint 

def main():
    """Train one DDPG agent on LunarLanderContinuous-v2 and log the results to W&B.

    Reads the hyper-parameters from the module-level ``args`` namespace, which
    must be populated before this function is called.
    """
    wandb.init(project='AMAS_Project_DDPG', config=args)
    pprint.pprint(args)
    env = gym.make('LunarLanderContinuous-v2')
    # Take the action dimension from the environment instead of hard-coding it;
    # the previous literal 4 did not match this environment's 2-component
    # action space, so the networks carried unused action outputs.
    n_actions = env.action_space.shape[0]
    agent = Agent(alpha = args.alpha, beta = args.beta,
                tau = args.tau, gamma = args.gamma,
                fc1_dims = args.fc1_dims, fc2_dims=args.fc2_dims,
                batch_size = args.batch_size,
                n_actions=n_actions, input_dims = env.observation_space.shape)

    agent.print_args()

    best_avg_score, weighted_best_score = train(env, agent, episodes = args.episodes, avg_len = args.avg_len, burn_in_time = args.burn_in_time)

    # Summary metrics of the whole run.
    wandb.log({'best_avg_score': best_avg_score,'weighted_best_score':weighted_best_score})

def train(env, agent, episodes=500, avg_len = 50, burn_in_time = 50):
    """Run training episodes, learning after every environment step.

    Args:
        env: environment exposing ``reset()`` (returning an observation) and
            ``step(action)`` (returning observation, reward, done, info).
        agent: object providing ``noise.reset``, ``choose_action``,
            ``remember`` and ``learn``.
        episodes: number of episodes to run.
        avg_len: number of most recent episodes averaged into the running score.
        burn_in_time: episodes skipped before the best scores are tracked.

    Returns:
        Tuple ``(best_score, weighted_best_score)``.  ``best_score`` is the
        highest running average seen after the burn-in.  ``weighted_best_score``
        is the maximum of ``score / episode_index`` over the post-burn-in
        episodes.  Either value is ``-inf`` when no episode qualified.
    """
    scores = []
    best_score = float('-inf')

    for i in range(episodes):
        score = 0
        done = False
        observation = env.reset()
        agent.noise.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward,
                                    observation_, done)
            agent.learn()
            observation = observation_
        scores.append(score)

        avg_score = np.mean(scores[-avg_len:])
        if avg_score > best_score and i > burn_in_time:
            best_score = avg_score

        print('episode ', i, 'score %.1f' % score,
            'average score %.1f' % avg_score, 'best score:%.1f' %best_score)

        # NOTE(review): 'episodes' logs the configured total, not the current
        # episode index — confirm that is what the dashboard expects.
        wandb.log({'Scores': score,'Avg_Score': avg_score, 'episodes': episodes})

    # Episode 0 is always excluded so the score is never divided by zero, and
    # an empty post-burn-in window yields -inf instead of making np.max raise.
    first = max(burn_in_time, 1)
    late_scores = np.array(scores[first:])
    if late_scores.size:
        weighted_best_score = np.max(late_scores/np.arange(len(scores))[first:])
    else:
        weighted_best_score = float('-inf')
    print(weighted_best_score)
    return best_score, weighted_best_score

def argumentParser():
    """Build the command-line parser for the training hyper-parameters.

    Returns:
        argparse.ArgumentParser whose options mirror the sweep parameters
        (alpha, beta, gamma, tau, fc1_dims, fc2_dims) plus the run-length,
        averaging and batch-size settings.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--alpha',        default = 0.0001, type=float, help='actor learning rate')
    parser.add_argument('--beta',         default = 0.001, type=float, help='critic learning rate')
    parser.add_argument('--gamma',        default = 0.99, type=float, help='discount factor')
    parser.add_argument('--tau',          default = 0.001, type=float, help='soft-update factor for the target networks')
    parser.add_argument('--episodes',     default = 2000, type=int, help='number of episodes')
    parser.add_argument('--burn_in_time', default = 50, type=int, help='number of episodes before the best average score is tracked')
    parser.add_argument('--avg_len',      default = 50, type=int, help='number of episodes the average score is calculated over')
    parser.add_argument('--fc1_dims',     default = 400, type=int, help='size of the first fully connected layer in the networks')
    parser.add_argument('--fc2_dims',     default = 300, type=int, help='size of the second fully connected layer in the networks')
    parser.add_argument('--batch_size',   default = 64, type=int, help='number of transitions sampled from the replay buffer per learning step')

    return parser

if __name__ == '__main__':
    # main() reads the hyper-parameters from the module-level name ``args``,
    # so the parsed namespace has to be bound before main() is called.
    cli_parser = argumentParser()
    args = cli_parser.parse_args()
    main()

Overwriting main.py


In [None]:
!python3 main.py

[34m[1mwandb[0m: Currently logged in as: [33mmoustholmes[0m (use `wandb login --relogin` to force relogin)
2021-03-26 00:42:53.015620: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: Tracking run with wandb version 0.10.23
[34m[1mwandb[0m: Syncing run [33mlucky-cloud-143[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_DDPG[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_DDPG/runs/2m5qks6o[0m
[34m[1mwandb[0m: Run data is saved locally in /content/wandb/run-20210326_004252-2m5qks6o
[34m[1mwandb[0m: Run `wandb offline` to turn off syncing.

Namespace(alpha=2.5e-05, avg_len=50, batch_size=64, beta=0.00025, burn_in_time=50, episodes=9000, fc1_dims=400, fc2_dims=300, gamma=0.99, tau=0.001)
alpha    : 2.5e-05
beta     : 0.00025
gamma    : 0.99
tau      : 0.001
noise_sig: 0.15
fc1_dims : 400
fc2_dims : 3

## Sweep for Hyperparameter tuning

In [6]:
%%writefile sweep.yaml
# Bayesian hyper-parameter sweep over the DDPG settings accepted by main.py.
project: "AMAS_Project_DDPG"
program: main.py
method: bayes
metric:
  # Must match a key that main.py passes to wandb.log; it logs
  # 'best_avg_score' and 'weighted_best_score', never 'best_score'.
  name: best_avg_score
  goal: maximize
parameters:
  # Actor learning rate.
  alpha:
    values: [0.0001, 0.0005, 0.000025, 0.00005, 0.00001, 0.000005]
  # Critic learning rate.
  beta:
    values: [0.001, 0.005, 0.00025, 0.0005, 0.0001, 0.00005]
  # Discount factor.
  gamma:
    values: [0.999, 0.99, 0.9, 0.5]
  # Target-network update factor.
  tau:
    values: [ 0.008, 0.005, 0.002, 0.001, 0.0008, 0.0005, 0.0001]
  # Hidden layer widths.
  fc1_dims:
    values: [128, 256, 384, 512]
  fc2_dims:
    values: [128, 256, 384, 512]

Writing sweep.yaml


In [None]:
!wandb sweep sweep.yaml

[34m[1mwandb[0m: Creating sweep from: sweep.yaml
[34m[1mwandb[0m: Created sweep with ID: [33m1rluf6s2[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_DDPG/sweeps/1rluf6s2[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent moustholmes/AMAS_Project_DDPG/1rluf6s2[0m


In [None]:
!wandb agent moustholmes/AMAS_Project_DDPG/xt7dgddd

[34m[1mwandb[0m: Starting wandb agent 🕵️
2021-03-25 10:56:10,786 - wandb.wandb_agent - INFO - Running runs: []
2021-03-25 10:56:11,075 - wandb.wandb_agent - INFO - Agent received command: run
2021-03-25 10:56:11,075 - wandb.wandb_agent - INFO - Agent starting run with config:
	alpha: 2.5e-05
	beta: 0.0001
	fc1_dims: 256
	fc2_dims: 384
	gamma: 0.99
	tau: 0.0005
2021-03-25 10:56:11,076 - wandb.wandb_agent - INFO - About to run command: /usr/bin/env python main.py --alpha=2.5e-05 --beta=0.0001 --fc1_dims=256 --fc2_dims=384 --gamma=0.99 --tau=0.0005
[34m[1mwandb[0m: Currently logged in as: [33mmaskel[0m (use `wandb login --relogin` to force relogin)
2021-03-25 10:56:13.951101: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: Tracking run with wandb version 0.10.23
[34m[1mwandb[0m: Syncing run [33mlively-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moustholmes