<a href="https://colab.research.google.com/github/MoustHolmes/AMAS_Project/blob/main/Wandb_sweep_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [None]:
!pip3 install box2d-py
!pip install wandb

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 14.3MB/s eta 0:00:01[K     |█▌                              | 20kB 14.8MB/s eta 0:00:01[K     |██▏                             | 30kB 10.5MB/s eta 0:00:01[K     |███                             | 40kB 12.2MB/s eta 0:00:01[K     |███▋                            | 51kB 7.7MB/s eta 0:00:01[K     |████▍                           | 61kB 8.4MB/s eta 0:00:01[K     |█████▏                          | 71kB 8.4MB/s eta 0:00:01[K     |█████▉                          | 81kB 8.5MB/s eta 0:00:01[K     |██████▋                         | 92kB 8.9MB/s eta 0:00:01[K     |███████▎                        | 102kB 7.5MB/s eta 0:00:01[K     |████████                        | 112kB 7.5MB/s eta 0:00:01[K     |████████▊                       |

In [None]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Define the Q Learning Agent

In [None]:
%%writefile DQN_Agent.py
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    """Fully connected network that maps a state to one Q-value per action.

    The module builds three linear layers (input -> fc1 -> fc2 -> actions),
    owns its Adam optimizer and MSE loss, and moves itself to ``cuda:0`` when
    CUDA is available, otherwise to the CPU.

    Args:
        lr: learning rate handed to the Adam optimizer.
        input_dims: sequence that is unpacked into the first layer's input size.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        n_actions: number of outputs (one Q-value per action).
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims,
            n_actions):
        super().__init__()

        # Keep the layer sizes on the instance for later inspection.
        self.input_dims, self.n_actions = input_dims, n_actions
        self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims

        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) Q-value estimates for ``state``."""
        hidden = self.fc1(state)
        hidden = self.fc2(F.relu(hidden))
        return self.fc3(F.relu(hidden))

class Agent():
    """Double-DQN agent with epsilon-greedy exploration and a replay buffer.

    Two ``DeepQNetwork`` instances are kept: ``Q_eval`` is trained on every
    call to :meth:`learn`, and ``Q_next`` is a target copy that is refreshed
    from ``Q_eval`` every ``replace_target`` learning steps.

    Args:
        gamma: discount factor applied to the bootstrapped next-state value.
        lr: learning rate passed to both networks.
        input_dims: sequence describing the observation shape.
        n_actions: number of discrete actions.
        epsilon: initial probability of picking a random action.
        fc1_dims: width of the first hidden layer.
        fc2_dims: width of the second hidden layer.
        eps_min: lower bound for ``epsilon``.
        eps_dec: amount subtracted from ``epsilon`` after each learning step.
        max_mem_size: capacity of the replay buffer (oldest entries are
            overwritten once it is full).
        batch_size: number of transitions sampled per learning step.
        replace_target: number of learning steps between target-network syncs.
    """

    def __init__(self, gamma, lr, 
            input_dims, n_actions,
            epsilon = 1,
            fc1_dims = 256 , fc2_dims = 256,
            eps_min =0.05, eps_dec =5e-4,
            max_mem_size =100000, batch_size = 64, replace_target =100):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions))
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0    # total number of transitions stored so far
        self.iter_cntr = 0   # number of completed learning steps
        self.replace_target = replace_target
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims

        self.Q_eval = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims,
                                    fc1_dims= fc1_dims, fc2_dims =fc2_dims)
        self.Q_next = DeepQNetwork(lr, n_actions=n_actions, input_dims=input_dims,
                                    fc1_dims= fc1_dims, fc2_dims =fc2_dims)
        # Start the target network as an exact copy of the online network so
        # the first targets are not produced by unrelated random weights.
        self.Q_next.load_state_dict(self.Q_eval.state_dict())

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # The `np.bool` alias was removed from NumPy; the builtin `bool`
        # yields the same dtype on every NumPy version.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, terminal):
        """Write one transition into the replay buffer.

        The write position wraps around, so once ``mem_size`` transitions have
        been stored the oldest ones are overwritten.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = terminal

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest ``Q_eval`` output is returned.
        """
        if np.random.random() > self.epsilon:
            # Cast to float32 so the observation matches the network weights,
            # then add a leading batch dimension of size one.
            state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                                device=self.Q_eval.device).unsqueeze(0)
            with T.no_grad():  # action selection needs no gradients
                actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        """Run one optimisation step on a random mini-batch.

        Does nothing until at least ``batch_size`` transitions are stored.
        After the step, ``epsilon`` is decayed (never below ``eps_min``) and
        the target network is refreshed every ``replace_target`` steps.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        state_batch = T.as_tensor(self.state_memory[batch], device=device)
        new_state_batch = T.as_tensor(self.new_state_memory[batch], device=device)
        reward_batch = T.as_tensor(self.reward_memory[batch], device=device)
        terminal_batch = T.as_tensor(self.terminal_memory[batch], device=device)
        # Index with long tensors on the network's device.
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long,
                                   device=device)
        batch_index = T.arange(self.batch_size, device=device)

        q_pred = self.Q_eval.forward(state_batch)[batch_index, action_batch]

        # The target is a constant for this step: the online network picks the
        # next action and the target network scores it. Building it without
        # autograd avoids accumulating unused gradients on Q_next.
        with T.no_grad():
            q_next = self.Q_next.forward(new_state_batch)
            q_next[terminal_batch] = 0.0  # no bootstrapping past a terminal state
            max_actions = T.argmax(self.Q_eval.forward(new_state_batch), dim=1)
            # q_target = reward_batch + self.gamma*T.max(q_next,dim=1)[0]
            q_target = reward_batch + self.gamma * q_next[batch_index, max_actions]

        self.Q_eval.optimizer.zero_grad()
        loss = self.Q_eval.loss(q_pred, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        # Clamp so a decay step can never push epsilon below its floor.
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

        if self.iter_cntr % self.replace_target == 0:
            self.Q_next.load_state_dict(self.Q_eval.state_dict())

    def print_args(self):
        """Print the agent's hyperparameters, one per line."""
        print('gamma    : ' +str(self.gamma))
        print('lr       : ' +str(self.lr))
        print('eps_min  : ' +str(self.eps_min))
        print('eps_dec  : ' +str(self.eps_dec))
        print('fc1_dims : ' +str(self.fc1_dims))
        print('fc2_dims : ' +str(self.fc2_dims))
        print('replace_t: ' +str(self.replace_target))

Overwriting DQN_Agent.py


## Train and Test

In [None]:
%%writefile main.py

import numpy as np
import wandb
from DQN_Agent import Agent
import argparse
import gym 
import pprint 

def main():
    """Run one training job configured by the module-level ``args``.

    Starts a wandb run, builds the LunarLander environment and a matching
    agent, trains it, and logs the two summary scores returned by ``train``.
    Reads the parsed command-line options from the global name ``args``.
    """
    wandb.init(project='AMAS_Project', config=args)
    pprint.pprint(args)
    env = gym.make('LunarLander-v2')
    try:
        # Take the network's input/output sizes from the environment instead
        # of hard-coding them.
        agent = Agent( gamma = args.gamma,
                      eps_min = args.eps_min, eps_dec = args.eps_dec,
                      lr=args.lr,
                      replace_target = args.replace_target,
                      fc1_dims = args.fc1_dims, fc2_dims = args.fc2_dims,
                      n_actions=int(env.action_space.n),
                      input_dims=list(env.observation_space.shape),
                      batch_size=64)
        agent.print_args()

        best_avg_score, weighted_best_score = train(env, agent, episodes = args.episodes, avg_len = args.avg_len, burn_in_time = args.burn_in_time)
    finally:
        # Release the environment even if training raises.
        env.close()

    wandb.log({'best_avg_score': best_avg_score,'weighted_best_score':weighted_best_score})

def train(env, agent, episodes=500, avg_len = 50, burn_in_time = 50):
    """Train ``agent`` on ``env`` and log per-episode results to wandb.

    Every environment step is stored in the agent's replay buffer and followed
    by one call to ``agent.learn()``.

    Args:
        env: gym-style environment. Both the 4-tuple ``step`` API and the
            5-tuple ``(obs, reward, terminated, truncated, info)`` API are
            accepted, as is ``reset`` returning ``obs`` or ``(obs, info)``.
        agent: object providing ``choose_action``, ``store_transition`` and
            ``learn``.
        episodes: number of episodes to run.
        avg_len: window length of the running average score.
        burn_in_time: the best average only counts episodes whose index is
            greater than this; the weighted score only counts episodes at or
            after this index.

    Returns:
        ``(best_score, weighted_best_score)``. ``best_score`` is the highest
        running average seen after burn-in. ``weighted_best_score`` is the
        maximum of ``score / episode_index`` over the post-burn-in episodes.
        Either value is ``-inf`` when no episode qualifies.
    """
    scores = []
    best_score = float('-inf')

    for i in range(episodes):
        score = 0
        done = False
        reset_out = env.reset()
        # Newer gym versions return (observation, info) from reset().
        if (isinstance(reset_out, tuple) and len(reset_out) == 2
                and isinstance(reset_out[1], dict)):
            observation = reset_out[0]
        else:
            observation = reset_out
        while not done:
            action = agent.choose_action(observation)
            step_out = env.step(action)
            if len(step_out) == 5:
                # Newer API: a time-limit truncation ends the episode but is
                # not stored as a terminal state.
                observation_, reward, terminated, truncated, _ = step_out
                done = terminated or truncated
            else:
                observation_, reward, done, _ = step_out
                terminated = done
            score += reward
            agent.store_transition(observation, action, reward, 
                                    observation_, terminated)
            agent.learn()
            observation = observation_
        scores.append(score)

        avg_score = np.mean(scores[-avg_len:])
        if avg_score > best_score and i > burn_in_time:
          best_score = avg_score

        print('episode ', i, 'score %.1f' % score,
            'average score %.1f' % avg_score, 'best score:%.1f' %best_score)
  
        # 'episodes' (the configured total) is kept for existing dashboards;
        # 'episode' is the index of the episode that just finished.
        wandb.log({'Scores': score,'Avg_Score': avg_score, 'episodes': episodes,
                   'episode': i})

    # Weight each score by 1 / episode index. Index 0 is skipped to avoid a
    # division by zero, and an empty post-burn-in window yields -inf instead
    # of crashing in np.max.
    first = max(burn_in_time, 1)
    tail = np.asarray(scores[first:], dtype=float)
    if tail.size:
        weighted_best_score = np.max(tail / np.arange(first, first + tail.size))
    else:
        weighted_best_score = float('-inf')
    print(weighted_best_score)
    return best_score, weighted_best_score

def argumentParser():
    """Build the command-line parser for a training run.

    Every option has a default, so the script can be started without
    arguments; a wandb sweep passes the swept values as ``--name=value``.

    Returns:
        argparse.ArgumentParser: parser whose help output shows the defaults.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--eps_min', default=0.1, type=float, help='Lower bound for epsilon, the probability of choosing a random action')
    parser.add_argument('--eps_dec', default=5e-7, type=float, help='Amount subtracted from epsilon after every learning step')
    parser.add_argument('--lr', default=0.0001, type=float, help='Learning Rate')
    parser.add_argument('--gamma', default=0.95, type=float, help='Discounting Factor')
    parser.add_argument('--episodes', default=400, type=int, help='number of episodes')
    parser.add_argument('--burn_in_time', default=50, type=int, help='number of episodes before calculating avg score')
    parser.add_argument('--avg_len', default=50, type=int, help='number of episodes avg is calculated over')
    parser.add_argument('--replace_target', default=200, type=int, help='number of learning steps between target network updates')
    parser.add_argument('--fc1_dims', default=256, type=int, help='size of first fully connected layer in the network')
    parser.add_argument('--fc2_dims', default=256, type=int, help='size of second fully connected layer in the network')

    return parser

if __name__ == '__main__':
  # An assignment at module level already creates a global, so main() can
  # read the parsed options through the module-level name `args`.
  args = argumentParser().parse_args()
  main()

Overwriting main.py


ArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.ArgumentDefaultsHelpFormatter'>, conflict_handler='error', add_help=True)


In [None]:
!python3 main.py

Namespace(Episodes=250, avg_len=50, burn_in_time=50, eps_dec=0.01, eps_min=0.01, fc1_dims=256, fc2_dims=256, gamma=0.95, lr=0.001, replace_target=100)
[34m[1mwandb[0m: Currently logged in as: [33mmoustholmes[0m (use `wandb login --relogin` to force relogin)
2021-03-23 23:24:37.716248: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: Tracking run with wandb version 0.10.23
[34m[1mwandb[0m: Syncing run [33mglamorous-sponge-90[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project/runs/1hdnoszw[0m
[34m[1mwandb[0m: Run data is saved locally in /content/wandb/run-20210323_232436-1hdnoszw
[34m[1mwandb[0m: Run `wandb offline` to turn off syncing.

Namespace(Episodes=250, avg_len=50, burn_in_time=50, eps_dec=0.01, eps_min=0.01, fc1_dims=256, fc2_dims=256, gamma=0.9

## Sweep for Hyperparameter tuning

In [None]:
%%writefile sweep.yaml
# Bayesian hyperparameter sweep over the DQN settings accepted by main.py.
project: "AMAS_Project_DQN"
program: main.py
method: bayes
metric:
  # Must match a key passed to wandb.log; main.py logs 'best_avg_score'.
  name: best_avg_score
  goal: maximize
parameters:
  lr:
    values: [0.01, 0.001, 0.005, 0.0001, 0.0005, 0.00001]
  gamma:
    values: [0.999, 0.99, 0.9, 0.8]
  eps_dec:
    values: [0.001, 0.005, 0.0001, 0.0005, 0.00001,0.000001]
  eps_min:
    values: [0.2, 0.1, 0.08, 0.05, 0.001]
  fc1_dims:
    values: [64, 128, 256, 512, 768, 1024]
  fc2_dims:
    values: [64, 128, 256, 512, 768, 1024]
  replace_target:
    values: [5, 10, 50, 100, 150, 200, 500, 1000]

Overwriting sweep.yaml


In [None]:
!wandb sweep sweep.yaml

[34m[1mwandb[0m: Creating sweep from: sweep.yaml
[34m[1mwandb[0m: Created sweep with ID: [33m77sc7tfe[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project/sweeps/77sc7tfe[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent moustholmes/AMAS_Project/77sc7tfe[0m


In [None]:
!wandb agent moustholmes/AMAS_Project/77sc7tfe

[1;30;43mStreaming af output blev afkortet til de sidste 5000 linjer.[0m
episode  211 score -18.0 average score -28.8 best score:-25.0
episode  212 score -187.5 average score -31.9 best score:-25.0
episode  213 score -95.6 average score -33.7 best score:-25.0
episode  214 score -73.7 average score -34.9 best score:-25.0
episode  215 score -41.3 average score -35.1 best score:-25.0
episode  216 score 80.4 average score -33.7 best score:-25.0
episode  217 score -101.0 average score -34.7 best score:-25.0
episode  218 score -37.1 average score -37.0 best score:-25.0
episode  219 score 24.2 average score -36.6 best score:-25.0
episode  220 score 83.8 average score -34.6 best score:-25.0
episode  221 score -3.5 average score -34.7 best score:-25.0
episode  222 score -203.7 average score -39.8 best score:-25.0
episode  223 score -56.3 average score -39.8 best score:-25.0
episode  224 score -20.4 average score -40.6 best score:-25.0
episode  225 score -23.9 average score -39.4 best score:-2