<a href="https://colab.research.google.com/github/MoustHolmes/AMAS_Project/blob/main/PG_Wandb_sweep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [4]:
!pip3 install box2d-py
!pip install wandb



In [7]:
!wandb login

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Define the Q Learning Agent

In [8]:
%%writefile Agent.py
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class PolicyNetwork(nn.Module):
    def __init__(self, lr, input_dims, n_actions, fc1_dims, fc2_dims):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class Agent():
    def __init__(self, lr, fc1_dims, fc2_dims, input_dims, n_actions, gamma=0.99):
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.gamma = gamma
        self.lr = lr
        self.reward_memory = []
        self.action_memory = []
        self.policy = PolicyNetwork(self.lr, input_dims, n_actions, fc1_dims=fc1_dims, fc2_dims= fc2_dims)
    
    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.policy.device)
        probabilities = F.softmax(self.policy.forward(state))
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.action_memory.append(log_probs)
        return action.item()
    
    def store_rewards(self, reward):
        self.reward_memory.append(reward)
    
    def learn(self):
        self.policy.optimizer.zero_grad()
        
        # Reward to go:
        # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3
        # G_t = sum from k=0 to k=T {gamma**k * R_t+k+1}
        G = np.zeros_like(self.reward_memory, dtype=np.float64)
        for t in range(len(self.reward_memory)):
            G_sum = 0
            discount = 1 #discount factor
            for k in range(t, len(self.reward_memory)):
                G_sum += self.reward_memory[k] * discount
                discount *= self.gamma
            G[t] = G_sum
        G = T.tensor(G, dtype=T.float).to(self.policy.device)
        
        loss = 0
        for g, logprob in zip(G, self.action_memory):
            loss += -g * logprob
        loss.backward()
        self.policy.optimizer.step()
        self.action_memory = []
        self.reward_memory = []

    def print_args(self):
        print('lr       : ' +str(self.lr))
        print('gamma    : ' +str(self.gamma))
        print('fc1_dims : ' +str(self.fc1_dims))
        print('fc2_dims : ' +str(self.fc2_dims))

Overwriting Agent.py


## Train and Test

In [9]:
%%writefile main.py

import numpy as np
import wandb
from Agent import Agent
import argparse
import gym 
import pprint 

def main():
    wandb.init(project='AMAS_Project_Policy_Gradient', config=args)
    pprint.pprint(args)
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=args.gamma, lr=args.lr,
                  fc1_dims = args.fc1_dims, fc2_dims = args.fc2_dims,
                  input_dims = [8], n_actions = 4)
    agent.print_args()
    
    best_avg_score, weighted_best_score = train(env, agent, episodes = args.episodes, avg_len = args.avg_len, burn_in_time = args.burn_in_time)

    wandb.log({'best_avg_score': best_avg_score,'weighted_best_score':weighted_best_score})

def train(env, agent, episodes=500, avg_len = 50, burn_in_time = 50):
    """The play function runs iterations and updates Q-values if desired."""

    
    scores, eps_history = [], []
    best_score = float('-inf')

    for i in range(episodes):
        score = 0
        done = False
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_rewards(reward)
            agent.learn()
            observation = observation_
        scores.append(score)
        #ep_history.append(.epsilon)

        avg_score = np.mean(scores[-avg_len:])
        if avg_score > best_score and i > burn_in_time:
          best_score = avg_score

        print('episode ', i, 'score %.1f' % score,
            'average score %.1f' % avg_score, 'best score:%.1f' %best_score)
  
        wandb.log({'Scores': score,'Avg_Score': avg_score, 'episodes': episodes})

    weighted_best_score =np.max( np.array(scores[burn_in_time:])/np.arange(len(scores))[burn_in_time:]) 
    print(weighted_best_score)
    return best_score, weighted_best_score

def argumentParser():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--gamma',        default = 0.99, type=float, help='Discounting Factor')
    parser.add_argument('--lr',          default = 0.001, type=float, help='Learning Rate')
    parser.add_argument('--episodes',     default=2500, type=int, help='number of episodes')
    parser.add_argument('--burn_in_time', default=50, type=int, help='number of episodes before calculating avg score')
    parser.add_argument('--avg_len',      default=50, type=int, help='number of episodes  is calculated over')
    parser.add_argument('--fc1_dims',     default=256, type=int, help='size of first fully conected layer in the netork')
    parser.add_argument('--fc2_dims',     default=256, type=int, help='ize of second fully conected layer in the netork')
    return parser

if __name__ == '__main__':
  global args
  args = argumentParser().parse_args()
  main()

Writing main.py


ArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.ArgumentDefaultsHelpFormatter'>, conflict_handler='error', add_help=True)


In [None]:
!python3 main.py

[34m[1mwandb[0m: Currently logged in as: [33mmoustholmes[0m (use `wandb login --relogin` to force relogin)
2021-03-25 02:31:42.047954: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: Tracking run with wandb version 0.10.23
[34m[1mwandb[0m: Syncing run [33mvolcanic-feather-7[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_Policy_Gradient[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_Policy_Gradient/runs/if56usc8[0m
[34m[1mwandb[0m: Run data is saved locally in /content/wandb/run-20210325_023141-if56usc8
[34m[1mwandb[0m: Run `wandb offline` to turn off syncing.

Namespace(avg_len=50, burn_in_time=50, episodes=700, fc1_dims=256, fc2_dims=256, gamma=0.99, lr=0.001)
lr       : 0.001
gamma    : 0.99
fc1_dims : 256
fc2_dims : 256
  probabilities = F.softmax(self.policy.forward(state))
episode  0 score

## Sweep for Hyperparameter tuning

In [None]:
%%writefile sweep.yaml
project: "AMAS_Project_Policy_Gradient"
program: main.py
method: bayes
metric:
  name: best_score
  goal: maximize
parameters:
  lr:
    values: [ 0.005, 0.0001, 0.0005, 0.000025, 0.00001]
  gamma:
    values: [0.999, 0.99, 0.9, 0.5]
  fc1_dims:
    values: [64, 128, 256, 512]
  fc2_dims:
    values: [64, 128, 256, 512]

Overwriting sweep.yaml


In [None]:
!wandb sweep sweep.yaml

[34m[1mwandb[0m: Creating sweep from: sweep.yaml
[34m[1mwandb[0m: Created sweep with ID: [33m77sc7tfe[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project/sweeps/77sc7tfe[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent moustholmes/AMAS_Project/77sc7tfe[0m


In [None]:
!wandb agent moustholmes/AMAS_Project_Policy_Gradient/iheysljy

[34m[1mwandb[0m: Starting wandb agent 🕵️
2021-03-25 10:32:25,161 - wandb.wandb_agent - INFO - Running runs: []
2021-03-25 10:32:25,452 - wandb.wandb_agent - INFO - Agent received command: run
2021-03-25 10:32:25,452 - wandb.wandb_agent - INFO - Agent starting run with config:
	fc1_dims: 16
	fc2_dims: 1024
	gamma: 0.001
	lr: 0.0005
2021-03-25 10:32:25,453 - wandb.wandb_agent - INFO - About to run command: /usr/bin/env python main.py --fc1_dims=16 --fc2_dims=1024 --gamma=0.001 --lr=0.0005
[34m[1mwandb[0m: Currently logged in as: [33mmhp27[0m (use `wandb login --relogin` to force relogin)
2021-03-25 10:32:28.248432: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[34m[1mwandb[0m: Tracking run with wandb version 0.10.23
[34m[1mwandb[0m: Syncing run [33mwhole-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/moustholmes/AMAS_Project_Policy_Gradient[0m
[34m[1mwandb[0m: 🧹 View s