In [1]:
from unityagents import UnityEnvironment
import numpy as np

In [2]:
# Launch the Unity environment from the Banana executable at this relative path
env = UnityEnvironment(
    file_name="Banana_Windows_x86_64/Banana_Windows_x86_64/Banana.exe",
)

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


In [3]:
# Use the first brain listed by the environment as the default one:
# keep its name (used to index the per-brain results of env.reset / env.step)
# and the matching brain object (used to read the action-space size).
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# Show which brain was selected above
print(brain_name)

BananaBrain


In [5]:
# Reset the environment in training mode and keep the info for the selected brain
env_info = env.reset(train_mode=True)[brain_name]

# Derive the problem dimensions: the action count comes from the brain,
# the state comes from the first entry of the vector observations
action_size = brain.vector_action_space_size
state = env_info.vector_observations[0]
state_size = len(state)

# Report the findings (agents, actions, a sample state and its length)
print('Number of agents:', len(env_info.agents))
print('Number of actions:', action_size)
print('States look like:', state)
print('States have length:', state_size)

Number of agents: 1
Number of actions: 4
States look like: [1.         0.         0.         0.         0.84408134 0.
 0.         1.         0.         0.0748472  0.         1.
 0.         0.         0.25755    1.         0.         0.
 0.         0.74177343 0.         1.         0.         0.
 0.25854847 0.         0.         1.         0.         0.09355672
 0.         1.         0.         0.         0.31969345 0.
 0.        ]
States have length: 37


In [6]:
# Import required packages
import sys
import os
import random
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import deque,namedtuple
import matplotlib.pyplot as plt
%matplotlib inline


In [7]:
# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [8]:
# Our environment: reset it with train_mode=False and cache the problem dimensions
env_info = env.reset(train_mode=False)[brain_name]
state = env_info.vector_observations[0]
state_size = len(state)
action_size = brain.vector_action_space_size


In [12]:
# The model consists of two fully connected hidden layers (32 and 64 units) followed by an output layer

class RLAgent(nn.Module):
    """Fully connected policy network whose parameters are loaded from a flat vector.

    The network maps a state vector to one output per action through two hidden
    layers (32 and 64 units, ReLU) and a tanh-squashed output layer.  Parameters
    are not trained by back-propagation here; they are overwritten from a flat
    NumPy vector via ``set_weights``.

    Note: ``evaluate`` relies on the module-level names ``env``, ``brain_name``
    and ``device`` being defined before it is called.
    """

    def __init__(self,brain,env_info):
        """Size the layers from the environment.

        Params
        ======
            brain: object exposing ``vector_action_space_size`` (number of actions)
            env_info: object exposing ``vector_observations``; the length of its
                first entry is used as the state size
        """
        super(RLAgent,self).__init__()

        self.s_size = len(env_info.vector_observations[0])   # size of one state vector
        self.a_size = brain.vector_action_space_size         # number of available actions
        self.h1_size = 32
        self.h2_size = 64

        self.fc1 = nn.Linear(self.s_size, self.h1_size)
        self.fc2 = nn.Linear(self.h1_size, self.h2_size)
        self.fc3 = nn.Linear(self.h2_size, self.a_size)

    def set_weights(self, weights):
        """Overwrite every layer's parameters from a flat vector.

        The vector is consumed in order: fc1 weight, fc1 bias, fc2 weight,
        fc2 bias, fc3 weight, fc3 bias.  Each chunk is reinterpreted in the
        shape of the parameter it fills.

        Params
        ======
            weights (array-like): 1-D vector with exactly ``get_weights_dim()`` entries

        Raises
        ======
            ValueError: if ``weights`` is not 1-D or has the wrong number of entries
        """
        weights = np.ascontiguousarray(weights)
        expected = self.get_weights_dim()
        # Fail up front with a clear message instead of part-way through the
        # copy, which would leave the network half-updated.
        if weights.ndim != 1 or weights.size != expected:
            raise ValueError(
                "expected a flat weight vector of length {}, got shape {}".format(expected, weights.shape)
            )

        offset = 0
        with torch.no_grad():
            for layer in (self.fc1, self.fc2, self.fc3):
                for param in (layer.weight, layer.bias):
                    count = param.numel()
                    chunk = torch.from_numpy(weights[offset:offset + count])
                    # copy_ casts the chunk to the parameter's dtype/device
                    param.copy_(chunk.view_as(param))
                    offset += count

    def get_weights_dim(self):
        """Return the total number of scalar parameters (weights plus biases) of the three layers."""
        return (self.s_size+1)*self.h1_size + (self.h1_size+1)*self.h2_size + (self.h2_size+1)*self.a_size

    def forward(self, x):
        """Return the tanh-squashed output of the network, one value per action."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # torch.tanh replaces the deprecated F.tanh
        return torch.tanh(self.fc3(x))

    def evaluate(self, weights,eps, gamma=1.0, max_t=5000):
        """Load ``weights`` and run one episode, returning its discounted return.

        Actions are chosen epsilon-greedily: when a uniform random draw exceeds
        ``eps`` the action with the largest network output is taken, otherwise
        an action is drawn uniformly at random.

        Params
        ======
            weights (array-like): flat parameter vector passed to ``set_weights``
            eps (float): threshold for taking a random action instead of the greedy one
            gamma (float): discount rate applied as ``gamma ** t`` to the reward at step ``t``
            max_t (int): maximum number of timesteps before the episode is cut off

        Returns
        =======
            float: sum of discounted rewards collected during the episode
        """
        self.set_weights(weights)
        episode_return = 0.0
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        state = env_info.vector_observations[0]            # get the current state
        for t in range(max_t):
            # Epsilon-greedy action selection
            if random.random() > eps:
                state_tensor = torch.from_numpy(state).float().unsqueeze(0).to(device)
                # Inference only: no autograd graph is needed
                with torch.no_grad():
                    action_values = self(state_tensor)
                # Built-in int: NumPy integer widths differ across platforms
                action = int(np.argmax(action_values.cpu().numpy()))
            else:
                action = random.randrange(self.a_size)
            env_info = env.step(action)[brain_name]        # send the action to the environment
            next_state = env_info.vector_observations[0]   # get the next state
            reward = env_info.rewards[0]                   # get the reward
            done = env_info.local_done[0]                  # see if episode has finished

            state = next_state
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return
    
# Build the policy network, then move its parameters to the selected device
agent = RLAgent(brain, env_info)
agent = agent.to(device)
    


In [None]:
def cem(n_iterations=500, max_t=1000, gamma=1.0, print_every=10, pop_size=50, elite_frac=0.2, sigma=0.5,
        eps_start=1.0,eps_decay=0.99,eps_min=0.01):
    """PyTorch implementation of a cross-entropy method.

    Each iteration samples ``pop_size`` weight vectors around the current best
    vector, evaluates each of them with the module-level ``agent``, and replaces
    the best vector with the mean of the top-scoring ("elite") candidates.  The
    new best vector is then evaluated once more and that score is recorded.
    The agent's ``state_dict`` is saved to 'checkpoint.pth' after every iteration.

    Params
    ======
        n_iterations (int): maximum number of training iterations
        max_t (int): maximum number of timesteps per episode for the population evaluations
        gamma (float): discount rate for the population evaluations
        print_every (int): how often (in iterations) to print the average of the last 100 scores
        pop_size (int): size of population at each iteration (must be >= 1)
        elite_frac (float): fraction of top performers to use in update
        sigma (float): standard deviation of additive noise
        eps_start (float): initial epsilon passed to ``agent.evaluate``
        eps_decay (float): multiplicative factor applied to epsilon after each iteration
        eps_min (float): lower bound for epsilon

    Returns
    =======
        list: score of the updated best weight vector, one entry per iteration

    Raises
    ======
        ValueError: if ``pop_size`` is smaller than 1
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1, got {}".format(pop_size))

    # int() truncation can give 0 (e.g. pop_size=3, elite_frac=0.2); the slice
    # [-0:] would then select the WHOLE population, so keep at least one elite.
    n_elite = min(pop_size, max(1, int(pop_size*elite_frac)))
    n_weights = agent.get_weights_dim()

    scores_deque = deque(maxlen=100)
    scores = []
    best_weight = sigma*np.random.randn(n_weights)
    eps = eps_start
    for i_iteration in range(1, n_iterations+1):
        # Sample and score the candidate population around the current best vector
        weights_pop = [best_weight + (sigma*np.random.randn(n_weights)) for _ in range(pop_size)]
        rewards = np.array([agent.evaluate(weights,eps, gamma, max_t) for weights in weights_pop])

        # The new best vector is the mean of the highest-scoring candidates
        elite_idxs = rewards.argsort()[-n_elite:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        # Score the updated vector; this call uses evaluate's own default max_t
        reward = agent.evaluate(best_weight,eps, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)
        eps = max(eps_min, eps_decay*eps) # decrease epsilon

        torch.save(agent.state_dict(), 'checkpoint.pth')

        average_score = np.mean(scores_deque)
        if i_iteration % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_iteration, average_score))

        if average_score>=13.0:
            # Floor at 0 so an early solve does not report a negative iteration count
            solved_at = max(i_iteration-scores_deque.maxlen, 0)
            print('\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}'.format(solved_at, average_score))
            break
    return scores

# Run the cross-entropy method with its default hyper-parameters
scores = cem()

# Plot the recorded score of every iteration on a single set of axes
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(np.arange(1, len(scores)+1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 10	Average Score: 0.60
Episode 20	Average Score: 0.50
Episode 30	Average Score: 0.57
Episode 40	Average Score: 0.53
Episode 50	Average Score: 0.46
Episode 60	Average Score: 0.45
Episode 70	Average Score: 0.40
Episode 80	Average Score: 0.54
Episode 90	Average Score: 0.52
Episode 100	Average Score: 0.55
Episode 110	Average Score: 0.62
Episode 120	Average Score: 0.66
Episode 130	Average Score: 0.87
Episode 140	Average Score: 1.39
Episode 150	Average Score: 1.90
Episode 160	Average Score: 2.40
Episode 170	Average Score: 3.01
Episode 180	Average Score: 3.57
Episode 190	Average Score: 4.21
Episode 200	Average Score: 5.00
Episode 210	Average Score: 5.90
Episode 220	Average Score: 6.65
Episode 230	Average Score: 7.42
Episode 240	Average Score: 7.72
Episode 250	Average Score: 8.03
Episode 260	Average Score: 8.50
Episode 270	Average Score: 8.83
Episode 280	Average Score: 9.23
Episode 290	Average Score: 9.63
Episode 300	Average Score: 9.72
Episode 310	Average Score: 9.64
Episode 320	Avera

In [29]:
# Close the Unity environment once the notebook is done with it
env.close()