# Introduction

Here I'm testing the A2C on the taxi grid graph


In [1]:
import networkx as nx
import csv
import copy
import funcs as f
import numpy as np
import matplotlib.pyplot as plt
import gym
from agent_taxi import Agent
from agent_taxi import PolicyCab
from taxi_environment import Env
from keras.utils import to_categorical
%matplotlib inline


def run(lr,gamma, n, penalty):
    
    
    #Environment
    G = f.make_G(n)
    state_zero = np.array([1 if i == 0 else 0 for i in range(G.number_of_nodes())])
    env = Env(G,state_zero)
    num_states = env.num_states
    num_actions = env.num_actions
    penalty = env.illegal_move_penalty

    #Agent
    agent = Agent(num_states, num_actions, lr, gamma)
    agent.memory_size = 1000

    scores = []
    EPISODES = 4000

    for e in range(1,EPISODES+1):
        state = state_zero
        state = np.reshape(state, [1, num_states])
        reward_sum = 0
        while agent.active_time < 100:

            # env.render()
            action = agent.act(state)
            next_state, reward = env.step(action,agent)
            reward_sum += reward
            next_state = np.reshape(next_state, [1, num_states])

            #We only want to remember action taken when looking
            if agent.state == 'serving':
                pass
            else:
                agent.remember(state[0], action, 1.0*reward)
            state = next_state
            

        #Learn & print results
        agent.train_models()
        tau = agent.find_tau()
        scores.append(tau)
        agent.reset_clocks()
        
    return scores

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Hyper parameter optimization

In [2]:
gammas = [0.01, 0.1, 0.9]
lrs = [0.0001,0.001,0.01,0.1]

scores_all = []

for lr in lrs:
    for gamma in gammas:
        scores = run(lr,gamma,n,-1000) # illegal move penalty
        scores_all.append(scores)
        print '(lr,gamma,mean,range) = ' + str((lr,gamma,np.mean(scores), max(scores) - min(scores)))

TypeError: run() takes exactly 4 arguments (2 given)

Smaller gamma are the trick.

### Do on 10x10 graph

In [4]:
gammas = [0.01, 0.1, 0.99]
lrs = [0.0001,0.001,0.01,0.1]

scores_all = []

for lr in lrs:
    for gamma in gammas:
        scores = run(lr,gamma,10,-1000)  #size of graph, penalty for illegal move
        scores = scores[-1000:]
        scores_all.append(scores)
        print '(lr,gamma,mean,range) = ' + str((lr,gamma,np.mean(scores), max(scores) - min(scores)))

(lr,gamma,mean,range) = (0.0001, 0.01, 0.03428, 0.11)
(lr,gamma,mean,range) = (0.0001, 0.1, 0.03433, 0.12)
(lr,gamma,mean,range) = (0.0001, 0.99, 0.034350000000000006, 0.12)
(lr,gamma,mean,range) = (0.001, 0.01, 0.03427, 0.12)
(lr,gamma,mean,range) = (0.001, 0.1, 0.03446, 0.13)
(lr,gamma,mean,range) = (0.001, 0.99, 0.07028, 0.2)
(lr,gamma,mean,range) = (0.01, 0.01, 0.03434, 0.12)
(lr,gamma,mean,range) = (0.01, 0.1, 0.06412000000000001, 0.19)
(lr,gamma,mean,range) = (0.01, 0.99, 0.034370000000000005, 0.13)
(lr,gamma,mean,range) = (0.1, 0.01, 0.03427000000000001, 0.11)
(lr,gamma,mean,range) = (0.1, 0.1, 0.03435, 0.12)
(lr,gamma,mean,range) = (0.1, 0.99, 0.034330000000000006, 0.12)


I wasn't expecting that, doesn't seem to be much different.

In [None]:
gammas = [0.01, 0.1, 0.99]
penalties = [-1000,-100,-10,0]
lr = 0.01
n = 10  #grid edge size

for penalty in penalties:
    for gamma in gammas:
        scores = run(lr,gamma,n,penalty)  #size of graph, penalty for illegal move
        scores = scores[-1000:]
        scores_all.append(scores)
        print '(penalty,gamma,mean,range) = ' + str((penalty,gamma,np.mean(scores), max(scores) - min(scores)))

(penalty,gamma,mean,range) = (-1000, 0.01, 0.03427000000000001, 0.12)
(penalty,gamma,mean,range) = (-1000, 0.1, 0.06412000000000001, 0.2)
(penalty,gamma,mean,range) = (-1000, 0.99, 0.03445, 0.12)
(penalty,gamma,mean,range) = (-100, 0.01, 0.03438000000000001, 0.13)
(penalty,gamma,mean,range) = (-100, 0.1, 0.03436, 0.11)
(penalty,gamma,mean,range) = (-100, 0.99, 0.034330000000000006, 0.12)
(penalty,gamma,mean,range) = (-10, 0.01, 0.03427000000000001, 0.12)
(penalty,gamma,mean,range) = (-10, 0.1, 0.034350000000000006, 0.12)
