# Introduction

Here I'm testing the A2C on the "real-world" nyc graph. That is, I place the empirical trips on the manhattan street network, at the empirical times

In [7]:
import funcs as f
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from agent_with_baseline import Agent
from agent_taxi import PolicyCab
import real_world_nyc_environment as t
from keras.utils import to_categorical

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / float(N)

%matplotlib inline



#Load in data
G = t.get_subgraph()  #start with a subgraph
#trip_data = t.get_tripdata(G,18,18) #monday, the 18th of jan
trip_data = np.loadtxt('data/trip_data_nyc_day_18.txt')
#env = Env(G,state_zero)

In [9]:
def run(gamma,lr,illegal_move_penalty):
    
    #Environment
    np.random.seed(0)
    state_zero_scalar = np.random.choice(G.nodes())
    env = t.Env(G,trip_data,state_zero_scalar)
    state_zero = env.convert(state_zero_scalar)  #convert to 1-hot vector
    
    delta = 30  #trips wait 30 deci-seconds = 5 minutes before disappearing
    time_per_episode = 360   #one day
    
    env.delta = delta
    num_states = env.num_states
    num_actions = env.num_actions
    env.illegal_move_penalty = illegal_move_penalty


    #Agent
    agent = Agent(num_states, num_actions, lr, gamma)
    agent.memory_size = 1000

    #Train
    EPISODES = 2000
    scores = []
    for e in range(1,EPISODES+1):
        state = state_zero 
        state = np.reshape(state, [1, num_states])  #convert to tensor for keras
        reward_sum = 0
        while env.active_time < time_per_episode:

            # env.render()
            action = agent.act(state)
            next_state_scalar, reward = env.step(action)
            next_state = env.convert(next_state_scalar)    #convert to 1-hot vec
            reward_sum += reward
            next_state = np.reshape(next_state, [1, num_states])  #convert to tensor for keras
            agent.remember(state[0], action, 1.0*reward)
            state = next_state

        #Learn & print results
        agent.train_models()
        tau = env.find_tau()
        scores.append(tau)
        env.reset(state_zero_scalar,trip_data)
        
    return scores


#Going to use this to parallelize
def func(par):
    lr,gamma,illegal_move = par
    scores = run(lr,gamma,illegal_move)

    transient_index = int(0.75*len(scores))
    scores = scores[transient_index:]  #discard the training
    
    return (lr,gamma,np.mean(scores), max(scores) - min(scores))

### RL agent

In [5]:
gammas = [0.01, 0.1, 0.9]
lrs = [0.0001,0.001,0.01,0.1]
illegal_move = -100

scores_all = []

for lr in lrs:
    for gamma in gammas:
        scores = run(lr,gamma,illegal_move) # illegal move penalty
        scores = scores[-1000:]  #discard the training
        scores_all.append(scores)
        print '(lr,gamma,mean,range) = ' + str((lr,gamma,np.mean(scores), max(scores) - min(scores)))

(lr,gamma,mean,range) = (0.0001, 0.01, 0.9600166666666664, 0.16388888888888886)


KeyboardInterrupt: 

So bigger gamma helps. Next I'll try fiddling with the penalty

In [11]:
penalties = [-1000,-100,-10,0]

lr,gamma = 0.01, 0.9

for penalty in penalties:
    scores = run(lr,gamma,illegal_move) # illegal move penalty
    scores = scores[-1000:]  #discard the training
    print '(penalty,gamma,mean,range) = ' + str((penalty,gamma,np.mean(scores), max(scores) - min(scores)))

(penalty,gamma,mean,range) = (-1000, 0.9, 0.5773689467504177, 0.7917808219178082)
(penalty,gamma,mean,range) = (-100, 0.9, 0.6413017156931538, 0.8225)
(penalty,gamma,mean,range) = (-10, 0.9, 0.5615031844831833, 0.8528610354223434)
(penalty,gamma,mean,range) = (0, 0.9, 0.6653823511800644, 0.8702702702702703)
