# Introduction

Here I'm testing the A2C agent on the "real-world" NYC graph. That is, I place the empirical trips on the Manhattan street network, at their empirical times.

In [13]:
import funcs as f
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from agent_with_baseline import Agent
from agent_taxi import PolicyCab
import real_world_nyc_environment as t
from keras.utils import to_categorical


def running_mean(x, N):
    """Return the moving average of `x` over a sliding window of `N` samples.

    A zero is prepended before taking the cumulative sum so that the sum of
    any window is the difference of two prefix sums. For 1 <= N <= len(x)
    the result has len(x) - N + 1 entries; entry i is the mean of x[i:i+N].
    """
    prefix_sums = np.insert(x, 0, 0).cumsum()
    window_totals = prefix_sums[N:] - prefix_sums[:-N]
    return window_totals / float(N)

%matplotlib inline


# Load the street graph and the trip records that every cab in this notebook is run on.
G = t.get_full_nyc_graph() 
# Alternative loader kept for reference: derives the trip data from G instead of reading the cached file.
#trip_data = t.get_tripdata(G,18,18) #monday, the 18th of jan
trip_data = np.loadtxt('data/trip_data_nyc_day_18.txt')
# Leftover from an earlier Env signature; the live constructor call is at the bottom of this cell.
#env = Env(G,state_zero)

# Environment parameters shared by all experiments below.
# delta is later copied onto env.delta; time units are not shown here -- TODO confirm.
delta = 5*6  # trips are removed after this time
# Episode loops below keep stepping while env.active_time is within this bound.
time_per_episode = 5*360  

# Random start node. No seed is set at this point, so the draw differs between runs;
# the later cells re-seed and rebuild both state_zero and env before using them.
state_zero = np.random.choice(G.nodes())
env = t.Env(G,trip_data,state_zero)

### Model cab

In [14]:
# Trip probabilities estimated from the trip data by the project helper
# (used below through .values(), so presumably a dict keyed by node -- TODO confirm).
trip_probs = t.find_trip_probs(G,trip_data)
# list(...) keeps this a 1-d vector on Python 3, where dict.values() is a view that
# np.array() would otherwise wrap as a 0-d object array. On Python 2 nothing changes.
# NOTE(review): p follows trip_probs' own iteration order -- confirm that this is the
# node order find_optimal_policy expects.
p = np.array(list(trip_probs.values()))  #trip probs
optimal_policy = t.find_optimal_policy(p,G)

In [15]:
# Baseline 1: a cab that follows the precomputed optimal policy for one episode.

#Instantiate
np.random.seed(0)  # fixed seed so the start node drawn below is reproducible
state_zero = np.random.choice(G.nodes())
env = t.Env(G,trip_data,state_zero)
env.delta = delta
model_cab = t.Modelcab(optimal_policy)


# Main
state = state_zero
Return = 0  # sum of rewards
while env.active_time <= time_per_episode:
    action = model_cab.act(state)
    next_state, reward = env.step_modelcab(action)  #different step functions for these cabs
    state = next_state
    Return += reward

# Relative idle time: idle time as a fraction of the active time accumulated by the env.
tau_optimal = 1.0*env.idle_time / env.active_time
# Parenthesised single-argument form prints the same text on Python 2 and Python 3.
print('rel idle time = ' + str(tau_optimal))

rel idle time = 0.07084019769357495


### Greedy cab

In [16]:
# Baseline 2: a cab that follows the greedy policy derived from the trip probabilities.
greedy_policy = t.find_greedy_policy(trip_probs,G)

#Instantiate
np.random.seed(0)  # same seed as the model-cab cell, so both baselines start at the same node
state_zero = np.random.choice(G.nodes())
env = t.Env(G,trip_data,state_zero)
env.delta = delta
greedy_cab = t.Modelcab(greedy_policy)


# Main
state = state_zero
while env.active_time <= time_per_episode:
    action = greedy_cab.act(state)
    next_state, reward = env.step_modelcab(action)  # reward is not accumulated for this baseline
    state = next_state

# Relative idle time: idle time as a fraction of the active time accumulated by the env.
# NOTE(review): the recorded output of this cell is 1.0, i.e. idle for the whole episode --
# verify the greedy policy / step function before using this as a baseline.
tau_greedy = 1.0*env.idle_time / env.active_time
# Parenthesised single-argument form prints the same text on Python 2 and Python 3.
print('rel idle time = ' + str(tau_greedy))

rel idle time = 1.0


### RL agent

In [None]:
# Train the actor-critic agent on the same environment as the baselines and compare
# its per-episode tau against the greedy and optimal cabs.

#Environment
# Same seed as the model-cab and greedy-cab cells: the agent then starts from the same
# node as the baselines it is plotted against, and the run is reproducible.
np.random.seed(0)
state_zero_scalar = np.random.choice(G.nodes())
env = t.Env(G,trip_data,state_zero_scalar)
state_zero = env.convert(state_zero_scalar)  #convert to 1-hot vector
env.delta = delta
num_states = env.num_states
num_actions = env.num_actions
env.illegal_move_penalty = -100


#Agent
lr = 0.01
# NOTE(review): if this is the discount factor, 0.01 is unusually small -- confirm intent.
gamma = 0.01
agent = Agent(num_states, num_actions, lr, gamma)
agent.memory_size = 1000

#Train
EPISODES = 5000
scores = []  # tau reported by the environment at the end of each episode
for e in range(1,EPISODES+1):
    # Every episode restarts from the same start state, shaped (1, num_states) for keras.
    state = np.reshape(state_zero, [1, num_states])
    reward_sum = 0
    # NOTE(review): the baseline cells loop with `<=` here; left as-is to keep training unchanged.
    while env.active_time < time_per_episode:
        action = agent.act(state)
        next_state_scalar, reward = env.step(action)
        next_state = env.convert(next_state_scalar)    #convert to 1-hot vec
        reward_sum += reward
        next_state = np.reshape(next_state, [1, num_states])  #convert to tensor for keras
        agent.remember(state[0], action, 1.0*reward)
        state = next_state

    #Learn & print results
    agent.train_models()
    tau = env.find_tau()
    scores.append(tau)
    env.reset(state_zero_scalar,trip_data)
    if e % 50 == 0:
        # Parenthesised single-argument form prints the same text on Python 2 and Python 3.
        print('(episode, tau, score) = ' + str((e,tau,reward_sum)))


# Plot results
window = 100  # number of episodes averaged by the running mean
smoothed = running_mean(scores, window)
plt.plot(scores,alpha=0.5)
# The first running-mean value covers episodes 0..window-1, so it is drawn at
# x = window-1 rather than x = 0 to stay aligned with the raw scores.
plt.plot(np.arange(window - 1, len(scores)), smoothed,'b--')
plt.plot([tau_greedy for i in scores],'g--')
plt.plot([tau_optimal for i in scores],'r--')
# One label per plotted line, in plotting order. With only three labels the running
# mean was labelled 'greedy' and the greedy baseline 'optimal'.
plt.legend(['A2C','A2C (running mean)','greedy','optimal'])
plt.xlabel('episode')
plt.ylabel('tau')
#np.savetxt('stats/scores_lunar_landing.txt',scores)

1. Note to self -- I should change the reward to $\tau$ directly
2. I'm not sure if handling the illegal moves this way is wise
3. Don't forget, this will probably overfit.
4. Then I need to check whether it generalizes.