### Cab-Driver Agent

In [None]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import os
# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt
import pylab

# Import the environment
#from Env import CabDriver


In [None]:
# Import routines
# This cell contains the CabDriver class

import numpy as np
import math
import random
from itertools import permutations,product

# Defining hyperparameters
m = 5 # number of cities, indexed 0 ... m-1
t = 24 # number of hours, ranges from 0 .... t-1
d = 7  # number of days, ranges from 0 ... d-1
C = 5 # Per hour fuel and other costs
R = 9 # per hour revenue from a passenger


class CabDriver():
    """Cab-driver environment.

    A state is a tuple ``(location, hour, day)`` with ``location`` in
    ``0..m-1``, ``hour`` in ``0..t-1`` and ``day`` in ``0..d-1``.
    An action is a pair ``(pickup, drop)`` of distinct locations, plus the
    no-op action ``(0, 0)`` which is always the last entry of the action space.
    """

    # Mean of the Poisson-distributed number of ride requests per location.
    REQUEST_RATES = {0: 2, 1: 12, 2: 4, 3: 7, 4: 8}
    # Upper bound on the number of requests offered in one step.
    MAX_REQUESTS = 15

    def __init__(self):
        """Build the action space, the state space and pick a random start state."""
        action_list = list(permutations(range(m), 2))
        action_list.append((0, 0))  # no-op ("stay idle") is always the last action
        self.action_space = np.array(action_list)
        self.state_space = list(product(range(m), range(t), range(d)))
        self.state_size = len(self.state_space)
        self.action_size = len(self.action_space)
        # Column of weights turning (location, hour, day) into a flat index
        # into the state space: location * t * d + hour * d + day.
        self.encode_vector = np.array([t * d, d, 1]).reshape(3, 1)
        # Maps an action tuple to its row index in ``action_space``.
        self.action_map = {v: k for k, v in enumerate(action_list)}
        # Start the first round (sets ``state_init``).
        self.reset()


    ## Encoding state (or state-action) for NN input

    def state_encod_arch1(self, curr_state, batch_size=1):
        """One-hot encode state(s) for the network input.

        ``curr_state`` is one ``(location, hour, day)`` triple, or
        ``batch_size`` of them. Returns an array of shape
        ``(batch_size, state_size)`` where ``state_size == m * t * d``; each
        row has a single 1 at the flat index of the corresponding state.
        """
        states = np.array(curr_state).reshape(batch_size, 3).astype(int)
        positions = np.dot(states, self.encode_vector).ravel()
        state_encod = np.zeros((batch_size, self.state_size))
        state_encod[np.arange(batch_size), positions] = 1
        return state_encod


    def requests(self, state):
        """Sample the ride requests available to the driver in ``state``.

        The number of requests is Poisson distributed with a mean that depends
        on the driver's location, clipped to the range 1..MAX_REQUESTS (a draw
        of zero is bumped to one request). The requests are distinct ride
        actions drawn uniformly from the whole action space; the no-op action
        is always appended as the last option.

        Returns ``(possible_actions_index, actions)``: the list of indices into
        ``action_space`` and the matching list of ``(pickup, drop)`` rows.
        """
        location = state[0]
        rate = self.REQUEST_RATES.get(location)
        requests = np.random.poisson(rate) if rate is not None else 0
        requests = min(max(int(requests), 1), self.MAX_REQUESTS)

        noop_index = self.action_size - 1
        # Cannot offer more distinct rides than exist (matters only for tiny m).
        requests = min(requests, noop_index)

        # (0,0) is not considered a customer request, so sample below its index.
        possible_actions_index = random.sample(range(noop_index), requests)
        possible_actions_index.append(noop_index)
        actions = [self.action_space[i] for i in possible_actions_index]

        return possible_actions_index, actions


    def reward_func(self, state, action, Time_matrix):
        """Return the reward for taking ``action`` in ``state``.

        The no-op action costs ``-C``. A ride earns
        ``R * t_pq - C * (t_pq + t_ip)`` where ``t_ip`` is the travel time from
        the current location to the pickup point and ``t_pq`` is the ride
        time, looked up at the hour/day the driver reaches the pickup point.
        """
        pickup, drop = action[0], action[1]
        if pickup == drop:
            return -C

        current, time_curr, day_curr = state[0], state[1], state[2]
        if current == pickup:
            # Already at the pickup point: the ride starts immediately.
            t_ip = 0
            time_pickup, day_pickup = time_curr, day_curr
        else:
            t_ip = Time_matrix[current][pickup][time_curr][day_curr]
            time_pickup, day_pickup = self.trip_time_adjust(time_curr,
                                                            day_curr,
                                                            start=current,
                                                            end=pickup,
                                                            time_matrix=Time_matrix)
        t_pq = Time_matrix[pickup][drop][time_pickup][day_pickup]

        return (R * t_pq) - (C * (t_pq + t_ip))


    def next_state_func(self, state, action, Time_matrix):
        """Return the state reached by taking ``action`` in ``state``.

        For a ride the driver ends at the drop location after travelling to the
        pickup point (if not already there) and then to the drop point. For
        the no-op action the driver stays put and the clock advances by one
        hour, matching the one hour of cost charged by ``reward_func``.
        """
        pickup, drop = action[0], action[1]
        current, time_curr, day_curr = state[0], state[1], state[2]

        if pickup == drop:
            time_next, day_next = self._advance_clock(time_curr, day_curr, 1)
            return (current, time_next, day_next)

        if current != pickup:
            # First leg: drive (without a passenger) to the pickup point.
            time_curr, day_curr = self.trip_time_adjust(time_curr,
                                                        day_curr,
                                                        start=current,
                                                        end=pickup,
                                                        time_matrix=Time_matrix)
        time_next, day_next = self.trip_time_adjust(time_curr,
                                                    day_curr,
                                                    start=pickup,
                                                    end=drop,
                                                    time_matrix=Time_matrix)
        return (drop, time_next, day_next)

    #reset env
    def reset(self):
        """Pick a new random start state; return (action_space, state_space, state_init)."""
        self.state_init = random.choice(self.state_space)
        return self.action_space, self.state_space, self.state_init

    @staticmethod
    def _advance_clock(time_curr, day_curr, elapsed):
        """Add ``elapsed`` hours to (hour, day), wrapping at t hours and d days."""
        total = time_curr + elapsed
        return int(total % t), int((day_curr + total // t) % d)

    #handle the time/day increase if the drop time goes to the next day
    def trip_time_adjust(self, time_curr, day_curr, start, end, time_matrix=None):
        """Return the (hour, day) at which a trip from ``start`` to ``end`` finishes.

        The trip duration is ``time_matrix[start][end][time_curr][day_curr]``.
        When ``time_matrix`` is omitted the module-level ``Time_matrix`` is
        used, which keeps the original four-argument call working.
        """
        if time_matrix is None:
            time_matrix = Time_matrix  # module-level matrix loaded from disk
        travel_time = time_matrix[start][end][time_curr][day_curr]
        return self._advance_clock(time_curr, day_curr, travel_time)


#### Defining Time Matrix

In [None]:
# Loading the time matrix provided
# Expects "TM.npy" in the current working directory. Elsewhere in this file the
# array is indexed as Time_matrix[start][end][hour][day] and the value read is
# added to the hour of day, i.e. it is treated as a travel time in hours.
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [None]:
#Defining a function to save the Q-dictionary as a pickle file
def save_pickle(obj, name ):
    """Pickle ``obj`` to the file ``name + '.pkl'`` using the highest protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickler = pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)

In [None]:
#define epsilon_decay strategy
def epsilon_decay(total_steps, step):
    """Return the exploration rate exp(-pi * step / total_steps) for ``step``.

    The value is 1.0 at step 0 and falls to exp(-pi) when step == total_steps.
    """
    decay_rate = -np.pi / total_steps
    return abs(np.exp(decay_rate * step))

### Agent Class

If you are using this framework, fill in the following items to complete the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma * max_a' Q(s', a')
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
#this cell contains the agent class

class DQNAgent:
    """Deep Q-learning agent with an experience-replay memory.

    The Q-network maps a one-hot encoded state (length ``state_size``) to one
    Q-value per action (length ``action_size``).
    """

    def __init__(self, state_size, action_size, action_map, discount_factor=0.95, learning_rate=0.01,
                       epsilon=0.99, epsilon_decay=0.99, epsilon_min=0.01, state_encoder=None):
        """Store the hyperparameters and build the Q-network.

        action_map: dict mapping an action tuple to its output index.
        epsilon: initial exploration probability (kept in ``epsilon_max``,
            which callers overwrite to decay exploration).
        epsilon_decay, epsilon_min: stored on the agent but not used by any
            method of this class.
        state_encoder: optional callable turning a raw state into the network
            input; when omitted, ``train_model`` falls back to the
            module-level ``env.state_encod_arch1``.
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.epsilon_max = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model_history = None
        self.action_map = action_map
        self.state_encoder = state_encoder
        self.batch_size = 32
        # create replay memory using deque (oldest samples are dropped first)
        self.memory = deque(maxlen=4096)

        # create the Q-network
        self.model = self.build_model()


    # approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the Q-network: two hidden ReLU layers, linear output."""
        model = Sequential()

        # hidden layers
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: one Q-value per action. It must be linear because
        # Q-values can be negative (e.g. idling has a negative reward) and a
        # ReLU output could never represent them.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))

        # Newer Keras releases name the argument ``learning_rate``; older ones
        # only accept ``lr``. Try the current name first.
        try:
            optimizer = Adam(learning_rate=self.learning_rate)
        except (TypeError, ValueError):
            optimizer = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=optimizer)
        model.summary()
        return model


    def get_action(self, cstate, all_actions, pos_act_ind):
        """Pick an action with an epsilon-greedy policy.

        cstate: encoded current state, shaped ``(1, state_size)``.
        all_actions: array of every action, one ``(pickup, drop)`` row each.
        pos_act_ind: indices (into ``all_actions``) of the actions currently
            available.

        With probability ``epsilon_max`` a random available action is returned;
        otherwise the available action with the highest predicted Q-value.
        """
        actions = all_actions[pos_act_ind]
        if np.random.rand() <= self.epsilon_max:
            # explore: choose a random action from all possible actions
            mode = 'Exploring'
            # Index explicitly instead of random.choice so that array inputs
            # are never truth-tested.
            action = actions[random.randrange(len(actions))]
        else:
            # exploit: choose the available action with the highest q(s, a)
            mode = 'Exploiting'
            q_value = self.model.predict(x=cstate)
            # keep only the Q-values of the actions that are actually available
            Q_val_for_actions = q_value[0][pos_act_ind]
            max_index = np.argmax(Q_val_for_actions)
            print(q_value.shape, Q_val_for_actions, 'Index ', max_index, np.max(Q_val_for_actions))
            action = actions[max_index]
        print(mode, 'Available actions ', actions, ' Selected action ', action)
        return action


    def append_sample(self, state, action, reward, next_state, done):
        """Save the experience <s, a, r, s', done> to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))


    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Run one training step on a random mini-batch from the replay memory.

        Returns the object returned by ``model.fit``, or ``None`` when the
        memory does not yet hold more than ``batch_size`` samples.
        """
        if len(self.memory) <= self.batch_size:
            return None

        encode = self.state_encoder
        if encode is None:
            encode = env.state_encod_arch1  # module-level environment

        # Sample batch from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        update_input = np.zeros((self.batch_size, self.state_size))
        next_input = np.zeros((self.batch_size, self.state_size))
        actions, rewards, done = [], [], []

        for i, (state, action, reward, next_state, done_boolean) in enumerate(mini_batch):
            update_input[i] = np.ravel(encode(state))
            next_input[i] = np.ravel(encode(next_state))
            actions.append(action)
            rewards.append(reward)
            done.append(done_boolean)

        # Current predictions are the starting point for the targets, so only
        # the Q-value of the action actually taken contributes to the loss.
        target = self.model.predict(update_input)
        target_qval = self.model.predict(next_input)

        for i in range(self.batch_size):
            index = self.action_map[tuple(actions[i])]
            if done[i]:
                target[i][index] = rewards[i]
            else:  # non-terminal state: bootstrap from the best next Q-value
                target[i][index] = rewards[i] + self.discount_factor * np.max(target_qval[i])

        # Fit the model and hand the loss history back to the caller
        return self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)


    def save(self, name):
        """Save the whole model to the path ``name``."""
        self.model.save(name)

In [None]:
# to store rewards in each episode
rewards_per_episode, episodes  = [], []

# make dir to store model weights (no error if it already exists)
os.makedirs("saved_model_weights", exist_ok=True)

# number of training episodes
n_episodes = 100

### DQN block

In [None]:
 # Call all the initialised variables of the environment
env = CabDriver()
#Call the DQN agent
dqn = DQNAgent(env.state_size, env.action_size, env.action_map)

# An episode ends once the accumulated trip/idle time reaches 30 days (in hours).
episode_hours = 24*30
weights_path = "./saved_model_weights/driver_dqn.h5"

for episode in range(n_episodes):

    # Start every episode from a fresh random state
    _,_,curr_state = env.reset()
    state_size = env.state_size

    reward = 0      # total reward collected in this episode
    curr_time = 0   # hours elapsed in this episode

    terminal_state = False
    print("Episode :", episode)

    while not terminal_state:
        # 1. Pick epsilon-greedy action from possible actions for the current state
        encoded_state = env.state_encod_arch1(curr_state)
        pos_act_ind, actions = env.requests(curr_state)
        action = dqn.get_action(encoded_state, env.action_space, pos_act_ind)

        # 2. Evaluate your reward and next state
        step_reward = env.reward_func(curr_state, action, Time_matrix)
        reward = reward + step_reward
        next_state = env.next_state_func(curr_state,action,Time_matrix)

        pickup_loc = action[0]
        drop_loc = action[1]
        current_loc = curr_state[0]
        hour = curr_state[1]
        day = curr_state[2]
        if pickup_loc != drop_loc:
            if current_loc == pickup_loc:
                pickup_hour, pickup_day = hour, day
            else:
                # time spent driving to the pickup point
                curr_time = curr_time + Time_matrix[current_loc][pickup_loc][hour][day]
                pickup_hour, pickup_day = env.trip_time_adjust(hour, day, current_loc, pickup_loc)
            # The ride duration depends on when the ride starts (arrival at
            # the pickup point), not on when it ends.
            curr_time = curr_time + Time_matrix[pickup_loc][drop_loc][pickup_hour][pickup_day]
        else:
            # idling costs one hour
            curr_time += 1.0

        # Decide termination before storing the sample so the replay memory
        # records the correct done flag for the final transition.
        terminal_state = curr_time >= episode_hours

        # 3. Append the experience to the memory. Store the reward of THIS
        #    step: the Q-learning target is r + gamma * max Q(s', a'), so the
        #    running episode total must not be used here.
        dqn.append_sample(curr_state, action, step_reward, next_state, terminal_state)
        curr_state = next_state

        # 4. Train the model by calling function agent.train_model
        history = dqn.train_model()
        # 5. Keep a track of rewards, Q-values, loss
        print("episode:", episode, "  score:", reward, "  memory length:",
                      len(dqn.memory), "  epsilon:", dqn.epsilon_max)

    # store total reward obtained in this episode
    rewards_per_episode.append(reward)
    episodes.append(episode)

    dqn.epsilon_max =  epsilon_decay(n_episodes, episode)
    #save model for every 50 episodes
    if episode % 50 == 0:
        dqn.model.save_weights(weights_path)

# Always persist the final weights; the periodic save above only fires on
# episodes that are multiples of 50, so the last episodes would be lost.
dqn.model.save_weights(weights_path)
        
        

In [None]:
# make directory for the pickled results (no error if it already exists)
os.makedirs("saved_pickle_files", exist_ok=True)

# save rewards_per_episode
save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")


# plot results: reload from disk so the plot reflects exactly what was saved
with open('saved_pickle_files/rewards_per_episode.pkl', 'rb') as f:
    rewards_per_episode = pickle.load(f)

plt.plot(list(range(len(rewards_per_episode))), rewards_per_episode)
plt.xlabel("episode number")
plt.ylabel("reward per episode")

# save the plot as rewards.png in the current working directory
plt.savefig('rewards.png')

print("Average reward of last 100 episodes is {0}".format(np.mean(rewards_per_episode[-100:])))

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# Sample schedule: epsilon = exp(-pi * step / total_steps) for each step
total_steps = 50
time = np.arange(total_steps)   # x-axis values for the plot
epsilon = []
x = 0.99
for step in range(total_steps):
    x = abs(np.exp(step * (-np.pi/total_steps)))
    epsilon += [x]

In [None]:
# Plot the sample epsilon schedule: decay step on the x-axis, epsilon on the y-axis
plt.plot(time, epsilon)
plt.show()