### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from itertools import permutations,product

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [135]:
class DQNAgent:
    
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon_max = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        
        self.batch_size = 32        
        # create replay memory using deque
        self.memory = deque(maxlen=1000)

        # create main model and target model
        self.model = self.build_model()
        
         #Variables for On-Hot Encoding
        self.eye_loc=np.eye(5)
        self.eye_hour=np.eye(24)
        self.eye_day=np.eye(7)
        
        
    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        # Write your code here: Add layers to your neural nets       
        
        model.add(Dense(32, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state,pos_actions):
    # Write your code here:
    # get action from model using epsilon-greedy policy
       
        if np.random.rand() <= self.epsilon_max:
            return random.choice(pos_actions)

        else:
            q_value = self.model.predict(state)
            return pos_actions[np.argmax(q_value[0])]
        
    def state_encod_arch(self, state):
        
        
        state_encod = list(self.eye_loc[state[0]]) + list(self.eye_hour[state[1]]) + list(self.eye_day[state[2]])

        return state_encod     
        


    def append_sample(self, state, action, reward, next_state):
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state))
    
        # Decay in ε after we generate each sample from the environment       
        if self.epsilon_max > self.epsilon_min:
            self.epsilon_max *= self.epsilon_decay
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        
        all_actions = list(permutations(range(0,20), 2))
        
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            #batch_size = min(self.batch_size, len(self.memory))
            mini_batch = random.sample(self.memory, self.batch_size)
            
            #print(mini_batch)
            
            update_output = np.zeros((self.batch_size, self.state_size))
            update_input = np.zeros((self.batch_size, self.state_size))
            
            actions, rewards = [], []
            
            for i in range(self.batch_size):
                state, action, reward, next_state = mini_batch[i]
                
                update_input[i] = self.state_encod_arch(state)
                update_output[i] = self.state_encod_arch(next_state)
                actions.append(action)
                rewards.append(reward)
                
                # Write your code from here
                # 1. Predict the target from earlier model
            target = self.model.predict(update_input)
            target_val = self.model.predict(update_output)
            print(target.shape)
            print(target)
            
            
                # 2. Get the target for the Q-network
            for i in range(self.batch_size):
                # Q Learning: get maximum Q value at s' from target model
                if False:
                    
                    target[i][all_actions.index(actions[i])] = rewards[i]
                
                else:
                                        
                    target[i][all_actions.index(actions[i])] = rewards[i] + self.discount_factor * (np.amax(target_val[i]))    
                
                #3. Update your 'update_output' and 'update_input' batch
            
                
                
        # 4. Fit your model and track the loss values
            self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)
            
    def save(self, name):
        self.model.save(name)

In [60]:
Episodes = 1
steps_per_episode = 30

### DQN block

In [138]:
for episode in range(Episodes):

    terminal_state = False
    
    # Write code here
    # Call the environment
    env = CabDriver()
    
    # Call all the initialised variables of the environment
    state_size = 36
    action_size = len(env.action_space)
    state = env.reset()
    state = env.state_init
   
    #Call the DQN agent
    print(state)
    agent = DQNAgent(state_size, action_size)
    
    for i in range(0,self.batch_size):
              
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        pos_actions = env.requests(state)[1]
        
        action = agent.get_action(state,pos_actions)
        #print(pos_actions)
        # 2. Evaluate your reward and next state
        next_state = env.next_state_func(state, action, Time_matrix)
        reward = env.reward_func(state, action, Time_matrix)
        
        
        
        
        # 3. Append the experience to the memory
        agent.append_sample(state, action, reward, next_state)
        state = next_state
        # 4. Train the model by calling function agent.train_model
        
        agent.train_model()
        
        
        # 5. Keep a track of rewards, Q-values, loss
        

(4, 0, 0)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_214 (Dense)            (None, 32)                1184      
_________________________________________________________________
dense_215 (Dense)            (None, 32)                1056      
_________________________________________________________________
dense_216 (Dense)            (None, 20)                660       
Total params: 2,900
Trainable params: 2,900
Non-trainable params: 0
_________________________________________________________________


NameError: name 'self' is not defined

In [109]:
env = CabDriver()

In [110]:
env.next_state_func((0,23,6),(0,0),Time_matrix)

(0, 0, 0)

In [113]:
len(env.action_space)

20

In [111]:
from itertools import permutations,product

In [130]:
bb = list(permutations(range(0,5), 2))
bb

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (1, 0),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 4),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3)]

In [132]:
bb.index((0,4))

3

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()