### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import os
# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
#from Env import CabDriver


Using TensorFlow backend.


In [2]:
# Import routines

import numpy as np
import math
import random
from itertools import permutations,product

# Defining hyperparameters
m = 5 # number of cities, ranges from 1 ..... m
t = 24 # number of hours, ranges from 0 .... t-1
d = 7  # number of days, ranges from 0 ... d-1
C = 5 # Per hour fuel and other costs
R = 9 # per hour revenue from a passenger


class CabDriver():

    def __init__(self):
        """initialise your state and define your action space and state space"""
        action_list = list(permutations(range(0,m) ,2))
        action_list.append((0,0))
        self.action_space = np.array(action_list) #action space is unique 2 values(source & destination) + the no op
        self.state_space = list(product(*[list(range(0,m)), list(range(0,t)), list(range(0,d))])) #State space from MDP:
        #𝑠=𝑋𝑖𝑇𝑗𝐷𝑘 𝑤ℎ𝑒𝑟𝑒 𝑖=0…𝑚−1;𝑗=0….𝑡−1;𝑘=0…..𝑑−1, Where 𝑋𝑖 represents a driver’s current location, 𝑇𝑗 represents time component (more specifically hour of the day), 𝐷𝑘 represents the day of the week
        self.state_size = len(self.state_space)
        self.action_size = len(self.action_space)
        self.state_init = random.choice(self.state_space) #Initialises to any random self_space
        self.encode_vector = np.array([24*7, 7, 1]).reshape(3, 1)


        # Start the first round
        self.reset()


    ## Encoding state (or state-action) for NN input

    def state_encod_arch1(self, curr_state, batch_size=1):
        """convert the state into a vector so that it can be fed to the NN. This method converts a given state into a vector format. Hint: The vector is of size m + t + d."""
       
        #Encoded values of m + t + d
        
        curr_state = np.array(curr_state).reshape(1, 3)
        #print(curr_state.shape)
        #enc_mat = self.encode_vector
        # pos = (state[0]*24*7) + (state[1]*7) + state[2]
        
        pos_mat = np.dot(curr_state, self.encode_vector)
        state_encod =  np.zeros((1, self.state_size))
        # state_encod[pos] = 1
        for i in range(batch_size):
            state_encod[i][pos_mat[i]] = 1

        return np.reshape(state_encod, [1, env.state_size])
    


    # Use this function if you are using architecture-2 
    # def state_encod_arch2(self, state, action):
    #     """convert the (state-action) into a vector so that it can be fed to the NN. This method converts a given state-action pair into a vector format. Hint: The vector is of size m + t + d + m + m."""

        
    #     return state_encod


    ## Getting number of requests

    def requests(self, state):
        """Determining the number of requests basis the location. 
        Use the table specified in the MDP and complete for rest of the locations"""
        location = state[0]
        requests = 0
        if location == 0:
            requests = np.random.poisson(2)

        if location == 1:
            requests = np.random.poisson(12)   #MDP Poisson distribution
        
        if location == 2:
            requests = np.random.poisson(4)    #MDP Poisson distribution
            
        if location == 3:
            requests = np.random.poisson(7)    #MDP Poisson distribution

        if location == 4:
            requests = np.random.poisson(8)    #MDP Poisson distribution  
            
        if requests > 15:
            requests = 15

        possible_actions_index = random.sample(range(0, (m-1)*m), requests) # (0,0) is not considered as customer request
        possible_actions_index.append(20) #add the index of No-OP action (0, 0)
        actions = [self.action_space[i] for i in possible_actions_index]

        print('Number of actions available', len(actions))
        return possible_actions_index, actions   



    def reward_func(self, state, action, Time_matrix):
        """Takes in state, action and Time-matrix and returns the reward"""
        if action[0] == action[1]:
            reward = -C 
            return reward

        #print('reward:' ,state, action)
        p = action[0]
        q = action[1]
        i = state[0]
        time = state[1]
        day = state[2]
        #print('reward vals:', (p, q, i, time, day))
        t_pq = Time_matrix[p][q][time][day]
        t_ip = Time_matrix[i][p][time][day]
        
        
        reward = (R*t_pq)-(C*(t_pq+t_ip))
        return reward


    def next_state_func(self, state, action, Time_matrix):
        """Takes state and action as input and returns next state"""
        
        #print('next_state :', state, action)
        p = action[0]
        q = action[1]
        i = state[0]
        time_curr = state[1]
        day_curr = state[2]
        #print('next_state_vals :', (p, q, i, time_curr, day_curr))
        time_next = time_curr + Time_matrix[p][q][time_curr][day_curr]

        day_next = int((day_curr+int(time_next/24)) % 7)
        time_next = int(time_next % 24)
            
        next_state = (q,time_next,day_next)
        return next_state


    def reset(self):
        return self.action_space, self.state_space, self.state_init


#### Defining Time Matrix

In [3]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [4]:
#Defining a function to save the Q-dictionary as a pickle file
def save_pickle(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
class DQNAgent:
    def __init__(self, state_size, action_size, discount_factor=0.95, learning_rate=0.01,
                       epsilon=0.99, epsilon_decay=0.99, epsilon_min=0.01):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate        
        self.epsilon_max = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.model_history = None
        
        self.batch_size = 32
        #self.batch_size = 1
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
    

    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        # Write your code here: Add layers to your neural nets       

        # hidden layers
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))     
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model


    def get_action(self, cstate, all_actions, pos_act_ind):
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment
        actions = all_actions[pos_act_ind]
        q_value = 0
        if np.random.rand() <= self.epsilon_max:
            # explore: choose a random action from all possible actions
            print('Exploring')
            action = random.choice(actions)
        else:
            # choose the action with the highest q(s, a)
            # the first index corresponds to the batch size, so
            # reshape state to (1, state_size) so that the first index corresponds to the batch size
            print('Exploiting')
            #cstate = cstate.reshape(1, self.state_size) 
            q_value = self.model.predict(x=cstate)
            max_index = np.argmax(q_value[0])
            action = all_actions[max_index] if max_index in pos_act_ind else random.choice(actions)
        print('Selected action ', action)    
        return action, q_value
        

    def append_sample(self, state, action, reward, next_state, done):
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state, done))
    
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            
            actions, rewards, done = [], [], []
            
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = mini_batch[i]
                
                # Write your code from here
                # 1. Predict the target from earlier model
                update_input[i] = env.state_encod_arch1(state)
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)
                
            # 2. Get the target for the Q-network
            
            target = self.model.predict(update_input)
            target_qval = self.model.predict(update_output)
            #print(target, target.shape)
            #print(target.shape, target_qval.shape)

            #3. Update your 'update_output' and 'update_input' batch
            for i in range(self.batch_size):
                #print(i, actions[i])
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                    #target[i] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])
                    #target[i] = rewards[i] + self.discount_factor * np.max(target_qval[i])
                
        # 4. Fit your model and track the loss values
            return self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=1)
            
            
    def save(self, name):
        self.model.save(name)

In [None]:
# to store rewards in each episode
rewards_per_episode, episodes, q_vals_per_episode, loss = [], [], [], []

# make dir to store model weights
if not os.path.exists("saved_model_weights"):
    os.mkdir("saved_model_weights")

# n_episodes
n_episodes = 150

### DQN block

In [None]:
 # Call all the initialised variables of the environment
env = CabDriver()
#Call the DQN agent
dqn = DQNAgent(env.state_size, env.action_size)

for episode in range(n_episodes):

    # Write code here
    # Call the environment
   
    _,_,curr_state = env.reset()
    state_size = env.state_size
    pos_act_ind, actions = env.requests(curr_state)
    action = random.choice(actions)
    #action_size = len(actions)
    reward = 0
    curr_time = 0
    q_val_list = []
    #print(curr_state)
    
    
    terminal_state = False
    print("Episode :", episode)
    
    while not terminal_state:
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        encoded_state = env.state_encod_arch1(curr_state)
        #encoded_state = np.reshape(encoded_state, [1, env.state_size])
        action, q_value = dqn.get_action(encoded_state, env.action_space, pos_act_ind)
        # 2. Evaluate your reward and next state
        reward = reward + env.reward_func(curr_state, action, Time_matrix)
        next_state = env.next_state_func(curr_state,action,Time_matrix)
        
        q_val_list.append(q_value)
        
        p = action[0]
        q = action[1]
        i = curr_state[0]
        time = curr_state[1]
        day = curr_state[2]
        print(action)
        curr_time = curr_time + Time_matrix[i][p][time][day]
        
        
        day = int((day+int(time/24)) % 7)
        time = int(time % 24)
        
        curr_time = curr_time + Time_matrix[p][q][time][day]
        day = int((day+int(time/24)) % 7)
        time = int(time % 24)
        # 3. Append the experience to the memory
        dqn.append_sample(curr_state, action, reward, next_state, terminal_state)
        curr_state = next_state
        
        # 4. Train the model by calling function agent.train_model
        history = dqn.train_model()
        # 5. Keep a track of rewards, Q-values, loss
        
        if history:
            print('rewards: ', reward, 'q-value: ', q_value)
            print('loss:', history.history['loss'])
        
        if curr_time >= 24*30:
            terminal_state = True
    
        # store total reward obtained in this episode
    rewards_per_episode.append(reward)
    episodes.append(episode)
        
    if dqn.epsilon_max > dqn.epsilon_min:
        dqn.epsilon_max *= dqn.epsilon_decay        
        
        

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                26912     
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_3 (Dense)              (None, 21)                693       
Total params: 28,661
Trainable params: 28,661
Non-trainable params: 0
_________________________________________________________________
Number of actions available 13
Episode : 0
Exploring
Selected action  [4 0]
[4 0]
Exploring
Selected action  [2 3]
[2 3]
Exploring
Selected action  [4 0]
[4 0]
Exploiting
Selected action  [1 0]
[1 0]
Exploring
Selected action  [0 0]
[0 0]
Exploring
Selected action  [4 0]
[4 0]
Exploring
Selected action  [4 0]
[4 0]
Exploring
Selected action  [2 3]
[2 3]
Exploring
Selected action  [0 1]
[0 1]
Explori

rewards:  51.0 q-value:  0
loss: [617.7643432617188]
Exploring
Selected action  [0 4]
[0 4]
Epoch 1/1
rewards:  35.0 q-value:  0
loss: [510.9819641113281]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  39.0 q-value:  0
loss: [546.4935302734375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  22.0 q-value:  0
loss: [464.7519226074219]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [630.7761840820312]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [540.7274169921875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [584.9932861328125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [607.35009765625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -2.0 q-value:  0
loss: [520.1939086914062]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -7.0 q-value:  0
loss: [571.8206787109375]
Exploring
Selected action  [2 3

rewards:  7.0 q-value:  0
loss: [577.791748046875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [359.10540771484375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  25.0 q-value:  0
loss: [367.2469787597656]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  37.0 q-value:  0
loss: [307.2422790527344]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  63.0 q-value:  0
loss: [305.54937744140625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  36.0 q-value:  0
loss: [426.3515319824219]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  25.0 q-value:  0
loss: [419.0176086425781]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  33.0 q-value:  0
loss: [345.83709716796875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  18.0 q-value:  0
loss: [407.51824951171875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  12.0 q-value:  0
loss: [511.7124938964844]
Exploring
Selected action  

rewards:  -80.0 q-value:  0
loss: [251.00128173828125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [409.08404541015625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -61.0 q-value:  0
loss: [329.48828125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -62.0 q-value:  0
loss: [379.33367919921875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -55.0 q-value:  0
loss: [399.9736328125]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -37.0 q-value:  0
loss: [396.61041259765625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -38.0 q-value:  0
loss: [340.7107849121094]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [352.65057373046875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -64.0 q-value:  0
loss: [302.4189453125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -57.0 q-value:  0
loss: [331.82257080078125]
Exploring
Selected actio

Epoch 1/1
rewards:  -70.0 q-value:  0
loss: [426.0220031738281]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -84.0 q-value:  0
loss: [352.2650146484375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -107.0 q-value:  0
loss: [583.6470947265625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -75.0 q-value:  0
loss: [527.897216796875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -73.0 q-value:  0
loss: [514.6438598632812]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -75.0 q-value:  0
loss: [428.7249755859375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -76.0 q-value:  0
loss: [425.3857727050781]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -93.0 q-value:  0
loss: [332.1543273925781]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -96.0 q-value:  0
loss: [453.39117431640625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [462.67498779296875]
Explor

rewards:  -39.0 q-value:  0
loss: [648.5897216796875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -40.0 q-value:  0
loss: [806.7388916015625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -42.0 q-value:  0
loss: [773.535400390625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [643.415283203125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [598.06689453125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -56.0 q-value:  0
loss: [748.455322265625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -66.0 q-value:  0
loss: [419.282958984375]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -74.0 q-value:  0
loss: [342.58056640625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -79.0 q-value:  0
loss: [523.078369140625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -80.0 q-value:  0
loss: [444.6087951660156]
Exploring
Selected action  [

rewards:  -163.0 q-value:  0
loss: [514.30322265625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -168.0 q-value:  0
loss: [727.4567260742188]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -198.0 q-value:  0
loss: [538.0408935546875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -203.0 q-value:  0
loss: [492.1756591796875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -233.0 q-value:  0
loss: [401.3697509765625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -238.0 q-value:  0
loss: [671.3292236328125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -228.0 q-value:  0
loss: [616.41796875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -233.0 q-value:  0
loss: [535.7280883789062]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -238.0 q-value:  0
loss: [859.252685546875]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -229.0 q-value:  [[0.         0.         0.         0.       

rewards:  -326.0 q-value:  0
loss: [1349.9732666015625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -328.0 q-value:  0
loss: [1436.4345703125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -334.0 q-value:  0
loss: [903.8946533203125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -339.0 q-value:  0
loss: [1280.91748046875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -324.0 q-value:  0
loss: [1131.545654296875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -329.0 q-value:  0
loss: [1906.839599609375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -334.0 q-value:  0
loss: [1838.4366455078125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -333.0 q-value:  0
loss: [1493.201416015625]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -337.0 q-value:  0
loss: [1089.06787109375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -337.0 q-value:  0
loss: [1173.076171875]
Exploring
Sele

rewards:  37.0 q-value:  0
loss: [2089.46728515625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  41.0 q-value:  0
loss: [1711.535888671875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  52.0 q-value:  0
loss: [2009.9365234375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  68.0 q-value:  0
loss: [1310.77880859375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  63.0 q-value:  0
loss: [2036.1337890625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  57.0 q-value:  0
loss: [1852.6082763671875]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  70.0 q-value:  0
loss: [1024.729248046875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  82.0 q-value:  0
loss: [833.9938354492188]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  80.0 q-value:  0
loss: [1123.703369140625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  52.0 q-value:  0
loss: [2420.62060546875]
Exploring
Selected action  [2 0]
[2

rewards:  130.0 q-value:  0
loss: [1202.1253662109375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  138.0 q-value:  0
loss: [1007.8177490234375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  146.0 q-value:  0
loss: [1252.4088134765625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  118.0 q-value:  0
loss: [2556.20556640625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  101.0 q-value:  0
loss: [1938.9620361328125]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  111.0 q-value:  0
loss: [1896.87060546875]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  108.0 q-value:  0
loss: [1672.140625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  103.0 q-value:  0
loss: [2218.39794921875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  139.0 q-value:  0
loss: [1221.81982421875]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  137.0 q-value:  0
loss: [1800.90380859375]
Exploring
Selected action 

rewards:  9.0 q-value:  0
loss: [2092.482421875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  7.0 q-value:  0
loss: [879.9117431640625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  5.0 q-value:  0
loss: [1648.11279296875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  0.0 q-value:  0
loss: [1271.926025390625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  2.0 q-value:  0
loss: [1350.822998046875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -3.0 q-value:  0
loss: [1056.065185546875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -3.0 q-value:  0
loss: [1345.2642822265625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -8.0 q-value:  0
loss: [1763.124755859375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -24.0 q-value:  [[0.         0.         0.         0.         0.         0.51968837
  0.         0.         0.         0.         0.         0.02113765
  0.08826232 0.         0.   

rewards:  -126.0 q-value:  0
loss: [1123.5205078125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -118.0 q-value:  0
loss: [1894.4312744140625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -146.0 q-value:  0
loss: [1243.51025390625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -162.0 q-value:  0
loss: [1751.6773681640625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -150.0 q-value:  0
loss: [1712.550048828125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -161.0 q-value:  0
loss: [1109.127197265625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -162.0 q-value:  0
loss: [1659.268310546875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -167.0 q-value:  0
loss: [1070.7366943359375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -168.0 q-value:  0
loss: [1288.6268310546875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -173.0 q-value:  0
loss: [2081.496337890625]
Explorin

rewards:  -211.0 q-value:  0
loss: [1664.321044921875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -181.0 q-value:  0
loss: [1689.873779296875]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -183.0 q-value:  0
loss: [2106.77734375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  -177.0 q-value:  0
loss: [1221.0916748046875]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -157.0 q-value:  0
loss: [1819.580810546875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -183.0 q-value:  0
loss: [1369.6341552734375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -163.0 q-value:  0
loss: [1976.65185546875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -170.0 q-value:  0
loss: [1116.88330078125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -196.0 q-value:  0
loss: [1415.19091796875]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -219.0 q-value:  [[0.         0.         0.         0.    

Epoch 1/1
rewards:  -307.0 q-value:  0
loss: [2042.7998046875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -312.0 q-value:  0
loss: [1495.6258544921875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -313.0 q-value:  0
loss: [1414.6922607421875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -339.0 q-value:  0
loss: [1469.0213623046875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -344.0 q-value:  0
loss: [1382.2840576171875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -349.0 q-value:  0
loss: [1067.114990234375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -375.0 q-value:  0
loss: [2146.242919921875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -376.0 q-value:  0
loss: [1014.6390380859375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -402.0 q-value:  0
loss: [1586.59912109375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -409.0 q-value:  0
loss: [1323.99816894531

Epoch 1/1
rewards:  -642.0 q-value:  0
loss: [2070.745849609375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -644.0 q-value:  0
loss: [5063.6455078125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -649.0 q-value:  0
loss: [3868.06787109375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -661.0 q-value:  0
loss: [3591.516357421875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -687.0 q-value:  0
loss: [2835.349609375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -699.0 q-value:  0
loss: [4895.208984375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -701.0 q-value:  0
loss: [3346.6943359375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -705.0 q-value:  0
loss: [2102.1328125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -709.0 q-value:  0
loss: [1572.3016357421875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -714.0 q-value:  0
loss: [1452.23681640625]
Exploring
Selec

rewards:  -935.0 q-value:  0
loss: [9288.0166015625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -940.0 q-value:  0
loss: [5048.6484375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -967.0 q-value:  0
loss: [9211.76171875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -972.0 q-value:  0
loss: [2399.3974609375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -973.0 q-value:  0
loss: [4121.72314453125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -978.0 q-value:  0
loss: [6443.974609375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -1005.0 q-value:  0
loss: [7986.3251953125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1010.0 q-value:  0
loss: [8360.287109375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1011.0 q-value:  0
loss: [7491.3955078125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1012.0 q-value:  0
loss: [6839.31787109375]
Exploring
Selected action  [4 

Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -1267.0 q-value:  0
loss: [12462.1806640625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -1268.0 q-value:  0
loss: [7476.537109375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -1269.0 q-value:  0
loss: [9534.08203125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1274.0 q-value:  0
loss: [12396.740234375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1280.0 q-value:  0
loss: [19556.5]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -1285.0 q-value:  0
loss: [17115.310546875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1286.0 q-value:  0
loss: [8237.20703125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -1286.0 q-value:  0
loss: [13666.494140625]
Exploiting
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -1314.0 q-value:  [[0.         0.         0.         0.         0.         0.38581082
  0.         0.         0.         0.     

Epoch 1/1
rewards:  -119.0 q-value:  0
loss: [17443.60546875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -100.0 q-value:  0
loss: [4112.1298828125]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -109.0 q-value:  [[0.         0.         0.         0.         0.         0.4704502
  0.         0.         0.         0.         0.         0.
  0.07857762 0.         0.         0.         0.10673242 0.15297358
  0.         0.04857763 0.        ]]
loss: [10053.0361328125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -149.0 q-value:  0
loss: [16113.88671875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -151.0 q-value:  0
loss: [17444.2578125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -154.0 q-value:  0
loss: [10673.5205078125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -159.0 q-value:  0
loss: [3551.881591796875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  -160.0 q-value:  0
loss: [5620.9433

Epoch 1/1
rewards:  -160.0 q-value:  0
loss: [15332.1181640625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -149.0 q-value:  0
loss: [15560.673828125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [6780.6279296875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -146.0 q-value:  0
loss: [15957.904296875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -143.0 q-value:  0
loss: [10301.013671875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -147.0 q-value:  0
loss: [9711.62890625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -147.0 q-value:  0
loss: [10928.478515625]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -151.0 q-value:  0
loss: [8230.8662109375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -127.0 q-value:  0
loss: [13998.859375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -132.0 q-value:  0
loss: [2256.2587890625]
Exploring
Selected acti

rewards:  29.0 q-value:  0
loss: [11980.841796875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  27.0 q-value:  0
loss: [19258.95703125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  22.0 q-value:  0
loss: [2746.66845703125]
Number of actions available 5
Episode : 7
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -5 q-value:  0
loss: [14690.015625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  18.0 q-value:  0
loss: [8417.857421875]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  8.0 q-value:  [[0.         0.         0.         0.         0.         0.3282604
  0.         0.         0.         0.         0.         0.01830234
  0.04670671 0.         0.         0.         0.05038361 0.17151774
  0.         0.         0.        ]]
loss: [7095.447265625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -10.0 q-value:  0
loss: [9876.390625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -2.0 q-value:  0
los

Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [7710.76708984375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -20.0 q-value:  0
loss: [18330.486328125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -22.0 q-value:  0
loss: [6374.7822265625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -36.0 q-value:  0
loss: [14062.4814453125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [11935.4375]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [8519.3984375]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -23.0 q-value:  0
loss: [19180.326171875]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -15.0 q-value:  0
loss: [8344.509765625]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -7.0 q-value:  0
loss: [10569.51171875]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  17.0 q-value:  0
loss: [12961.4921875]
Exploring
Selected action  [4 3]
[4 3]
Ep

Epoch 1/1
rewards:  -208.0 q-value:  0
loss: [17190.095703125]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -210.0 q-value:  0
loss: [3653.19580078125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -188.0 q-value:  0
loss: [4307.5634765625]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -188.0 q-value:  0
loss: [6491.158203125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -188.0 q-value:  0
loss: [1337.1578369140625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -193.0 q-value:  [[0.         0.         0.         0.         0.         0.24762435
  0.         0.         0.         0.         0.         0.
  0.06405261 0.         0.         0.         0.05863293 0.09963582
  0.         0.         0.        ]]
loss: [6031.81982421875]
Exploring
Selected action  [4 3]
[4 3]
Epoch 1/1
rewards:  -169.0 q-value:  0
loss: [8410.271484375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -174.0 q-value:  0
loss: [4707.

Epoch 1/1
rewards:  -38.0 q-value:  0
loss: [18031.68359375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -43.0 q-value:  0
loss: [7309.63720703125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -48.0 q-value:  0
loss: [15930.8349609375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [15264.693359375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [10566.5859375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -13.0 q-value:  0
loss: [10902.43359375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -17.0 q-value:  0
loss: [6934.134765625]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -39.0 q-value:  0
loss: [8791.923828125]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -55.0 q-value:  0
loss: [4025.62158203125]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -57.0 q-value:  [[0.         0.         0.         0.         0.         

Epoch 1/1
rewards:  -57.0 q-value:  0
loss: [11698.3642578125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -60.0 q-value:  0
loss: [4254.3564453125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -54.0 q-value:  0
loss: [16810.9609375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -58.0 q-value:  0
loss: [14412.12109375]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -81.0 q-value:  0
loss: [6876.1826171875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -86.0 q-value:  0
loss: [10715.0859375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -92.0 q-value:  0
loss: [19869.2734375]
Exploiting
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -89.0 q-value:  [[0.         0.         0.         0.         0.         0.32398278
  0.         0.         0.         0.         0.         0.03793438
  0.03541692 0.         0.         0.         0.07331775 0.11581267
  0.         0.         0.        ]]
loss: [7840.577148437

rewards:  -99.0 q-value:  0
loss: [10080.1923828125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -102.0 q-value:  0
loss: [8080.076171875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -102.0 q-value:  0
loss: [8682.1796875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -104.0 q-value:  0
loss: [9072.3388671875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -96.0 q-value:  0
loss: [18341.724609375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -128.0 q-value:  0
loss: [12241.65625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -133.0 q-value:  0
loss: [12514.2880859375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -138.0 q-value:  0
loss: [7319.41796875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -141.0 q-value:  0
loss: [13349.9560546875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -134.0 q-value:  0
loss: [26393.01953125]
Exploring
Selected action  [2 0]
[2 0]


Epoch 1/1
rewards:  32.0 q-value:  [[0.         0.         0.         0.         0.         0.4919832
  0.         0.         0.         0.         0.         0.00377483
  0.11554367 0.         0.01612853 0.         0.05664667 0.2897812
  0.         0.00527227 0.        ]]
loss: [9270.939453125]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  30.0 q-value:  0
loss: [11085.7490234375]
Exploiting
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  30.0 q-value:  [[0.         0.         0.         0.         0.         0.30454162
  0.         0.         0.         0.         0.         0.
  0.05935901 0.         0.         0.         0.01068665 0.12018364
  0.         0.         0.        ]]
loss: [14025.73046875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  33.0 q-value:  0
loss: [10807.232421875]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  41.0 q-value:  0
loss: [6569.50390625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  18.0 q-value: 

Epoch 1/1
rewards:  -31.0 q-value:  0
loss: [8474.123046875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -21.0 q-value:  0
loss: [16111.9375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -9.0 q-value:  0
loss: [4909.8193359375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -12.0 q-value:  0
loss: [4353.44482421875]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  14.0 q-value:  0
loss: [12009.951171875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  19.0 q-value:  0
loss: [8362.509765625]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  13.0 q-value:  0
loss: [14701.919921875]
Exploiting
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  4.0 q-value:  [[0.         0.         0.         0.         0.         0.4248218
  0.         0.         0.         0.         0.         0.02002457
  0.04820918 0.         0.         0.         0.09574321 0.14260952
  0.         0.         0.        ]]
loss: [6768.3583984375]
Expl

rewards:  -23.0 q-value:  0
loss: [7468.7265625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [12083.94921875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -41.0 q-value:  0
loss: [13324.4296875]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -54.0 q-value:  0
loss: [14101.55859375]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [4559.8798828125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -48.0 q-value:  0
loss: [7548.57666015625]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -66.0 q-value:  0
loss: [10170.5390625]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -63.0 q-value:  0
loss: [19839.46484375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -41.0 q-value:  0
loss: [11145.9365234375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -46.0 q-value:  0
loss: [13833.1875]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rew

Epoch 1/1
rewards:  16.0 q-value:  0
loss: [13711.6611328125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  11.0 q-value:  0
loss: [11679.90234375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  2.0 q-value:  0
loss: [12689.841796875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [1453.0740966796875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -17.0 q-value:  0
loss: [4354.6630859375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  3.0 q-value:  0
loss: [6462.0517578125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -20.0 q-value:  0
loss: [13832.3828125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -25.0 q-value:  0
loss: [8422.6064453125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -30.0 q-value:  0
loss: [2546.46728515625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -22.0 q-value:  0
loss: [17718.830078125]
Exploring
Selected action  [3 1]
[

rewards:  -96.0 q-value:  0
loss: [1963.2783203125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -112.0 q-value:  0
loss: [12711.4521484375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -88.0 q-value:  0
loss: [7770.0390625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -94.0 q-value:  0
loss: [9614.46875]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -96.0 q-value:  0
loss: [2305.053466796875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -68.0 q-value:  0
loss: [12939.640625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -95.0 q-value:  0
loss: [6651.3798828125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -97.0 q-value:  0
loss: [3866.665771484375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -99.0 q-value:  0
loss: [15821.060546875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -101.0 q-value:  0
loss: [2437.1025390625]
Exploring
Selected action  [1 3]
[1 3]
Epoc

rewards:  -175.0 q-value:  0
loss: [11944.5537109375]
Exploiting
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -163.0 q-value:  [[0.         0.         0.         0.         0.         0.37238088
  0.         0.         0.         0.         0.         0.
  0.0678615  0.         0.         0.         0.0914225  0.13212939
  0.         0.         0.        ]]
loss: [11695.435546875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -168.0 q-value:  0
loss: [17229.87890625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -148.0 q-value:  0
loss: [10754.4365234375]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  -158.0 q-value:  0
loss: [7554.65576171875]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -122.0 q-value:  0
loss: [6511.1630859375]
Exploring
Selected action  [1 0]
[1 0]
Epoch 1/1
rewards:  -126.0 q-value:  0
loss: [3628.8818359375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -131.0 q-value:  0
loss: [13811.61328125]


Epoch 1/1
rewards:  85.0 q-value:  0
loss: [15208.4384765625]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  110.0 q-value:  0
loss: [11142.5361328125]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  114.0 q-value:  0
loss: [10950.4658203125]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  150.0 q-value:  [[0.         0.         0.         0.         0.         0.29793623
  0.         0.         0.         0.         0.         0.
  0.05579024 0.         0.         0.         0.01358375 0.11403672
  0.         0.         0.        ]]
loss: [7042.3525390625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  146.0 q-value:  0
loss: [15685.65625]
Exploring
Selected action  [3 4]
[3 4]
Epoch 1/1
rewards:  160.0 q-value:  0
loss: [14203.25]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  179.0 q-value:  0
loss: [7054.083984375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  173.0 q-value:  0
loss: [13218.177734375]
Exploring

Epoch 1/1
rewards:  176.0 q-value:  0
loss: [10096.525390625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  141.0 q-value:  0
loss: [3717.95263671875]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  140.0 q-value:  0
loss: [2037.79833984375]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1
rewards:  139.0 q-value:  0
loss: [9778.2939453125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  163.0 q-value:  0
loss: [12503.23046875]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  161.0 q-value:  [[0.         0.         0.         0.         0.         0.34772414
  0.         0.         0.         0.         0.         0.00770267
  0.0809109  0.         0.         0.         0.06697267 0.14964747
  0.         0.         0.        ]]
loss: [6177.13330078125]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  159.0 q-value:  0
loss: [3899.486328125]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  191.0 q-value:  0
loss: [1292.36

rewards:  236.0 q-value:  [[0.         0.         0.         0.         0.         0.3609627
  0.         0.         0.         0.         0.         0.0036481
  0.09235047 0.         0.         0.         0.05774345 0.22048181
  0.         0.0085496  0.        ]]
loss: [17877.109375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  235.0 q-value:  [[0.         0.         0.         0.         0.         0.55208457
  0.         0.         0.         0.         0.         0.03216945
  0.13238113 0.         0.         0.         0.17012155 0.22591165
  0.         0.         0.        ]]
loss: [1648.578369140625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  227.0 q-value:  0
loss: [7003.4873046875]
Number of actions available 9
Episode : 12
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  26.0 q-value:  [[0.         0.         0.         0.         0.         0.47776443
  0.         0.         0.         0.         0.         0.04733036
  0.07174876 0. 

Epoch 1/1
rewards:  5.0 q-value:  0
loss: [3528.08984375]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -10.0 q-value:  0
loss: [10100.1240234375]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  2.0 q-value:  [[0.         0.         0.         0.         0.         0.43471104
  0.         0.         0.         0.         0.         0.
  0.08378413 0.         0.         0.         0.07959609 0.17669404
  0.         0.04391154 0.        ]]
loss: [4363.583984375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -3.0 q-value:  0
loss: [17081.67578125]
Exploring
Selected action  [3 0]
[3 0]
Epoch 1/1
rewards:  -4.0 q-value:  0
loss: [13819.068359375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  30.0 q-value:  0
loss: [20235.240234375]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  17.0 q-value:  0
loss: [11654.81640625]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  11.0 q-value:  0
loss: [8539.3798828125]
Exploring
Se

Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [4991.85302734375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -49.0 q-value:  0
loss: [2622.24560546875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -76.0 q-value:  0
loss: [8648.89453125]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  -78.0 q-value:  0
loss: [6899.0908203125]
Exploring
Selected action  [4 2]
[4 2]
Epoch 1/1
rewards:  -74.0 q-value:  0
loss: [7831.33837890625]
Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -77.0 q-value:  0
loss: [8614.5361328125]
Exploring
Selected action  [0 3]
[0 3]
Epoch 1/1
rewards:  -88.0 q-value:  0
loss: [7970.60009765625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  -86.0 q-value:  [[0.         0.         0.         0.         0.         0.3282604
  0.         0.         0.         0.         0.         0.01830234
  0.04670671 0.         0.         0.         0.05038361 0.17151774
  0.         0.         0.        ]]
loss: [7900.34

Exploring
Selected action  [3 2]
[3 2]
Epoch 1/1
rewards:  -72.0 q-value:  0
loss: [5802.55029296875]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -108.0 q-value:  0
loss: [6534.36083984375]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewards:  -109.0 q-value:  0
loss: [2512.0830078125]
Exploiting
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -139.0 q-value:  [[0.         0.         0.         0.         0.         0.33562374
  0.         0.         0.         0.         0.         0.
  0.03711441 0.         0.         0.         0.04241926 0.12336116
  0.         0.         0.        ]]
loss: [7872.83935546875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -147.0 q-value:  0
loss: [5983.015625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -158.0 q-value:  0
loss: [16314.8037109375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -161.0 q-value:  0
loss: [4954.14306640625]
Exploring
Selected action  [1 3]
[1 3]
Epoch 1/1
rewar

Epoch 1/1
rewards:  -193.0 q-value:  0
loss: [5393.83984375]
Exploiting
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -198.0 q-value:  [[0.         0.         0.         0.         0.         0.24338704
  0.         0.         0.         0.         0.         0.
  0.03234629 0.         0.         0.00317863 0.02437754 0.09916785
  0.         0.         0.        ]]
loss: [8142.890625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -203.0 q-value:  0
loss: [4992.7568359375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -213.0 q-value:  0
loss: [9794.962890625]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -219.0 q-value:  0
loss: [6458.65771484375]
Exploring
Selected action  [4 1]
[4 1]
Epoch 1/1
rewards:  -249.0 q-value:  0
loss: [17188.720703125]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -289.0 q-value:  0
loss: [1749.696044921875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -294.0 q-value:  0
loss: [8617.4150390

rewards:  -256.0 q-value:  0
loss: [7693.49609375]
Exploring
Selected action  [2 3]
[2 3]
Epoch 1/1
rewards:  -259.0 q-value:  0
loss: [6601.60791015625]
Number of actions available 13
Episode : 14
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  26.0 q-value:  [[0.         0.         0.         0.         0.         0.47776443
  0.         0.         0.         0.         0.         0.04733036
  0.07174876 0.         0.         0.         0.11875437 0.17243308
  0.         0.01938195 0.        ]]
loss: [4127.15673828125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  34.0 q-value:  0
loss: [6362.384765625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  32.0 q-value:  0
loss: [3953.9931640625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  76.0 q-value:  [[0.         0.         0.         0.         0.         0.31518963
  0.         0.         0.         0.         0.         0.
  0.03617282 0.         0.         0.         0.         0.

Epoch 1/1
rewards:  49.0 q-value:  0
loss: [1308.037841796875]
Exploring
Selected action  [1 4]
[1 4]
Epoch 1/1
rewards:  47.0 q-value:  0
loss: [2908.6318359375]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  27.0 q-value:  0
loss: [4754.994140625]
Exploring
Selected action  [3 1]
[3 1]
Epoch 1/1
rewards:  17.0 q-value:  0
loss: [15129.439453125]
Exploring
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  49.0 q-value:  0
loss: [11315.548828125]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  55.0 q-value:  0
loss: [7358.76611328125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  58.0 q-value:  0
loss: [10926.8720703125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  53.0 q-value:  0
loss: [982.032958984375]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  43.0 q-value:  0
loss: [15272.927734375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  38.0 q-value:  0
loss: [6052.84033203125]
Exploiting
Selected action  [1 2]


Epoch 1/1
rewards:  -44.0 q-value:  0
loss: [9478.92578125]
Exploring
Selected action  [0 2]
[0 2]
Epoch 1/1
rewards:  -29.0 q-value:  0
loss: [4909.138671875]
Exploring
Selected action  [2 1]
[2 1]
Epoch 1/1
rewards:  -1.0 q-value:  0
loss: [7357.939453125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -6.0 q-value:  0
loss: [3100.70166015625]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  18.0 q-value:  0
loss: [1479.650634765625]
Exploiting
Selected action  [1 2]
[1 2]
Epoch 1/1
rewards:  26.0 q-value:  [[0.         0.         0.         0.         0.         0.3646671
  0.         0.         0.         0.         0.         0.
  0.06910945 0.         0.         0.         0.065116   0.16416334
  0.         0.00672938 0.        ]]
loss: [8093.7099609375]
Exploring
Selected action  [0 1]
[0 1]
Epoch 1/1
rewards:  19.0 q-value:  0
loss: [9705.421875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  14.0 q-value:  0
loss: [6968.6689453125]
Exploring
S

rewards:  -42.0 q-value:  [[0.         0.         0.         0.         0.         0.17813192
  0.         0.         0.         0.         0.         0.
  0.0278997  0.         0.         0.01525487 0.         0.11890816
  0.         0.02513181 0.        ]]
loss: [5239.642578125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -47.0 q-value:  0
loss: [1915.7161865234375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -52.0 q-value:  0
loss: [7636.0634765625]
Exploiting
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -57.0 q-value:  [[0.         0.         0.         0.         0.         0.19964492
  0.         0.         0.         0.02480842 0.         0.
  0.00202443 0.         0.         0.04684129 0.         0.06943158
  0.         0.0444722  0.        ]]
loss: [1891.659912109375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -59.0 q-value:  0
loss: [5439.3984375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -64.0 q-value:  0
los

Epoch 1/1
rewards:  -210.0 q-value:  0
loss: [12611.2724609375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -214.0 q-value:  0
loss: [9326.6884765625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -234.0 q-value:  0
loss: [6908.3310546875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -239.0 q-value:  0
loss: [1026.363525390625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -259.0 q-value:  0
loss: [12550.1240234375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -259.0 q-value:  0
loss: [14257.0068359375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -259.0 q-value:  0
loss: [7130.068359375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -264.0 q-value:  0
loss: [6583.1044921875]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -268.0 q-value:  0
loss: [4431.1513671875]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -273.0 q-value:  0
loss: [11035.7939453125]
Exploring
Sele

rewards:  -492.0 q-value:  0
loss: [3106.478515625]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -498.0 q-value:  0
loss: [6620.6484375]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -503.0 q-value:  0
loss: [3329.0947265625]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -509.0 q-value:  [[0.         0.         0.         0.         0.         0.34437835
  0.         0.         0.         0.         0.         0.
  0.03924166 0.         0.         0.         0.0516016  0.11003996
  0.         0.03479169 0.        ]]
loss: [4010.63818359375]
Exploiting
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -515.0 q-value:  [[0.         0.         0.         0.         0.         0.39173502
  0.         0.         0.         0.         0.         0.
  0.07012677 0.         0.02291518 0.         0.09454925 0.14729641
  0.         0.03072543 0.        ]]
loss: [5476.365234375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -518.0 q-value:  0
l

rewards:  -659.0 q-value:  0
loss: [3184.498291015625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -670.0 q-value:  0
loss: [11315.5166015625]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -671.0 q-value:  0
loss: [2319.91943359375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -679.0 q-value:  0
loss: [11135.900390625]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -684.0 q-value:  0
loss: [6374.4677734375]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -691.0 q-value:  0
loss: [3132.19677734375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -689.0 q-value:  0
loss: [3817.704345703125]
Exploring
Selected action  [2 4]
[2 4]
Epoch 1/1
rewards:  -709.0 q-value:  0
loss: [6197.205078125]
Exploring
Selected action  [0 0]
[0 0]
Epoch 1/1
rewards:  -714.0 q-value:  0
loss: [6917.599609375]
Exploring
Selected action  [2 0]
[2 0]
Epoch 1/1
rewards:  -718.0 q-value:  0
loss: [5445.0087890625]
Exploring
Selected actio

Epoch 1/1
rewards:  -11.0 q-value:  0
loss: [3469.49072265625]
Exploring
Selected action  [4 0]
[4 0]
Epoch 1/1


In [None]:
# make directory
if not os.path.exists("saved_pickle_files"):
    os.mkdir("saved_pickle_files")

# save rewards_per_episode
save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")


# plot results
with open('saved_pickle_files/rewards_per_episode.pkl', 'rb') as f:
    rewards_per_episode = pickle.load(f)

plt.plot(list(range(len(rewards_per_episode))), rewards_per_episode)
plt.xlabel("episode number")
plt.ylabel("reward per episode")

# save plots in saved_plots/ directory
plt.savefig('rewards.png')

print("Average reward of last 100 episodes is {0}".format(np.mean(rewards_per_episode[-100:])))

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()