### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import random
import math
from collections import deque
import collections
import pickle
import os
import time


# for building DQN model 
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

In [3]:
Time_matrix.shape

(5, 5, 24, 7)

In [4]:
Time_matrix[1][2][11][3]

10.0

In [5]:
Time_matrix.max()

11.0

Maximum time to travel between two locations is 11 hours (next state can change by max of 1 day)

#### Tracking the state-action pairs for checking convergence


In [6]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### TODO  : Picking the architecture

#### Architecture 1: input - state, output - q-value for each possible (state, action)

Pros:
- Model has to be run once for each state  

Cons:
    

#### Architecture 2: input - (state, action), output - q-value for the given (state, action)

#### Conclusion

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [7]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN 
        self.discount_factor = 0.99
        self.learning_rate = 0.001       
        self.epsilon_max = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.00000001
        self.epsilon = 1
        
        
    
    
        self.explore_count = 0
        self.exploit_count = 0
        
        self.batch_size = 32        
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        
        
        
        

    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        
        # input layer
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform')) 
        
        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform')) 
        
        
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model

     

    def get_action(self, state, episode_count):
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment       
        
        
#         exp_value = 1/10**(len(str(episode_count))-1)
#         self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min) * np.exp(-exp_value*episode_count)
    
        
        
        if np.random.rand() <= self.epsilon:
            # exploration: choose a random action from all possible actions
            ##print(f"{episode_count}: EXPLORE")
            self.explore_count += 1
            return random.randrange(self.action_size)

        else:
            # exploitation: choose the action that returns the maximum q-value
            ##print(f"{episode_count}: EXPLOIT")
            
            
#             # the first index corresponds to the batch size, so
#             # reshape state to (1, state_size) so that the first index corresponds to the batch size 
 
            state = np.reshape(state,[1,self.state_size])
            q_value = self.model.predict(state) 
            self.exploit_count += 1
            return np.argmax(q_value)
         
 
   
    def get_summary_details(self):
        return self.explore_count, self.exploit_count
        

        
    def reset_episode_counts(self):
        self.explore_count = 0
        self.exploit_count = 0

    def append_sample(self, state, action_idx, reward, next_state, done):
    # Write your code here:
    # save sample <s,a,r,s',done> to the replay memory after every action
        self.memory.append((state, action_idx, reward, next_state, done))
        
        
#         # update ε after each sample
#         if self.epsilon > self.epsilon_min:
#             self.epsilon *= self.epsilon_decay    
    
    

    # Pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """
        train the neural network on a minibatch. Input to the network is the states,
        output is the target q-value corresponding to each action.
        """

        if len(self.memory) > self.batch_size:
            
            # sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)

            # initialise two matrices - update_input and update_output
            # Initialise the Q(s,a) with zero
            update_input = np.zeros((self.batch_size, self.state_size))
            # Initialise the Q(s',a)
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []

            # populate update_input and update_output and the lists rewards, actions, done
            for i in range(self.batch_size):
                state, action_idx, reward, next_state, is_done = minibatch[i]
                #print(f'4DEBUG:minibatch[{i}] = {minibatch[i]}')
                # Add state s to the Q(s,a), Q(s',a) from memory
                update_input[i] = state
                # Add action from memory
                actions.append(action_idx)
                # Add reward from the memory
                rewards.append(reward)
                # Add next state s' to Q(s',a) from the memory
                update_output[i] = next_state
                done.append(is_done)

            # Find the Q(s,a) and Q(s',a) using state as input to the neural network 
        
            # Predict the target q-values from state s
            target = self.model.predict(update_input)

            # Get the target for the Q-network
            target_qval = self.model.predict(update_output)
            
            #print(f'4DEBUG: target:{target}')
            #print(f'4DEBUG: target_qval:{target_qval}')

            # Update the target values - set the target as (r + maxQ(s',a))
            for i in range(self.batch_size):
                # Q Learning: get maximum Q value at s' from target model
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])

            # Train the model
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
     


    def save_model_weights(self, name):
        self.model.save_weights(name)

    def save(self, name):
        self.model.save(name)

In [8]:
Episodes = 1000 

### DQN block

In [9]:

env = CabDriver()

# get size of state and action from environment
state_size = len(env.state_space[0])
action_size = len(env.action_space)

print(f'State size: {state_size}, Action size: {action_size}')

# agent needs to be initialised outside the loop since the DQN
# network will be initialised along with the agent
agent = DQNAgent(action_size=action_size, state_size=state_size) 

# to store rewards in each episode
rewards_per_episode, episodes = [], []

# make dir to store model weights
if not os.path.exists("saved_model_weights"):
    os.mkdir("saved_model_weights")
 

summary_df = pd.DataFrame(columns=['EPISODE','REWARD','MEMORY_LENGTH','EPSILON','EXPLORE_CNT', 'EXPLOIT_CNT', 'EPISODE_TIME'])

summary_threshold = 1000
def update_summary_details(episode,reward, memory_len, epsilon, explore_count, exploit_count,episode_time):
    #print(f'Updating summary details:{match_results}')
    return summary_df.append({'EPISODE' : episode,
                       'REWARD' : reward,
                       'MEMORY_LENGTH' : memory_len,
                       'EPSILON' : epsilon,
                       'EXPLORE_CNT' : explore_count,
                       'EXPLOIT_CNT' : exploit_count,
                       'EPISODE_TIME': episode_time
                      }, ignore_index=True) 

State size: 3, Action size: 21





for episode in range(Episodes):

    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
    

    #Call the DQN agent
    
    
    while !terminal_state:
        
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        # 2. Evaluate your reward and next state
        # 3. Append the experience to the memory
        # 4. Train the model by calling function agent.train_model
        # 5. Keep a track of rewards, Q-values, loss
        

In [None]:
start_time = time.time()

#### simulation starts ####
for episode in range(Episodes):
    episode_start_time = time.time()
    done = False
    score = 0

    # reset at the start of each episode
    state = env.reset()

    while not done:

        # 1. Pick epsilon-greedy action from possible actions for the current state
        action_idx = agent.get_action(state,episode)
        action = env.action_space[action_idx]
        # 2. Evaluate your reward and next state
        next_state, reward, done = env.step(state, action, Time_matrix)
            
        #next_state, reward, done, info = env.step(action)

        # 3. Append the experience to the memory
        # save the sample <s, a, r, s', done> to the replay memory
        agent.append_sample(state, action_idx, reward, next_state, done)

        # 4. Train the model by calling function agent.train_model
        agent.train_model()

        # add reward to the total score of this episode
        score += reward
        state = next_state



    # epsilon decay
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon *= agent.epsilon_decay
#         agent.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min) * np.exp(agent.epsilon_decay*episode)

    # 5. Keep a track of rewards, Q-values, loss
    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)
    
    # Save summary after every episode
    explore_count, exploit_count = agent.get_summary_details()
    episode_time = time.time() - episode_start_time
    summary_df = update_summary_details(episode,score, len(agent.memory),round(agent.epsilon,5),explore_count, exploit_count,round(episode_time,3))
    agent.reset_episode_counts()
        
    # every episode:
    print(f"episode {episode}, reward {score}, memory_length {len(agent.memory)}, epsilon {round(agent.epsilon,5)}, explore:{explore_count}, exploit:{exploit_count},time: {round(episode_time,3)}")
        
    # every few episodes:
    if episode % 1000 == 0:
        # store q-values of some prespecified state-action pairs
        # q_dict = agent.store_q_values()
        
        
        curr_elapsed_time = time.time() - start_time
        print(f'Episodes:{episode} - Elapsed_time:{round(curr_elapsed_time,3)} ')  

        # save model weights
        agent.save_model_weights(name="model_weights.h5")

 
# save model weights
agent.save_model_weights(name="model_weights.h5")
    
elapsed_time = time.time() - start_time
print(f'Total elapsed_time:{elapsed_time}')       
    
        
#### simulation complete ####

episode 0, reward -180.0, memory_length 116, epsilon 0.999, explore count:116, exploit count:0,time: 7.303
Episodes:0 - Elapsed_time:7.305 
episode 1, reward -250.0, memory_length 239, epsilon 0.998, explore count:123, exploit count:0,time: 9.515
episode 2, reward 54.0, memory_length 351, epsilon 0.997, explore count:111, exploit count:1,time: 7.796
episode 3, reward 25.0, memory_length 481, epsilon 0.99601, explore count:130, exploit count:0,time: 10.407
episode 4, reward -201.0, memory_length 600, epsilon 0.99501, explore count:119, exploit count:0,time: 8.677
episode 5, reward 27.0, memory_length 708, epsilon 0.99401, explore count:107, exploit count:1,time: 7.504
episode 6, reward 134.0, memory_length 835, epsilon 0.99302, explore count:125, exploit count:2,time: 8.957
episode 7, reward 258.0, memory_length 950, epsilon 0.99203, explore count:115, exploit count:0,time: 8.05
episode 8, reward -185.0, memory_length 1062, epsilon 0.99104, explore count:112, exploit count:0,time: 7.668

episode 75, reward 132.0, memory_length 2000, epsilon 0.92678, explore count:103, exploit count:6,time: 8.61
episode 76, reward -27.0, memory_length 2000, epsilon 0.92585, explore count:113, exploit count:8,time: 8.743
episode 77, reward 54.0, memory_length 2000, epsilon 0.92493, explore count:110, exploit count:14,time: 9.763
episode 78, reward -77.0, memory_length 2000, epsilon 0.924, explore count:99, exploit count:12,time: 9.831
episode 79, reward -104.0, memory_length 2000, epsilon 0.92308, explore count:111, exploit count:7,time: 8.44
episode 80, reward -36.0, memory_length 2000, epsilon 0.92216, explore count:111, exploit count:10,time: 10.37
episode 81, reward 75.0, memory_length 2000, epsilon 0.92123, explore count:104, exploit count:8,time: 8.841
episode 82, reward -42.0, memory_length 2000, epsilon 0.92031, explore count:113, exploit count:8,time: 9.372
episode 83, reward 79.0, memory_length 2000, epsilon 0.91939, explore count:117, exploit count:13,time: 10.91
episode 84, r

episode 149, reward -282.0, memory_length 2000, epsilon 0.86064, explore count:98, exploit count:22,time: 8.952
episode 150, reward 135.0, memory_length 2000, epsilon 0.85978, explore count:100, exploit count:21,time: 8.998
episode 151, reward -273.0, memory_length 2000, epsilon 0.85892, explore count:104, exploit count:20,time: 9.178
episode 152, reward 426.0, memory_length 2000, epsilon 0.85806, explore count:107, exploit count:13,time: 8.55
episode 153, reward -233.0, memory_length 2000, epsilon 0.85721, explore count:109, exploit count:7,time: 8.17
episode 154, reward -5.0, memory_length 2000, epsilon 0.85635, explore count:100, exploit count:11,time: 7.798
episode 155, reward -267.0, memory_length 2000, epsilon 0.85549, explore count:106, exploit count:19,time: 9.145
episode 156, reward 36.0, memory_length 2000, epsilon 0.85464, explore count:108, exploit count:14,time: 8.861
episode 157, reward -32.0, memory_length 2000, epsilon 0.85378, explore count:105, exploit count:16,time: 

episode 223, reward -306.0, memory_length 2000, epsilon 0.79923, explore count:96, exploit count:19,time: 8.571
episode 224, reward 108.0, memory_length 2000, epsilon 0.79843, explore count:96, exploit count:33,time: 9.657
episode 225, reward -44.0, memory_length 2000, epsilon 0.79763, explore count:97, exploit count:22,time: 8.885
episode 226, reward -140.0, memory_length 2000, epsilon 0.79683, explore count:99, exploit count:32,time: 9.964
episode 227, reward -311.0, memory_length 2000, epsilon 0.79603, explore count:92, exploit count:27,time: 9.039
episode 228, reward -81.0, memory_length 2000, epsilon 0.79524, explore count:91, exploit count:21,time: 8.181
episode 229, reward -365.0, memory_length 2000, epsilon 0.79444, explore count:96, exploit count:19,time: 8.541
episode 230, reward -284.0, memory_length 2000, epsilon 0.79365, explore count:89, exploit count:30,time: 9.131
episode 231, reward -90.0, memory_length 2000, epsilon 0.79285, explore count:92, exploit count:25,time: 8.

episode 297, reward -93.0, memory_length 2000, epsilon 0.74219, explore count:111, exploit count:40,time: 11.58
episode 298, reward -252.0, memory_length 2000, epsilon 0.74145, explore count:98, exploit count:31,time: 9.828
episode 299, reward -419.0, memory_length 2000, epsilon 0.74071, explore count:84, exploit count:46,time: 10.24
episode 300, reward -95.0, memory_length 2000, epsilon 0.73997, explore count:100, exploit count:34,time: 10.047
episode 301, reward 239.0, memory_length 2000, epsilon 0.73923, explore count:88, exploit count:30,time: 8.981
episode 302, reward -187.0, memory_length 2000, epsilon 0.73849, explore count:98, exploit count:27,time: 9.415
episode 303, reward -145.0, memory_length 2000, epsilon 0.73775, explore count:90, exploit count:19,time: 8.034
episode 304, reward 63.0, memory_length 2000, epsilon 0.73701, explore count:89, exploit count:29,time: 8.984
episode 305, reward 9.0, memory_length 2000, epsilon 0.73627, explore count:111, exploit count:53,time: 12

episode 370, reward -86.0, memory_length 2000, epsilon 0.68992, explore count:99, exploit count:48,time: 11.447
episode 371, reward -154.0, memory_length 2000, epsilon 0.68923, explore count:90, exploit count:34,time: 9.53
episode 372, reward -238.0, memory_length 2000, epsilon 0.68854, explore count:82, exploit count:36,time: 9.166
episode 373, reward -68.0, memory_length 2000, epsilon 0.68785, explore count:91, exploit count:53,time: 11.258
episode 374, reward -288.0, memory_length 2000, epsilon 0.68716, explore count:100, exploit count:47,time: 11.48
episode 375, reward -36.0, memory_length 2000, epsilon 0.68647, explore count:93, exploit count:43,time: 10.586
episode 376, reward -109.0, memory_length 2000, epsilon 0.68579, explore count:83, exploit count:50,time: 10.56
episode 377, reward -253.0, memory_length 2000, epsilon 0.6851, explore count:91, exploit count:45,time: 10.63
episode 378, reward -208.0, memory_length 2000, epsilon 0.68442, explore count:89, exploit count:36,time:

episode 444, reward -275.0, memory_length 2000, epsilon 0.64068, explore count:88, exploit count:46,time: 10.547
episode 445, reward -347.0, memory_length 2000, epsilon 0.64004, explore count:97, exploit count:67,time: 13.244
episode 446, reward -136.0, memory_length 2000, epsilon 0.6394, explore count:93, exploit count:46,time: 10.955
episode 447, reward 4.0, memory_length 2000, epsilon 0.63876, explore count:82, exploit count:55,time: 10.922
episode 448, reward -225.0, memory_length 2000, epsilon 0.63812, explore count:98, exploit count:54,time: 11.977
episode 449, reward -187.0, memory_length 2000, epsilon 0.63748, explore count:90, exploit count:47,time: 10.718
episode 450, reward -302.0, memory_length 2000, epsilon 0.63685, explore count:89, exploit count:56,time: 11.579
episode 451, reward -415.0, memory_length 2000, epsilon 0.63621, explore count:91, exploit count:64,time: 12.487
episode 452, reward -121.0, memory_length 2000, epsilon 0.63557, explore count:84, exploit count:40,

episode 517, reward -446.0, memory_length 2000, epsilon 0.59556, explore count:70, exploit count:69,time: 11.465
episode 518, reward -74.0, memory_length 2000, epsilon 0.59496, explore count:86, exploit count:65,time: 12.145
episode 519, reward -131.0, memory_length 2000, epsilon 0.59437, explore count:100, exploit count:63,time: 13.032
episode 520, reward -218.0, memory_length 2000, epsilon 0.59377, explore count:91, exploit count:59,time: 11.965
episode 521, reward -313.0, memory_length 2000, epsilon 0.59318, explore count:87, exploit count:58,time: 11.548
episode 522, reward -324.0, memory_length 2000, epsilon 0.59258, explore count:80, exploit count:69,time: 12.167
episode 523, reward -61.0, memory_length 2000, epsilon 0.59199, explore count:92, exploit count:75,time: 13.513
episode 524, reward -117.0, memory_length 2000, epsilon 0.5914, explore count:81, exploit count:58,time: 11.2
episode 525, reward -185.0, memory_length 2000, epsilon 0.59081, explore count:98, exploit count:73,

episode 590, reward -454.0, memory_length 2000, epsilon 0.55361, explore count:104, exploit count:90,time: 15.924
episode 591, reward -450.0, memory_length 2000, epsilon 0.55306, explore count:102, exploit count:93,time: 15.923
episode 592, reward -187.0, memory_length 2000, epsilon 0.5525, explore count:108, exploit count:83,time: 15.334
episode 593, reward -720.0, memory_length 2000, epsilon 0.55195, explore count:105, exploit count:84,time: 15.396
episode 594, reward -224.0, memory_length 2000, epsilon 0.5514, explore count:109, exploit count:74,time: 14.62
episode 595, reward -425.0, memory_length 2000, epsilon 0.55085, explore count:94, exploit count:86,time: 14.804
episode 596, reward -453.0, memory_length 2000, epsilon 0.5503, explore count:105, exploit count:69,time: 13.812
episode 597, reward -679.0, memory_length 2000, epsilon 0.54975, explore count:105, exploit count:82,time: 15.055
episode 598, reward -554.0, memory_length 2000, epsilon 0.5492, explore count:107, exploit co

episode 663, reward -591.0, memory_length 2000, epsilon 0.51462, explore count:114, exploit count:78,time: 15.382
episode 664, reward -569.0, memory_length 2000, epsilon 0.5141, explore count:95, exploit count:101,time: 17.489
episode 665, reward -599.0, memory_length 2000, epsilon 0.51359, explore count:103, exploit count:104,time: 20.01
episode 666, reward -482.0, memory_length 2000, epsilon 0.51307, explore count:106, exploit count:100,time: 18.887
episode 667, reward -277.0, memory_length 2000, epsilon 0.51256, explore count:110, exploit count:78,time: 18.012
episode 668, reward -309.0, memory_length 2000, epsilon 0.51205, explore count:94, exploit count:83,time: 14.73
episode 669, reward -441.0, memory_length 2000, epsilon 0.51154, explore count:107, exploit count:92,time: 17.014
episode 670, reward -495.0, memory_length 2000, epsilon 0.51103, explore count:97, exploit count:72,time: 14.538
episode 671, reward -487.0, memory_length 2000, epsilon 0.51051, explore count:102, exploit

In [None]:

plt.figure(0, figsize=(16,8))
plt.plot(summary_df[['EPISODE','REWARD']])
plt.title('REWARDS PER EPISODE')
plt.xlabel("Episode")
plt.ylabel("Reward")

plt.show()

# save plots in saved_plots/ directory
plt.savefig('rewards_episodes.png')

In [None]:
display(summary_df)

In [None]:
# save stuff as pickle
def save_pickle(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

# make directory
if not os.path.exists("saved_pickle_files"):
    os.mkdir("saved_pickle_files")

# save rewards_per_episode
save_pickle(rewards_per_episode, "saved_pickle_files/rewards_per_episode")
save_pickle(summary_df, "saved_pickle_files/summary_per_episode")



### Tracking Convergence

In [None]:

# plot results
with open('saved_pickle_files/summary_per_episode.pkl', 'rb') as f:
    summary_data = pickle.load(f)

    
plt.figure(0, figsize=(16,8))
plt.plot(summary_data[['EPISODE','REWARD']])
plt.title('REWARDS PER EPISODE')
plt.xlabel("Episode")
plt.ylabel("Reward")

plt.show()

# save plots in saved_plots/ directory
plt.savefig('rewards_episodes.png')
  

In [None]:

# plot results
with open('saved_pickle_files/rewards_per_episode.pkl', 'rb') as f:
    rewards_per_episode = pickle.load(f)

plt.plot(list(range(len(rewards_per_episode))), rewards_per_episode)
plt.xlabel("episode number")
plt.ylabel("reward per episode")

plt.show()

# save plots in saved_plots/ directory
plt.savefig('rewards.png')


print("Average reward of last 100 episodes is {0}".format(np.mean(rewards_per_episode[-100:]))) 









In [None]:
#### Plot exploration vs exploitation

In [None]:

plt.figure(0, figsize=(16,8))
plt.plot(summary_data[['EXPLORE_CNT','EXPLOIT_CNT']])
plt.title('EXPLORATION vs EXPLOITATION')
plt.xlabel("Exploration")
plt.ylabel("Exploitation")

plt.show()

# save plots in saved_plots/ directory
plt.savefig('exploration_exploitation.png')

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
import matplotlib.pyplot as plt

max_epsilon = 1.0
min_epsilon = 0.0000001
episode_count = 25000
exp_value = 1/10**(len(str(episode_count))-1)
print(f'exp:{exp_value}')
time = np.arange(0,episode_count)
epsilon = []
for i in range(0,episode_count):
    epsilon.append(min_epsilon + (max_epsilon - min_epsilon) * np.exp(-exp_value*i))
    

plt.plot(time, epsilon)
plt.show()

In [None]:
# From starter code
total_episodes = 10000
time = np.arange(0,total_episodes)
epsilon = []
for i in range(0,total_episodes):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))
    
plt.plot(time, epsilon)
plt.show()

In [None]:
max_epsilon = 1.0
min_epsilon = 0.001
episode_count = 50
exp_value = 1/10**(len(str(episode_count))-1)
time = np.arange(0,episode_count)
epsilon = []
for i in range(0,episode_count):
    epsilon.append(min_epsilon + (max_epsilon - min_epsilon) * np.exp(-exp_value*i))
    

plt.plot(time, epsilon)
plt.show()