## Necessary imports

In [1]:
import numpy as np
import random
from collections import deque # something like extended version of list
import gym
from keras import Model, Sequential
from keras.layers import Dense, Embedding, Reshape
from keras.optimizers import Adam

2022-11-13 21:11:05.761154: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Creating the environment

In [2]:
# Build the Taxi-v3 environment and keep the object stored on its `.env` attribute.
# NOTE(review): presumably this bypasses the outer wrapper gym.make adds (e.g. the
# episode step limit) — confirm against the installed gym version.
environment = gym.make("Taxi-v3").env
# Draw the environment's current state (the text grid shown in the cell output).
environment.render()

+---------+
|[34;1mR[0m: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
# Report the sizes of the discrete observation (state) and action spaces;
# the Agent uses the same two values to size its network.
print("No.of states:\t",environment.observation_space.n)
print("No.of actions:\t",environment.action_space.n)

No.of states:	 500
No.of actions:	 6


## Creating the deep learning model

In [4]:
class Agent:
    """Deep Q-Network (DQN) agent with experience replay and a target network.

    Two identically built Keras models are kept: ``q_network`` is the one that
    is fitted in :meth:`retrain`, while ``target_network`` supplies the
    bootstrap value for non-terminal transitions. :meth:`align_target_model`
    copies the q_network weights into the target network.
    """

    def __init__(self, environment, optimizer):
        """Set up hyper-parameters, the replay buffer and both networks.

        Args:
            environment: object exposing ``observation_space.n`` and an
                ``action_space`` with ``.n`` and ``.sample()``.
            optimizer: Keras optimizer handed to ``model.compile``.
        """
        self._state_size = environment.observation_space.n
        self._action_size = environment.action_space.n
        self._optimizer = optimizer
        # Kept so that exploration samples from the environment this agent was
        # built for, rather than from whatever global `environment` exists.
        self._action_space = environment.action_space

        # Replay buffer: once 2000 transitions are stored, the oldest are dropped.
        self.experience_replay = deque(maxlen=2000)

        # Discount factor applied to the bootstrapped next-state value.
        self.gamma = 0.6
        # Exploration rate: probability of taking a random action in `action`.
        self.epsilon = 0.1

        # Build both networks, then start them from identical weights.
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model()

    def align_target_model(self):
        """Copy the current q_network weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition tuple to the replay buffer."""
        self.experience_replay.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        """Build and compile the network that maps a state id to per-action values.

        Returns:
            A compiled Keras ``Sequential`` model with ``self._action_size``
            linear outputs, MSE loss and the optimizer given to the constructor.
        """
        model = Sequential()
        # Each of the `_state_size` integer state ids gets a 10-dimensional embedding.
        model.add(Embedding(self._state_size, 10, input_length = 1))
        # Drop the length-1 sequence axis: (batch, 1, 10) -> (batch, 10).
        model.add(Reshape((10,)))
        # Two hidden layers of 50 ReLU units each.
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        # One linear output per action.
        model.add(Dense(self._action_size, activation='linear'))
        model.compile(loss='mse',optimizer=self._optimizer)

        return model

    def action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: network input for a single state (the training loop passes
                an array reshaped to ``[1, 1]``).

        Returns:
            A sample from the environment's action space with probability
            ``epsilon``; otherwise the index of the largest q_network output.
        """
        if np.random.rand() <= self.epsilon:
            return self._action_space.sample()

        # verbose=0: recent Keras versions otherwise print a progress bar for
        # every single predict call.
        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def retrain(self, batch_size):
        """Fit the q_network on ``batch_size`` transitions sampled from the buffer.

        Each sampled transition is processed on its own: the current prediction
        for ``state`` is taken as the target, the entry of the action that was
        taken is overwritten with ``reward`` (terminal) or
        ``reward + gamma * max(target_network(next_state))`` (non-terminal),
        and the q_network is fitted for one epoch on that single example.

        Does nothing when fewer than ``batch_size`` transitions are stored.

        Args:
            batch_size: number of transitions to sample without replacement.
        """
        if len(self.experience_replay) < batch_size:
            # random.sample would raise ValueError; there is nothing to learn from yet.
            return

        minibatch = random.sample(self.experience_replay, batch_size)
        # NOTE(review): one predict/fit round-trip per transition is slow, but
        # batching them would change the update dynamics, so it is kept as is.
        for state, action, reward, next_state, terminated in minibatch:
            target = self.q_network.predict(state, verbose=0)

            if terminated:
                # No future value after a terminal transition.
                target[0][action] = reward
            else:
                next_q_values = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.gamma * np.amax(next_q_values)

            self.q_network.fit(state, target, epochs=1, verbose=0)

In [5]:
# Adam optimizer with a learning rate of 0.01; the Agent passes it to model.compile
optimizer = Adam(learning_rate=0.01)
# Build the agent; its constructor creates the Q-network and the target network
# and starts them from identical weights
agent = Agent(environment, optimizer)
# Print the layer-by-layer summary (output shapes, parameter counts) of the Q-network
agent.q_network.summary()

2022-11-13 21:11:19.561447: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             5000      
                                                                 
 reshape (Reshape)           (None, 10)                0         
                                                                 
 dense (Dense)               (None, 50)                550       
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 6)                 306       
                                                                 
Total params: 8,406
Trainable params: 8,406
Non-trainable params: 0
_________________________________________________________________


## Training the model

In [6]:
# Number of transitions sampled from the replay buffer on every retrain call
batch_size = 32
num_of_episodes = 100
# Upper bound on the number of steps taken in a single episode
timeSteps_per_episode = 100

for e in range(0, num_of_episodes):
    # Start every episode from a freshly reset environment.
    reset_result = environment.reset()
    # gym >= 0.26 returns (observation, info); older versions return the
    # observation on its own.
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    # The network expects a 2-D input, so wrap the state id as shape (1, 1).
    state = np.reshape(state, [1, 1])

    for timeStep in range(timeSteps_per_episode):
        action = agent.action(state)

        # Only the first three fields are needed here, which keeps this working
        # with both the 4-tuple (obs, reward, done, info) and the 5-tuple
        # (obs, reward, terminated, truncated, info) step APIs.
        step_result = environment.step(action)
        next_state, reward, terminated = step_result[:3]
        info = step_result[-1]
        next_state = np.reshape(next_state, [1, 1])

        # Remember the transition, then move on to the next state.
        agent.store(state, action, reward, next_state, terminated)
        state = next_state

        # A terminal state ends the episode immediately.
        if terminated:
            break

        # Train once more transitions are stored than one batch needs.
        if len(agent.experience_replay) > batch_size:
            agent.retrain(batch_size)

    # Sync the target network at the end of EVERY episode. Syncing only on
    # termination left the target network at its initial random weights
    # whenever the step cap was hit first, which is the common case early on.
    agent.align_target_model()

