In [1]:
import random
import gym 
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D, Input
from tensorflow.keras.optimizers import Adam

# Keep the seed value itself: random.seed() returns None, so assigning its
# result would leave `seed` unusable for anything that wants to reuse it.
seed = 12345
# Seed the stdlib generator (used for the epsilon draw and replay sampling)
# and NumPy's global generator (used to pick random exploratory actions).
random.seed(seed)
np.random.seed(seed)
# NOTE(review): TensorFlow's weight initialisation draws from its own
# generator, which is not seeded here, so network initialisation still
# varies between runs.



# Defining the DQN class
class DQN:
    """Deep Q-Network agent with a replay buffer and a hard-updated target network.

    The agent owns two identically shaped Keras models: ``main_network`` is
    trained, ``target_network`` supplies the bootstrap values used to build
    the training targets and is refreshed by ``update_target_network``.
    """

    def __init__(self, state_size, action_size, buffer_size=5000, gamma=0.9, epsilon=0.1, update_rate=1000,
                 net_arc=(64, 64)):
        """Create the agent and both networks.

        Args:
            state_size: Shape tuple of a single observation, e.g. ``(4,)``.
            action_size: Number of discrete actions.
            buffer_size: Maximum number of transitions kept for replay.
            gamma: Discount factor applied to the bootstrapped value.
            epsilon: Probability of taking a random action.
            update_rate: Number of steps between hard target updates. The
                class only stores this value; the caller decides when to
                invoke ``update_target_network``.
            net_arc: Widths of the hidden layers, in order.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        # maxlen makes the deque discard the oldest transition on overflow.
        self.replay_buffer = deque(maxlen=buffer_size)
        self.gamma = gamma
        self.epsilon = epsilon
        self.update_rate = update_rate
        # Copy so later mutation of the caller's sequence cannot affect us.
        self.net_arc = list(net_arc)
        self.main_network = self.build_network(self.net_arc)
        self.target_network = self.build_network(self.net_arc)
        # Start both networks from identical weights.
        self.target_network.set_weights(self.main_network.get_weights())

    def build_network(self, net_arc=None):
        """Build and compile a fully connected Q-network.

        Args:
            net_arc: Widths of the hidden layers. When omitted, the
                architecture given to the constructor is used, falling back
                to two layers of 64 units.

        Returns:
            A compiled ``Sequential`` model mapping a batch of states to one
            linear output per action.
        """
        if net_arc is None:
            net_arc = getattr(self, "net_arc", (64, 64))
        model = Sequential()
        model.add(Input(shape=tuple(self.state_size)))
        for width in net_arc:
            model.add(Dense(width, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # Compiled with MSE so ``fit`` can regress Q-values onto TD targets.
        model.compile(loss='mse', optimizer=Adam())
        return model

    def _as_batch(self, states):
        """Stack a sequence of observations into a ``(n, *state_size)`` array."""
        batch_shape = (len(states),) + tuple(self.state_size)
        return np.asarray(states, dtype=np.float32).reshape(batch_shape)

    def store_transitions(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        When the buffer was created with ``maxlen`` (as the constructor
        does), the oldest transition is dropped automatically once full.
        """
        self.replay_buffer.append((state, action, reward, next_state, done))

    def epsilon_greedy(self, state):
        """Pick an action: random with probability ``epsilon``, else greedy.

        Args:
            state: A single observation matching ``state_size``.

        Returns:
            The chosen action index as an ``int``.
        """
        if random.uniform(0, 1) < self.epsilon:
            return int(np.random.randint(self.action_size))
        q_values = self.main_network.predict(self._as_batch([state]), verbose=0)
        return int(np.argmax(q_values[0]))

    def train(self, batch_size):
        """Run one gradient update on a minibatch sampled from the buffer.

        For each sampled transition the target for the taken action is
        ``reward`` if the transition is terminal, otherwise
        ``reward + gamma * max_a Q_target(next_state, a)``. Targets for the
        other actions are the main network's current predictions, so they
        contribute no error. The whole minibatch is evaluated and fitted in
        one pass instead of one ``predict``/``fit`` round trip per sample.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: From ``random.sample`` if the buffer holds fewer than
                ``batch_size`` transitions.
        """
        minibatch = random.sample(self.replay_buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        states = self._as_batch(states)
        next_states = self._as_batch(next_states)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        # 1.0 for non-terminal transitions, 0.0 for terminal ones.
        not_done = 1.0 - np.asarray(dones, dtype=np.float32)

        # Bootstrap from the largest Q-value of the next state (its value),
        # not from the index of the best action.
        next_q = np.asarray(self.target_network.predict(next_states, verbose=0))
        targets = rewards + self.gamma * not_done * next_q.max(axis=1)

        # np.array copies, so the prediction buffer is safe to overwrite.
        q_values = np.array(self.main_network.predict(states, verbose=0))
        q_values[np.arange(batch_size), actions] = targets

        self.main_network.fit(states, q_values, batch_size=batch_size, epochs=1, verbose=0)

    def update_target_network(self):
        """Hard update: copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())

        

# Train a DQN agent on CartPole-v1 and record the undiscounted return of
# every episode in `return_array`.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape
action_size = env.action_space.n

num_episodes = 500
num_timesteps = 200  # per-episode step cap enforced by the inner loop
batch_size = 8
dqn = DQN(state_size, action_size)
done = False
time_step = 0  # global step counter across all episodes

return_array = np.zeros(num_episodes)
for i in range(num_episodes):
    state = env.reset()
    for t in range(num_timesteps):
        #env.render()
        time_step += 1
        # Hard-sync the target network every `update_rate` global steps.
        if time_step % dqn.update_rate == 0:
            dqn.update_target_network()
        action = dqn.epsilon_greedy(state)
        next_state, reward, done, _ = env.step(action)
        dqn.store_transitions(state, action, reward, next_state, done)
        state = next_state
        return_array[i] += reward
        # Start learning once the buffer holds more than one minibatch.
        if len(dqn.replay_buffer) > batch_size:
            dqn.train(batch_size)
        if done:
            break
    # Report after the inner loop so episodes that hit the step cap without
    # terminating are printed as well.
    print('episode: ',i, ', ','Return: ', return_array[i])


episode:  0 ,  Return:  11.0
episode:  1 ,  Return:  10.0
episode:  2 ,  Return:  9.0
episode:  3 ,  Return:  13.0
episode:  4 ,  Return:  10.0
episode:  5 ,  Return:  9.0
episode:  6 ,  Return:  10.0
episode:  7 ,  Return:  11.0
episode:  8 ,  Return:  9.0
episode:  9 ,  Return:  10.0
episode:  10 ,  Return:  10.0
episode:  11 ,  Return:  13.0
episode:  12 ,  Return:  13.0
episode:  13 ,  Return:  9.0
episode:  14 ,  Return:  11.0
episode:  15 ,  Return:  11.0
episode:  16 ,  Return:  10.0
episode:  17 ,  Return:  9.0
episode:  18 ,  Return:  14.0
episode:  19 ,  Return:  12.0


KeyboardInterrupt: 