<a href="https://colab.research.google.com/github/Gilevich/DeepLearningWithPhil/blob/master/DQN_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
import random
import numpy as np
import gym
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

In [72]:
class ReplayBuffer:
  """Fixed-capacity ring buffer of (state, action, reward, next state, mask) rows.

  Once `max_size` transitions have been stored, new transitions overwrite
  the oldest slots. The terminal column holds `1 - int(done)`, i.e. 1.0 for
  a non-terminal step and 0.0 for a terminal one.
  """

  def __init__(self, max_size, input_dims):
    """Allocate zero-filled storage for `max_size` transitions.

    Args:
      max_size: number of transitions the buffer can hold.
      input_dims: iterable with the shape of a single observation.
    """
    self.mem_size = max_size
    self.mem_cntr = 0  # total number of transitions ever stored

    observation_block = (self.mem_size, *input_dims)
    self.state_memory = np.zeros(observation_block, dtype=np.float32)
    self.new_state_memory = np.zeros(observation_block, dtype=np.float32)
    self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
    self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
    self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

  def store_transition(self, state, action, reward, state_, done):
    """Write one transition into the next slot, wrapping around when full."""
    slot = self.mem_cntr % self.mem_size
    self.state_memory[slot] = state
    self.new_state_memory[slot] = state_
    self.reward_memory[slot] = reward
    self.action_memory[slot] = action
    # Stored inverted so it can be used directly as a multiplicative mask.
    self.terminal_memory[slot] = 1 - int(done)
    self.mem_cntr += 1

  def sample_buffer(self, batch_size):
    """Draw `batch_size` distinct stored transitions uniformly at random.

    Returns:
      Tuple (states, actions, rewards, next states, terminal mask), each
      indexed by the same randomly chosen rows.
    """
    # Only the slots that have actually been written are eligible.
    filled = min(self.mem_cntr, self.mem_size)
    picks = np.random.choice(filled, batch_size, replace=False)

    columns = (
        self.state_memory,
        self.action_memory,
        self.reward_memory,
        self.new_state_memory,
        self.terminal_memory,
    )
    return tuple(column[picks] for column in columns)

  

In [9]:
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
  """Create and compile a three-layer dense Q-network.

  Args:
    lr: learning rate passed to the Adam optimizer.
    n_actions: width of the linear output layer.
    input_dims: accepted but not used by this function.
    fc1_dims: width of the first ReLU hidden layer.
    fc2_dims: width of the second ReLU hidden layer.

  Returns:
    The Sequential model, compiled with a mean-squared-error loss.
  """
  first_hidden = keras.layers.Dense(fc1_dims, activation='relu')
  second_hidden = keras.layers.Dense(fc2_dims, activation='relu')
  q_output = keras.layers.Dense(n_actions, activation=None)

  network = keras.Sequential([first_hidden, second_hidden, q_output])
  network.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=lr))
  return network

In [69]:
class Agent():
  """Epsilon-greedy DQN agent backed by a replay buffer and one Q-network.

  The same network (`q_eval`) is used both to pick actions and to compute
  the bootstrap targets; there is no separate target network.
  """

  def __init__(self, lr, gamma, n_actions, epsilon, batch_size, input_dims,
               epsilon_dec=1e-3, epsilon_end=0.01, mem_size=1000000, fname='dqn_model.h5'):
    """Set up hyper-parameters, the replay buffer and the Q-network.

    Args:
      lr: learning rate forwarded to `build_dqn`.
      gamma: discount factor applied to the next-state value.
      n_actions: number of discrete actions.
      epsilon: initial exploration probability.
      batch_size: number of transitions sampled per `learn` call.
      input_dims: shape of a single observation.
      epsilon_dec: amount subtracted from epsilon after every learning step.
      epsilon_end: lower bound for epsilon.
      mem_size: capacity of the replay buffer.
      fname: path used by `save_model` / `load_model`.
    """
    self.action_space = list(range(n_actions))
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_dec = epsilon_dec
    self.eps_min = epsilon_end
    self.batch_size = batch_size
    self.model_file = fname
    self.memory = ReplayBuffer(mem_size, input_dims)
    self.q_eval = build_dqn(lr, n_actions, input_dims, 256, 256)

  def store_transition(self, state, action, reward, new_state, done):
    """Forward one transition to the replay buffer."""
    self.memory.store_transition(state, action, reward, new_state, done)

  def choose_action(self, observation):
    """Pick an action epsilon-greedily for a single observation.

    With probability `epsilon` a random action from `action_space` is
    returned; otherwise the index of the largest predicted Q-value.
    """
    if np.random.random() < self.epsilon:
      return np.random.choice(self.action_space)

    # The network expects a batch, so wrap the observation in a batch of one.
    state = np.array([observation])
    q_values = self.q_eval.predict(state)
    return np.argmax(q_values)

  # Original (misspelled) public name, kept so existing callers keep working.
  choise_action = choose_action

  def learn(self):
    """Run one training step on a sampled batch and decay epsilon.

    Does nothing until at least `batch_size` transitions have been stored.
    """
    if self.memory.mem_cntr < self.batch_size:
      return

    # The last element is the buffer's terminal column: 1.0 for non-terminal
    # transitions and 0.0 for terminal ones (it stores `1 - int(done)`).
    states, actions, rewards, states_, not_done = \
          self.memory.sample_buffer(self.batch_size)

    q_eval = self.q_eval.predict(states)
    q_next = self.q_eval.predict(states_)

    # Start from the current predictions so only the taken action's entry
    # contributes to the loss.
    q_target = np.copy(q_eval)
    batch_index = np.arange(self.batch_size, dtype=np.int32)

    # The mask zeroes the bootstrap term for terminal transitions.
    q_target[batch_index, actions] = \
          rewards + self.gamma * np.max(q_next, axis=1) * not_done

    self.q_eval.train_on_batch(states, q_target)

    # Clamp after subtracting so epsilon never drops below eps_min.
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

  def save_model(self):
    """Save the Q-network to `model_file`."""
    self.q_eval.save(self.model_file)

  def load_model(self):
    """Replace the Q-network with the model stored at `model_file`."""
    self.q_eval = load_model(self.model_file)


In [74]:
def main():
  """Train a DQN agent on CartPole-v0 and print per-episode statistics.

  Runs 500 episodes. After each one, prints the episode score, the mean
  score over the last 100 episodes and the agent's current epsilon.
  """
  # Switch TensorFlow to TF1-style graph execution before any model is built.
  tf.compat.v1.disable_eager_execution()
  env = gym.make("CartPole-v0")
  lr = 0.001
  n_games = 500
  agent = Agent(gamma=0.99, epsilon=1.0, lr=lr,
                input_dims=env.observation_space.shape,
                n_actions=env.action_space.n, mem_size=1000000, batch_size=64,
                epsilon_end=0.01)
  scores = []

  for i in range(n_games):
    done = False
    score = 0
    # NOTE(review): reset() returning a bare observation and step() returning
    # four values is the classic Gym API; newer releases differ - confirm the
    # installed version.
    observation = env.reset()
    while not done:
      action = agent.choise_action(observation)
      observation_, reward, done, info = env.step(action)
      score += reward
      agent.store_transition(observation, action, reward, observation_, done)
      observation = observation_
      # One learning step per environment step.
      agent.learn()
    scores.append(score)

    avg_score = np.mean(scores[-100:])
    print('episode: ', i, 'score %.2f' % score, 
          'average_score %.2f' % avg_score,
          'epsilon %.2f' % agent.epsilon)



In [75]:
# Run the full training loop; progress is printed once per episode.
main()

episode:  0 score 10.00 average_score 10.00 epsilon 1.00
episode:  1 score 21.00 average_score 15.50 epsilon 1.00
episode:  2 score 9.00 average_score 13.33 epsilon 1.00
episode:  3 score 19.00 average_score 14.75 epsilon 1.00
episode:  4 score 15.00 average_score 14.80 epsilon 0.99
episode:  5 score 20.00 average_score 15.67 epsilon 0.97
episode:  6 score 25.00 average_score 17.00 epsilon 0.94
episode:  7 score 23.00 average_score 17.75 epsilon 0.92
episode:  8 score 16.00 average_score 17.56 epsilon 0.90
episode:  9 score 17.00 average_score 17.50 epsilon 0.89
episode:  10 score 16.00 average_score 17.36 epsilon 0.87
episode:  11 score 30.00 average_score 18.42 epsilon 0.84
episode:  12 score 10.00 average_score 17.77 epsilon 0.83
episode:  13 score 13.00 average_score 17.43 epsilon 0.82
episode:  14 score 13.00 average_score 17.13 epsilon 0.81
episode:  15 score 19.00 average_score 17.25 epsilon 0.79
episode:  16 score 40.00 average_score 18.59 epsilon 0.75
episode:  17 score 13.00 