<a href="https://colab.research.google.com/github/Nicohim87/DeepLearning/blob/main/Pertemuan11/session11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gymnasium

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0


In [4]:
import numpy as np
import gymnasium as gym
import tensorflow as tf
from tensorflow import keras
from collections import deque

In [10]:
GAMMA = 0.99
LEARNING_RATE = 1e-3
MEMORY_SIZE = int(1e6)
BATCH_SIZE = 64
EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

In [28]:
class DQNAgent:
  def __init__(self, observation_space, action_space):
    self.exploration_rate = EXPLORATION_MAX
    self.action_space = action_space
    self.memory = deque(maxlen=MEMORY_SIZE)
    self.model = keras.Sequential([
        keras.layers.Input((observation_space,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(self.action_space, activation='linear')
    ])
    self.model.compile(
        optimizer=keras.optimizers.Adam(learning_rate = LEARNING_RATE),
        loss='mse'
    )

  def remember(self, state, action, reward, next_state, done):
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    if np.random.rand() < self.exploration_rate:
      return np.random.randint(self.action_space)
    else:
      state = state.reshape(1, -1)
      q_values = self.model.predict(state)
      return np.argmax(q_values[0])

  def experience_replay(self):
    if len(self.memory) < BATCH_SIZE:
      return
    batch = np.random.choice(len(self.memory), BATCH_SIZE, replace=False)
    states, q_targets = [], []
    for i in batch:
      state, action, reward, next_state, done = self.memory[i]
      q_update = reward
      if not done:
        next_state = next_state.reshape(1, -1)
        q_update = (reward + GAMMA * np.amax(self.model.predict(next_state, verbose=0)[0]))

      state = state.reshape(1, -1)
      q_values = self.model.predict(state, verbose=0)
      q_values[0][action] = q_update
      states.append(state[0])
      q_targets.append(q_values[0])

    self.model.fit(np.array(states), np.array(q_targets), verbose=0, batch_size = BATCH_SIZE)
    self.exploration_rate *= EXPLORATION_DECAY
    self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

In [29]:
env = gym.make('CartPole-v1')
OBSERVATION_SPACE = env.observation_space.shape[0]
ACTION_SPACE = env.action_space.n

agent = DQNAgent(OBSERVATION_SPACE, ACTION_SPACE)
episode = 0

for _ in range(10):
    episode += 1
    state, _ = env.reset()
    state = np.array(state)
    state = np.reshape(state, [1, OBSERVATION_SPACE])
    step = 0
    while True:
        step += 1
        env.render()
        action = agent.act(state)
        next_state, reward, done, _, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)

        state = next_state

        if done:
            print(f"Episode #{episode}, Step-{step}")
            break

        agent.experience_replay()

env.close()

  gym.logger.warn(


Episode #1, Step-12
Episode #2, Step-15
Episode #3, Step-18
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Episode #4, Step-56
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


KeyboardInterrupt: 