In [1]:
import random

import gymnasium as gym
import numpy as np

import tensorflow as tf

from keras import Model
from keras.layers import Dense

In [2]:
class DQN(Model):
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()

        self.replay_memory = []

        self.dense1 = Dense(48, activation="relu", input_dim=state_size)
        self.dense2 = Dense(48, activation="relu")
        self.dense2 = Dense(48, activation="relu")
        self.dense3 = Dense(action_size, activation="softmax")

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.optimizer_side = tf.keras.optimizers.Adam(learning_rate=0.0004)

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        x = self.dense3(x)

        return x
    
    def remember(self, state, action, reward, next_state, done):
        self.replay_memory.append((state, action, reward, next_state, done))

In [3]:
def update_model(model: DQN):
    # 리플레이 버퍼 크기가 작으면 업데이트하지 않음
    if len(model.replay_memory) < 1000:
        return
    
    # 너무 많으면 리플레이 버퍼 pop
    if len(model.replay_memory) > 10000:
        model.replay_memory.pop(0)

    # 메모리에서 랜덤 샘플링
    sample_size = 64
    samples = random.sample(model.replay_memory, sample_size)

    # 분할
    states, actions, rewards, next_states, dones = zip(*samples)
    # states, actions, rewards, next_states, dones = zip(*model.replay_memory)

    # numpy 배열로 변환
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    dones = np.array(dones)

    # 모델 예측과 타겟 값 계산
    targets = model.call(states).numpy()
    next_q_values = model.call(next_states).numpy()

    targets[np.arange(sample_size), actions] = rewards + 0.95 * np.max(next_q_values, axis=1) * (1 - dones)

    # 모델 업데이트
    with tf.GradientTape() as tape:
        q_values = model.call(states)
        loss = tf.keras.losses.mean_squared_error(targets, q_values)

    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))


    left_index = np.where(actions != 0)
    targets_left = model.call(states).numpy()
    next_q_values_left = model.call(next_states).numpy()[left_index]

    targets_left[left_index[0], actions[left_index]] = rewards[left_index] + 0.95 * np.max(next_q_values_left, axis=1) * (1 - dones[left_index])

    with tf.GradientTape() as tape:
        q_values = model.call(states)
        loss = tf.keras.losses.mean_squared_error(targets_left, q_values)

    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer_side.apply_gradients(zip(gradients, model.trainable_variables))


    right_index = np.where(actions != 8)
    targets_right = model.call(states).numpy()
    next_q_values_right = model.call(next_states).numpy()[right_index]

    targets_right[right_index[0], actions[right_index]] = rewards[right_index] + 0.95 * np.max(next_q_values_right, axis=1) * (1 - dones[right_index])

    with tf.GradientTape() as tape:
        q_values = model.call(states)
        loss = tf.keras.losses.mean_squared_error(targets_right, q_values)

    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer_side.apply_gradients(zip(gradients, model.trainable_variables))
    

In [4]:
# env = gym.make("Pendulum-v1", g=9.81, render_mode="human")
env = gym.make("Pendulum-v1", g=9.81)

model = DQN(3, 9)


for episode in range(200):
    state, info = env.reset()
    terminated = False
    truncated = False
    step = 0

    rewards = []
    before_reward = None

    while not terminated and step < 300:

        # 모델로 행동 예측
        action = model.call(np.array([state])).numpy()[0]
        action = np.random.choice(9, p=action)
        # 행동 실행
        next_state, reward, terminated, truncated, info = env.step((action/2-2,))
    

        # 리플레이 버퍼에 기억
        model.remember(state, action, reward, next_state, terminated)
        
        # 모델 업데이트
        update_model(model)

        state = next_state
        step += 1

        rewards.append(reward)
    
    print("Episode: {}, Steps: {}, Score: {}".format(episode, step, sum(rewards) / len(rewards)))

env.close()

Episode: 0, Steps: 300, Score: -8.13066480673677
Episode: 1, Steps: 300, Score: -6.041336974261138
Episode: 2, Steps: 300, Score: -6.239332050959648
Episode: 3, Steps: 300, Score: -8.056297408954602
Episode: 4, Steps: 300, Score: -5.779184215683671
Episode: 5, Steps: 300, Score: -7.9023308182630885
Episode: 6, Steps: 300, Score: -6.125682195612535
Episode: 7, Steps: 300, Score: -7.883861765048579
Episode: 8, Steps: 300, Score: -6.531899033140856
Episode: 9, Steps: 300, Score: -6.338818527358223


KeyboardInterrupt: 

In [18]:
import numpy as np

# Generate a 2-dimensional array of shape (3, 4) with random floats between 0 and 1
random_array = np.random.randint(9)
print(random_array)

7
