In [None]:
import random

import gymnasium as gym
import numpy as np

import tensorflow as tf

from keras import Model
from keras.layers import Dense

import matplotlib.pyplot as plt

import os    
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [None]:
class DQN(Model):
    """Small fully connected network that also owns its replay memory.

    The network maps a batch of state vectors to one output per discrete
    action through two hidden ReLU layers. Transitions collected by the agent
    are stored on the instance in ``replay_memory`` and an Adam optimizer is
    kept in ``optimizer`` so a training routine can use both directly.

    Args:
        state_size: Number of features in one state vector. The layers infer
            their input width on first call, so this value is only recorded
            on the instance.
        action_size: Number of discrete actions, i.e. the output width.
        hidden_units: Width of each of the two hidden layers.
        learning_rate: Learning rate of the Adam optimizer.
        output_activation: Activation of the output layer. The default
            ``"softmax"`` preserves the existing behaviour, where each output
            row is a probability vector (values in [0, 1] summing to one), so
            code that samples actions from the output keeps working.
            NOTE(review): a softmax head cannot represent unbounded action
            values; pass ``"linear"`` when the outputs are regressed towards
            Q-learning targets, and pick actions with ``argmax`` in that case.
    """

    def __init__(
        self,
        state_size,
        action_size,
        *,
        hidden_units=48,
        learning_rate=0.0005,
        output_activation="softmax",
    ):
        super().__init__()

        self.state_size = state_size
        self.action_size = action_size

        # Transitions as (state, action, reward, next_state, done) tuples,
        # oldest first.
        self.replay_memory = []

        # No `input_dim` here: in a subclassed model the layers are built from
        # the first batch they see, and recent Keras versions warn when that
        # argument is passed to a layer.
        self.dense1 = Dense(hidden_units, activation="relu")
        self.dense2 = Dense(hidden_units, activation="relu")
        self.dense3 = Dense(action_size, activation=output_activation)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    def call(self, x):
        """Return the network output for a batch of states.

        Args:
            x: Batch of states, one row per state.

        Returns:
            Tensor with one row per input state and ``action_size`` columns.
        """
        x = self.dense1(x)
        x = self.dense2(x)
        return self.dense3(x)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the end of the replay memory."""
        self.replay_memory.append((state, action, reward, next_state, done))

In [None]:
def update_model(
    model: "DQN",
    batch_size: int = 64,
    gamma: float = 0.95,
    min_buffer_size: int = 1000,
    max_buffer_size: int = 20000,
):
    """Run one Q-learning gradient step on a random minibatch of replay memory.

    The oldest transitions are evicted first so the buffer never exceeds
    ``max_buffer_size``. Training is skipped (the function returns without
    touching the network) until the buffer holds at least
    ``max(min_buffer_size, batch_size)`` transitions.

    For every sampled transition the regression target equals the current
    network output, except for the action that was taken, whose entry becomes
    ``reward + gamma * max(next output) * (1 - done)``.

    Args:
        model: Network exposing ``replay_memory`` (list of
            ``(state, action, reward, next_state, done)`` tuples),
            ``call``, ``trainable_variables`` and ``optimizer``.
        batch_size: Number of transitions sampled without replacement.
        gamma: Discount factor applied to the bootstrapped next-state value.
        min_buffer_size: Minimum number of stored transitions before any
            update is performed.
        max_buffer_size: Maximum number of transitions kept in memory.

    Returns:
        None. The replay memory and the model weights are updated in place.
    """
    memory = model.replay_memory

    # Evict oldest transitions until the cap is respected. A loop (rather than
    # a single pop) keeps the buffer bounded even if several transitions were
    # stored since the previous call.
    capacity = max(int(max_buffer_size), 0)
    while len(memory) > capacity:
        memory.pop(0)

    # Do not train on too little experience; also guarantees that sampling
    # `batch_size` items without replacement is possible.
    if len(memory) < max(min_buffer_size, batch_size):
        return

    # Random minibatch, split into one sequence per field.
    states, actions, rewards, next_states, dones = zip(*random.sample(memory, batch_size))

    states = np.asarray(states)
    next_states = np.asarray(next_states)
    actions = np.asarray(actions, dtype=np.int64)
    rewards = np.asarray(rewards, dtype=np.float64)
    not_done = 1.0 - np.asarray(dones, dtype=np.float64)

    # Bootstrapped value of the best next action; zeroed for terminal steps.
    next_q_values = model.call(next_states).numpy()  # type: ignore
    td_targets = rewards + gamma * np.max(next_q_values, axis=1) * not_done

    with tf.GradientTape() as tape:
        q_values = model.call(states)

        # Start from a copy of the current predictions so that only the
        # entry of the taken action contributes to the error. `.numpy()`
        # yields a constant, so no gradient flows through the targets.
        targets = q_values.numpy()  # type: ignore
        targets[np.arange(batch_size), actions] = td_targets

        # Reduce to a scalar explicitly: differentiating the per-sample vector
        # would implicitly sum it and tie the gradient scale to `batch_size`.
        loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(targets, q_values))

    gradients = tape.gradient(loss, model.trainable_variables)
    model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
def draw_qvalue(model, episode, output_dir="model1_image", clim=(0, 1)):
    """Save one heat map per action of the network output over a state grid.

    The grid covers theta in [0, 2*pi) and velocity in [-4, 4), both in steps
    of 0.1. Each grid point is fed to the model as
    ``[cos(theta), sin(theta), velocity]``. The figure is written to
    ``<output_dir>/episode_<episode>.png`` and then closed.

    Args:
        model: Object whose ``call(states)`` returns a tensor (with a
            ``.numpy()`` method) holding one row per state and one column per
            action.
        episode: Value shown in the figure title and used in the file name.
        output_dir: Directory for the PNG; created if it does not exist.
        clim: ``(min, max)`` colour limits shared by all heat maps. The
            default matches outputs that lie in [0, 1].

    Returns:
        None.
    """
    thetas = np.arange(0, 2 * np.pi, 0.1)
    velocities = np.arange(-4, 4, 0.1)

    # "ij" indexing makes theta the slow axis and velocity the fast one, which
    # is the row order the reshape of the model output below relies on.
    theta_grid, velocity_grid = np.meshgrid(thetas, velocities, indexing="ij")
    states = (
        np.stack([np.cos(theta_grid), np.sin(theta_grid), velocity_grid], axis=-1)
        .reshape(-1, 3)
        .astype(np.float32)
    )

    outputs = np.asarray(model.call(states).numpy())
    n_actions = outputs.shape[-1]
    values = outputs.reshape(len(thetas), len(velocities), n_actions)

    # Five panels per row; as many rows as the action count needs.
    n_cols = 5
    n_rows = max(1, (n_actions + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False)
    fig.suptitle("Episode: {}".format(episode))

    # Tick every 10th grid cell and label it with the grid value at that cell,
    # so the labels always agree with the plotted data.
    x_ticks = np.arange(0, len(velocities), 10)
    y_ticks = np.arange(0, len(thetas), 10)
    x_labels = np.round(velocities[x_ticks]).astype(int)
    y_labels = np.round(thetas[y_ticks], 1)

    for action, ax in enumerate(axes.ravel()):
        if action >= n_actions:
            ax.axis("off")  # unused panel in the last row
            continue
        ax.set_title("Action: {}".format(action))
        ax.imshow(values[:, :, action], cmap="hot", interpolation="nearest", clim=clim)
        ax.set_xticks(x_ticks)
        ax.set_yticks(y_ticks)
        ax.set_xticklabels(x_labels)
        ax.set_yticklabels(y_labels)
        ax.set_xlabel("Velocity")
        ax.set_ylabel("Theta")
        ax.invert_yaxis()  # theta increases upwards

    fig.tight_layout()

    os.makedirs(output_dir, exist_ok=True)
    fig.savefig(os.path.join(output_dir, "episode_{}.png".format(episode)))

    # Release the figure; this function is called repeatedly during training
    # and unclosed figures accumulate in memory.
    plt.close(fig)

In [None]:
# Training configuration.
RENDER_MODE = None  # set to "human" to watch the pendulum while training
N_ACTIONS = 9  # number of discrete torque levels
N_EPISODES = 100
MAX_STEPS_PER_EPISODE = 300
EPS_DECAY = 0.9  # multiplicative decay applied at the start of every episode
PLOT_EVERY = 10  # save the output heat maps every this many episodes
SEED = 0

# Seed every random source used below so a fresh run is repeatable.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = gym.make("Pendulum-v1", g=9.81, render_mode=RENDER_MODE)

model = DQN(env.observation_space.shape[0], N_ACTIONS)

# Evenly spaced torques across the env's action range; index i is the torque
# applied for discrete action i (for 9 actions on [-2, 2] this is i / 2 - 2).
torques = np.linspace(env.action_space.low[0], env.action_space.high[0], N_ACTIONS)

eps = 1

try:
    for episode in range(N_EPISODES):
        # Seed the environment once; later resets continue its random stream.
        state, _ = env.reset(seed=SEED if episode == 0 else None)
        terminated = False
        truncated = False
        step = 0

        eps *= EPS_DECAY

        rewards = []

        # Stop as soon as the env reports the end of the episode (terminated
        # or truncated by its time limit); it must be reset before stepping
        # again. The step cap is an additional upper bound.
        while not (terminated or truncated) and step < MAX_STEPS_PER_EPISODE:

            # Epsilon-greedy action selection.
            if np.random.rand() < eps:
                action = np.random.randint(N_ACTIONS)
            else:
                # Greedy choice, consistent with the max over next-state
                # outputs used in the update target. argmax is valid whether
                # the outputs are probabilities or unbounded values.
                outputs = model.call(np.array([state])).numpy()[0]  # type: ignore
                action = int(np.argmax(outputs))

            # Apply the torque that corresponds to the discrete action.
            torque = np.array([torques[action]], dtype=env.action_space.dtype)
            next_state, reward, terminated, truncated, _ = env.step(torque)

            # Store the transition. Only true termination is recorded as done,
            # so a time-limit truncation still bootstraps from next_state.
            model.remember(state, action, reward, next_state, terminated)

            # One training step per environment step.
            update_model(model)

            state = next_state
            step += 1

            rewards.append(reward)

        print("Episode: {}, Steps: {}, Score: {}".format(episode, step, sum(rewards) / len(rewards)))

        if episode % PLOT_EVERY == 0:
            draw_qvalue(model, episode)
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()