In [None]:
### gravity = 10

import numpy as np
import tensorflow as tf
import random
from keras import layers, Model
import gymnasium as gym
import matplotlib.pyplot as plt

# Shared Pendulum-v1 environment used by the training loop below.
# NOTE(review): render_mode='human' presumably draws every step in a window,
# which would slow long training runs — confirm this is intended.
env=gym.make('Pendulum-v1', render_mode='human')

class DQN(Model):
    """Fully connected Q-network.

    Three ReLU hidden layers (32, 16 and 16 units) feed a linear output
    layer with one unit per action.
    """

    def __init__(self, input_state, output_state):
        """Create the layers and an Adam optimizer.

        Args:
            input_state: forwarded as ``input_dim`` to the first Dense layer.
            output_state: number of units of the linear output layer.
        """
        super().__init__()

        relu = 'relu'
        self.d1 = layers.Dense(32, input_dim=input_state, activation=relu)
        self.d2 = layers.Dense(16, activation=relu)
        self.d3 = layers.Dense(16, activation=relu)
        self.d4 = layers.Dense(output_state, activation='linear')
        self.optimizer = tf.keras.optimizers.Adam(0.001)

    def call(self, x):
        """Run ``x`` through the hidden layers and return the output layer's result."""
        for hidden in (self.d1, self.d2, self.d3):
            x = hidden(x)

        return self.d4(x)
    
class Agent():
    """Epsilon-greedy DQN agent with a replay memory.

    The agent works on a 3-dimensional state and 41 discrete actions; action
    index ``k`` (0..40) stands for the continuous action ``-2 + 0.1 * k``.
    """

    # Minimum number of stored transitions before any gradient step is taken.
    TRAIN_START = 1000
    # The oldest transition is dropped once the memory grows beyond this size.
    MEMORY_LIMIT = 20000

    def __init__(self):
        """Set the hyperparameters and create the Q-network and its optimizer."""
        self.state_size = 3
        self.action_size = 41
        self.eps = 1.0
        self.eps_decay = 0.98
        self.eps_min = 0.1
        self.batch_size = 64
        self.learning_rate = 0.001
        self.discount_factor = 0.99
        self.memory = []
        self.model = DQN(self.state_size, self.action_size)
        # Built once and reused: constructing a new Adam instance on every
        # training step would throw away its running moment estimates.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def update_eps(self):
        """Multiply epsilon by ``eps_decay``, never going below ``eps_min``."""
        self.eps = max(self.eps * self.eps_decay, self.eps_min)

    def eps_greedy(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Returns:
            With probability ``eps`` a value drawn uniformly from [-2, 2);
            otherwise ``-2 + 0.1 * k`` where ``k`` is the index of the largest
            Q-value predicted for ``state``.
        """
        # Explore with probability eps (eps = 1.0 means always explore).
        if np.random.rand() < self.eps:
            return np.random.uniform(-2, 2)

        act = np.argmax(self.model(np.array([state])))
        return -2 + act * 0.1

    def append_sample(self, n_state, action, reward, termination, state):
        """Store one transition as ``(n_state, action, reward, termination, state)``."""
        self.memory.append((n_state, action, reward, termination, state))

    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until at least ``TRAIN_START`` transitions are stored.
        ``action`` entries in the memory are used as integer column indices
        into the Q-value matrix.
        """
        if len(self.memory) < self.TRAIN_START:
            return

        if len(self.memory) > self.MEMORY_LIMIT:
            del self.memory[0]

        # random.sample needs a population at least as large as the batch.
        if len(self.memory) <= self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        n_states, actions, rewards, terminations, states = (
            np.array(column) for column in zip(*mini_batch)
        )

        n_q_val = self.model(n_states).numpy()
        # 0.0 for transitions flagged as terminal, so they do not bootstrap.
        not_done = 1.0 - terminations.astype(np.float32)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        targets = self.model(states).numpy().copy()
        targets[np.arange(len(rewards)), actions] = (
            rewards + self.discount_factor * np.max(n_q_val, axis=1) * not_done
        )

        with tf.GradientTape() as tape:
            q_val = self.model(states)
            loss = tf.keras.losses.mse(targets, q_val)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

# Training run: `epi` episodes of at most `max_steps` steps each.
epi = 1000
max_steps = 1000
agent = Agent()

try:
    for i in range(epi):

        state, _ = env.reset()
        agent.update_eps()
        termination = False
        step = 0
        max_reward = -20

        # The truncation flag returned by env.step is discarded, so only
        # `termination` and the step cap end an episode.
        while not termination and step < max_steps:

            action = agent.eps_greedy(state)
            n_state, reward, termination, _, _ = env.step([action])

            # Map the continuous action in [-2, 2] to the nearest of the 41
            # bins (-2 + 0.1 * k). Rounding instead of truncating keeps a
            # greedy action on its own bin despite floating-point error.
            action_index = int(round((action + 2) * 10))

            if termination:
                reward = 1000

            max_reward = max(reward, max_reward)

            agent.append_sample(n_state, action_index, reward, termination, state)
            agent.train_model()

            state = n_state
            step += 1

        # Episode number, steps taken, best single-step reward of the episode.
        print(i + 1, step, max_reward)
finally:
    # Release the environment even if training is interrupted.
    env.close()

In [None]:
### gravity = 10, 41 discrete actions (action_size=41)

import numpy as np
import tensorflow as tf
import random
from keras import layers, Model
import gymnasium as gym

# Shared Pendulum-v1 environment used by the training loop below.
# NOTE(review): render_mode='human' presumably draws every step in a window,
# which would slow long training runs — confirm this is intended.
env=gym.make('Pendulum-v1', render_mode='human')

class DQN(Model):
    """Fully connected Q-network.

    Three ReLU hidden layers (32, 16 and 16 units) feed a linear output
    layer with one unit per action.
    """

    def __init__(self, input_state, output_state):
        """Create the layers and an Adam optimizer.

        Args:
            input_state: forwarded as ``input_dim`` to the first Dense layer.
            output_state: number of units of the linear output layer.
        """
        super().__init__()

        relu = 'relu'
        self.d1 = layers.Dense(32, input_dim=input_state, activation=relu)
        self.d2 = layers.Dense(16, activation=relu)
        self.d3 = layers.Dense(16, activation=relu)
        self.d4 = layers.Dense(output_state, activation='linear')
        self.optimizer = tf.keras.optimizers.Adam(0.001)

    def call(self, x):
        """Run ``x`` through the hidden layers and return the output layer's result."""
        for hidden in (self.d1, self.d2, self.d3):
            x = hidden(x)
        return self.d4(x)
    
class Agent():
    """Epsilon-greedy DQN agent with a replay memory.

    The agent works on a 3-dimensional state and 41 discrete actions; action
    index ``k`` (0..40) stands for the continuous action ``-2 + 0.1 * k``.
    """

    # Minimum number of stored transitions before any gradient step is taken.
    TRAIN_START = 1000
    # The oldest transition is dropped once the memory grows beyond this size.
    MEMORY_LIMIT = 20000

    def __init__(self):
        """Set the hyperparameters and create the Q-network and its optimizer."""
        self.state_size = 3
        self.action_size = 41
        self.eps = 1.0
        self.eps_decay = 0.98
        self.eps_min = 0.1
        self.batch_size = 64
        self.learning_rate = 0.001
        self.discount_factor = 0.99
        self.memory = []
        self.model = DQN(self.state_size, self.action_size)
        # Built once and reused: constructing a new Adam instance on every
        # training step would throw away its running moment estimates.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def update_eps(self):
        """Multiply epsilon by ``eps_decay``, never going below ``eps_min``."""
        self.eps = max(self.eps * self.eps_decay, self.eps_min)

    def eps_greedy(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Returns:
            With probability ``eps`` a value drawn uniformly from [-2, 2);
            otherwise ``-2 + 0.1 * k`` where ``k`` is the index of the largest
            Q-value predicted for ``state``.
        """
        if np.random.rand() < self.eps:
            return np.random.uniform(-2, 2)

        act = np.argmax(self.model(np.array([state])))
        return -2 + act * 0.1

    def append_sample(self, n_state, action, reward, termination, state):
        """Store one transition as ``(n_state, action, reward, termination, state)``."""
        self.memory.append((n_state, action, reward, termination, state))

    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until at least ``TRAIN_START`` transitions are stored.
        ``action`` entries in the memory are used as integer column indices
        into the Q-value matrix.
        """
        if len(self.memory) < self.TRAIN_START:
            return

        if len(self.memory) > self.MEMORY_LIMIT:
            del self.memory[0]

        # random.sample needs a population at least as large as the batch.
        if len(self.memory) <= self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        n_states, actions, rewards, terminations, states = (
            np.array(column) for column in zip(*mini_batch)
        )

        n_q_val = self.model(n_states).numpy()
        # 0.0 for transitions flagged as terminal, so they do not bootstrap.
        not_done = 1.0 - terminations.astype(np.float32)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        targets = self.model(states).numpy().copy()
        targets[np.arange(len(rewards)), actions] = (
            rewards + self.discount_factor * np.max(n_q_val, axis=1) * not_done
        )

        with tf.GradientTape() as tape:
            q_val = self.model(states)
            loss = tf.keras.losses.mse(targets, q_val)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

# Training run in which an episode counts as a success (and ends) as soon as
# a single-step reward reaches `success_reward`.
epi = 100
max_steps = 1000
success_reward = -0.015
agent = Agent()
memory_step = []  # step index at which each successful episode ended
count = 0         # number of successful episodes

try:
    for i in range(epi):

        state, _ = env.reset()
        agent.update_eps()
        termination = False
        step = 0
        max_reward = -20

        # The truncation flag returned by env.step is discarded, so only
        # `termination` and the step cap end an episode.
        while not termination and step < max_steps:

            action = agent.eps_greedy(state)
            n_state, reward, termination, _, _ = env.step([action])

            # Map the continuous action in [-2, 2] to the nearest of the 41
            # bins (-2 + 0.1 * k). Rounding instead of truncating keeps a
            # greedy action on its own bin despite floating-point error.
            action_index = int(round((action + 2) * 10))

            if reward >= success_reward:
                termination = True
                # Recorded before the increment below, i.e. zero-based.
                memory_step.append(step)
                count += 1

            max_reward = max(reward, max_reward)

            agent.append_sample(n_state, action_index, reward, termination, state)
            agent.train_model()

            state = n_state
            step += 1

        # Episode number, steps taken, best single-step reward of the episode.
        print(i + 1, step, max_reward)

    agent.model.save_weights("./save_model/model", save_format="tf")
finally:
    # Release the environment even if training is interrupted.
    env.close()

# Without any success there is nothing to average; report NaN instead of
# dividing by zero.
average_step = sum(memory_step) / count if count else float("nan")
print('성공횟수 : {}, 평균 step : {}'.format(count, average_step))

In [None]:
### gravity = 10, 41 discrete actions (action_size=41)

import numpy as np
import tensorflow as tf
import random
from keras import layers, Model
import gymnasium as gym
import time

# Shared Pendulum-v1 environment used by the evaluation loop below.
# NOTE(review): render_mode='human' presumably draws every step in a window —
# confirm against the Gymnasium version in use.
env=gym.make('Pendulum-v1', render_mode='human')

class DQN(Model):
    """Fully connected Q-network.

    Three ReLU hidden layers (32, 16 and 16 units) feed a linear output
    layer with one unit per action.
    """

    def __init__(self, input_state, output_state):
        """Create the layers and an Adam optimizer.

        Args:
            input_state: forwarded as ``input_dim`` to the first Dense layer.
            output_state: number of units of the linear output layer.
        """
        super().__init__()

        relu = 'relu'
        self.d1 = layers.Dense(32, input_dim=input_state, activation=relu)
        self.d2 = layers.Dense(16, activation=relu)
        self.d3 = layers.Dense(16, activation=relu)
        self.d4 = layers.Dense(output_state, activation='linear')
        self.optimizer = tf.keras.optimizers.Adam(0.001)

    def call(self, x):
        """Run ``x`` through the hidden layers and return the output layer's result."""
        for hidden in (self.d1, self.d2, self.d3):
            x = hidden(x)
        return self.d4(x)
    
class Agent():
    """Epsilon-greedy DQN agent with a replay memory.

    The agent works on a 3-dimensional state and 41 discrete actions; action
    index ``k`` (0..40) stands for the continuous action ``-2 + 0.1 * k``.
    """

    # Minimum number of stored transitions before any gradient step is taken.
    TRAIN_START = 1000
    # The oldest transition is dropped once the memory grows beyond this size.
    MEMORY_LIMIT = 20000

    def __init__(self):
        """Set the hyperparameters and create the Q-network and its optimizer."""
        self.state_size = 3
        self.action_size = 41
        self.eps = 1.0
        self.eps_decay = 0.98
        self.eps_min = 0.1
        self.batch_size = 64
        self.learning_rate = 0.001
        self.discount_factor = 0.99
        self.memory = []
        self.model = DQN(self.state_size, self.action_size)
        # Built once and reused: constructing a new Adam instance on every
        # training step would throw away its running moment estimates.
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)

    def update_eps(self):
        """Multiply epsilon by ``eps_decay``, never going below ``eps_min``."""
        self.eps = max(self.eps * self.eps_decay, self.eps_min)

    def eps_greedy(self, state):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Returns:
            With probability ``eps`` a value drawn uniformly from [-2, 2);
            otherwise ``-2 + 0.1 * k`` where ``k`` is the index of the largest
            Q-value predicted for ``state``.
        """
        if np.random.rand() < self.eps:
            return np.random.uniform(-2, 2)

        act = np.argmax(self.model(np.array([state])))
        return -2 + act * 0.1

    def append_sample(self, n_state, action, reward, termination, state):
        """Store one transition as ``(n_state, action, reward, termination, state)``."""
        self.memory.append((n_state, action, reward, termination, state))

    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until at least ``TRAIN_START`` transitions are stored.
        ``action`` entries in the memory are used as integer column indices
        into the Q-value matrix.
        """
        if len(self.memory) < self.TRAIN_START:
            return

        if len(self.memory) > self.MEMORY_LIMIT:
            del self.memory[0]

        # random.sample needs a population at least as large as the batch.
        if len(self.memory) <= self.batch_size:
            return

        mini_batch = random.sample(self.memory, self.batch_size)
        n_states, actions, rewards, terminations, states = (
            np.array(column) for column in zip(*mini_batch)
        )

        n_q_val = self.model(n_states).numpy()
        # 0.0 for transitions flagged as terminal, so they do not bootstrap.
        not_done = 1.0 - terminations.astype(np.float32)

        # Start from the current predictions so that only the entry of the
        # action actually taken contributes to the loss.
        targets = self.model(states).numpy().copy()
        targets[np.arange(len(rewards)), actions] = (
            rewards + self.discount_factor * np.max(n_q_val, axis=1) * not_done
        )

        with tf.GradientTape() as tape:
            q_val = self.model(states)
            loss = tf.keras.losses.mse(targets, q_val)

        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))

# Evaluation run: load the saved weights and play a few nearly greedy episodes.
agent = Agent()

agent.model.load_weights('./save_model/model')
agent.eps = 0.01

eval_episodes = 5
# Same per-episode cap as the training loops. env.step's truncation flag is
# discarded, so without a cap an episode that never reaches the success
# reward would never end.
max_steps = 1000
success_reward = -0.015

try:
    for i in range(eval_episodes):

        state, _ = env.reset()
        termination = False
        max_reward = -20
        step = 0
        env.render()

        while not termination and step < max_steps:

            action = agent.eps_greedy(state)
            n_state, reward, termination, _, _ = env.step([action])

            if reward >= success_reward:
                termination = True

            max_reward = max(reward, max_reward)
            state = n_state
            step += 1

        # Episode number, steps taken, best single-step reward of the episode.
        print(i + 1, step, max_reward)
finally:
    # Release the environment even if the run is interrupted.
    env.close()
    