In [1]:
import os
import gym
import time
import threading
import random
import numpy as np
import tensorflow as tf

from skimage.color import rgb2gray
from skimage.transform import resize
from tensorflow.compat.v1.train import AdamOptimizer
from tensorflow.keras.layers import Conv2D, Flatten, Dense
from collections import deque
from tensorflow.keras.optimizers import Adam

In [2]:
class DQN(tf.keras.Model):
    """Convolutional Q-network: three conv layers, a hidden dense layer and a linear head.

    Args:
        action_size: Number of units of the output layer (one Q-value per action).
        state_size: Forwarded to the first convolution as ``input_shape``.
    """

    def __init__(self, action_size, state_size):
        super().__init__()
        relu = 'relu'

        # Convolutional feature extractor
        self.conv1 = Conv2D(32, (8, 8), strides=(4, 4), activation=relu,
                            input_shape=state_size)
        self.conv2 = Conv2D(64, (4, 4), strides=(2, 2), activation=relu)
        self.conv3 = Conv2D(64, (3, 3), strides=(1, 1), activation=relu)

        # Fully connected part; the output layer has no activation
        self.flatten = Flatten()
        self.fc = Dense(512, activation=relu)
        self.fc_out = Dense(action_size)

    def call(self, x):
        """Feed ``x`` through every layer in order and return the output of ``fc_out``."""
        pipeline = (self.conv1, self.conv2, self.conv3,
                    self.flatten, self.fc, self.fc_out)
        out = x
        for layer in pipeline:
            out = layer(out)
        return out

In [3]:
# Online Q-network shared by all workers; Runner.run applies its gradient updates to it.
global_model = DQN(action_size=3, state_size=(84, 84, 4))
# Target network; global_update_model() overwrites its weights with global_model's.
global_target_model = DQN(action_size=3, state_size=(84, 84, 4))

In [4]:
# Replay buffer shared by every worker: DQNAgent.append_sample pushes
# (history, action, reward, next_history, dead) tuples and the training code
# samples minibatches from it. Once 100000 entries are stored, appending a new
# one discards the oldest.
global_memory = deque(maxlen=100000)

In [5]:
# Start a fresh CSV log that contains only the header row; Runner.run appends
# one line per finished episode. The context manager closes the handle even if
# the write raises.
with open('./log.csv', 'w') as file:
    file.write('episode,score,scoreMax,scoreAvg,memoryLength,epsilon,qAvg,avgLoss\n')

In [6]:
def global_update_model():
    """Sync the target network with the online network and checkpoint the latter.

    Copies the weights of ``global_model`` into ``global_target_model``, writes
    the weights of ``global_model`` to ``./save_model/model`` in TensorFlow
    format, and prints a progress marker.
    """
    online_weights = global_model.get_weights()
    global_target_model.set_weights(online_weights)
    global_model.save_weights("./save_model/model", save_format="tf")
    print('global_model_update')

In [7]:
def pre_processing(observe):
    """Turn a raw RGB frame into an 84x84 ``uint8`` grayscale image.

    The frame is converted with ``rgb2gray``, resized to 84x84 with
    ``mode='constant'``, multiplied by 255 and cast to ``np.uint8``.

    Args:
        observe: Frame as returned by the environment.

    Returns:
        The processed frame as a ``np.uint8`` array.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)

In [8]:
class Global_DQNAgent:
    """Coordinator that holds the DQN hyperparameters and launches the worker threads.

    Args:
        action_size: Number of discrete actions (depth of the one-hot action mask).
        state_size: Shape of one stacked observation; defaults to ``(84, 84, 4)``.
    """

    def __init__(self, action_size, state_size=(84, 84, 4)):
        self.render = False

        # Sizes of the state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # DQN hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.02
        self.exploration_steps = 1000000.
        # Amount subtracted from epsilon on every training step
        self.epsilon_decay_step = (
            (self.epsilon_start - self.epsilon_end) / self.exploration_steps)
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.no_op_steps = 30

        # Optimizer used to update the global model
        self.optimizer = Adam(self.learning_rate, clipnorm=10.)

        # Initialise the target model from the global model
        global_update_model()

        self.avg_q_max, self.avg_loss = 0, 0
        self.writer = tf.summary.create_file_writer('summary/breakout_dqn')
        self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')

        # Number of Runner threads created by train()
        self.threads = 1

    def train(self):
        """Create ``self.threads`` Runner workers, then start them one by one."""
        workers = list(map(Runner, range(self.threads)))

        for worker_id, worker in enumerate(workers):
            print("Start worker #{:d}".format(worker_id))
            worker.start()

    def train_model(self):
        """Decay epsilon and apply one Huber-loss gradient step to ``global_model``.

        A minibatch of ``self.batch_size`` transitions is drawn from
        ``global_memory``; the loss value is added to ``self.avg_loss``.
        """
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        # Draw a random minibatch from the replay memory
        minibatch = random.sample(global_memory, self.batch_size)

        history = np.array([h[0] / 255. for h, _, _, _, _ in minibatch],
                           dtype=np.float32)
        actions = np.array([a for _, a, _, _, _ in minibatch])
        rewards = np.array([r for _, _, r, _, _ in minibatch])
        next_history = np.array([nh[0] / 255. for _, _, _, nh, _ in minibatch],
                                dtype=np.float32)
        dones = np.array([d for _, _, _, _, d in minibatch])

        # Parameters to be trained
        model_params = global_model.trainable_variables
        with tf.GradientTape() as tape:
            # Q-value of the action that was actually taken in each sample
            q_values = global_model(history)
            action_mask = tf.one_hot(actions, self.action_size)
            predicts = tf.reduce_sum(action_mask * q_values, axis=1)

            # Target network's Q-values for the next state
            target_predicts = global_target_model(next_history)

            # Bellman optimality target built from the maximum next Q-value
            max_q = np.amax(target_predicts, axis=1)
            targets = rewards + (1 - dones) * self.discount_factor * max_q

            # Huber loss: quadratic up to an error of 1, linear beyond it
            abs_error = tf.abs(targets - predicts)
            clipped_error = tf.clip_by_value(abs_error, 0.0, 1.0)
            loss = tf.reduce_mean(
                0.5 * tf.square(clipped_error) + (abs_error - clipped_error))

        self.avg_loss += loss.numpy()

        # Update the model in the direction that reduces the loss
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))

In [9]:
class Runner(threading.Thread):
    """Worker thread that plays Breakout and trains the shared global network.

    Each worker creates its own gym environment and a ``DQNAgent`` for acting,
    stores transitions in ``global_memory`` and, once that buffer holds
    ``agent.train_start`` samples, applies one gradient step to
    ``global_model`` per environment step.

    Args:
        num: Worker index. Only worker 0 appends to ``./log.csv``.
    """

    def __init__(self, num):
        # Initialise the Thread base class before attaching our own attributes.
        threading.Thread.__init__(self)
        self.myNum = num

        self.state_size = (84, 84, 4)
        self.action_size = 3

        # DQN hyperparameters
        self.discount_factor = 0.99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.02
        self.exploration_steps = 1000000.
        self.epsilon_decay_step = self.epsilon_start - self.epsilon_end
        self.epsilon_decay_step /= self.exploration_steps
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.no_op_steps = 30

        # Optimizer used to update the global model
        self.optimizer = Adam(self.learning_rate, clipnorm=10.)

        # avg_loss is kept as an attribute for backward compatibility; the
        # per-episode loss is accumulated on the agent, where it is logged and
        # reset together with avg_q_max.
        self.avg_q_max, self.avg_loss = 0, 0
        self.writer = tf.summary.create_file_writer('summary/breakout_dqn')
        self.model_path = os.path.join(os.getcwd(), 'save_model', 'model')

        self.threads = 1

    @staticmethod
    def _restart_history(frame):
        """Return a history whose four channels are all copies of ``frame``.

        Args:
            frame: Preprocessed frame of shape ``(1, 84, 84, 1)``.

        Returns:
            Array of shape ``(1, 84, 84, 4)`` with ``frame`` repeated along the
            last axis. (Stacking the already 4-D frame on axis 2 and reshaping,
            as done previously, interleaves pixels from different columns.)
        """
        return np.repeat(frame, 4, axis=3)

    @staticmethod
    def _format_log_row(e, score, score_max, score_avg, memory_len,
                        epsilon, avg_q, avg_loss):
        """Format one CSV line of ``./log.csv`` (terminated by a newline)."""
        return ("{:5d},{:4.1f},{:4.1f},{:4.1f},{:5d},{:.3f},{:3.2f},{:3.2f}\n"
                .format(e, score, score_max, score_avg, memory_len,
                        epsilon, avg_q, avg_loss))

    def _train_step(self, agent):
        """Decay ``agent.epsilon`` and fit ``global_model`` on one replay minibatch.

        Args:
            agent: The worker's ``DQNAgent`` whose epsilon is annealed.

        Returns:
            The Huber loss of the minibatch as a NumPy scalar.
        """
        if agent.epsilon > agent.epsilon_end:
            agent.epsilon -= agent.epsilon_decay_step

        # Draw a random minibatch from the shared replay memory. Batch-local
        # names are used so the caller's current `history` is never clobbered.
        batch = random.sample(global_memory, self.batch_size)

        batch_history = np.array([sample[0][0] / 255. for sample in batch],
                                 dtype=np.float32)
        actions = np.array([sample[1] for sample in batch])
        rewards = np.array([sample[2] for sample in batch])
        batch_next_history = np.array([sample[3][0] / 255. for sample in batch],
                                      dtype=np.float32)
        dones = np.array([sample[4] for sample in batch])

        with tf.GradientTape() as tape:
            # Q-value of the action that was actually taken in each sample
            predicts = global_model(batch_history)
            one_hot_action = tf.one_hot(actions, self.action_size)
            predicts = tf.reduce_sum(one_hot_action * predicts, axis=1)

            # Target network's Q-values for the next state
            target_predicts = global_target_model(batch_next_history)

            # Bellman optimality target built from the maximum next Q-value
            max_q = np.amax(target_predicts, axis=1)
            targets = rewards + (1 - dones) * self.discount_factor * max_q

            # Huber loss: quadratic up to an error of 1, linear beyond it
            error = tf.abs(targets - predicts)
            quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
            linear_part = error - quadratic_part
            loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        # Read the variable list only after the forward pass: the global model
        # creates its weights on first call, so reading it earlier can yield an
        # empty list and turn the first update into a no-op.
        model_params = global_model.trainable_variables
        grads = tape.gradient(loss, model_params)
        self.optimizer.apply_gradients(zip(grads, model_params))
        return loss.numpy()

    def run(self):
        """Thread body: create the environment, run all episodes, then close it."""
        env = gym.make('BreakoutDeterministic-v4')
        try:
            self._run_episodes(env)
        finally:
            # Release the environment even if an episode raises.
            env.close()

    def _run_episodes(self, env):
        """Run the episode loop on ``env``: act, store transitions, train and log."""
        agent = DQNAgent(action_size=3, myNum=self.myNum)

        global_step = 0
        score_avg = 0
        score_max = 0

        # Maps the agent's action index to the environment action that is sent.
        # Original author's note: 1 = stay, 2 = left, 3 = right — TODO confirm
        # against the environment's action meanings.
        action_dict = {0: 1, 1: 2, 2: 3, 3: 3}

        num_episode = 50000
        for e in range(num_episode):
            done = False
            dead = False

            step, score, start_life = 0, 0, 5

            # Reset the environment
            observe = env.reset()

            # Send action 1 for a randomly chosen number of frames
            for _ in range(random.randint(1, agent.no_op_steps)):
                observe, _, _, _ = env.step(1)

            # Preprocess the frame and stack four copies as the initial input
            state = pre_processing(observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 84, 84, 4))

            while not done:
                if agent.render:
                    env.render()
                global_step += 1
                step += 1

                # Choose an action from the current history
                action = agent.get_action(history)
                real_action = action_dict[action]

                # After losing a life, send environment action 1 to resume play
                if dead:
                    action, real_action, dead = 0, 1, False

                # Advance the environment by one time step
                observe, reward, done, info = env.step(real_action)
                # Preprocess the new frame and push it in front of the history
                next_state = pre_processing(observe)
                next_state = np.reshape([next_state], (1, 84, 84, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)

                agent.avg_q_max += np.amax(agent.model(np.float32(history / 255.))[0])

                if start_life > info['ale.lives']:
                    dead = True
                    start_life = info['ale.lives']

                score += reward
                reward = np.clip(reward, -1., 1.)
                # Store the transition in the shared replay memory
                agent.append_sample(history, action, reward, next_history, dead)

                # Start training once the replay memory is large enough
                if len(global_memory) >= agent.train_start:
                    # Accumulate on the agent: that is the counter the episode
                    # log reads and resets.
                    agent.avg_loss += self._train_step(agent)

                    # Pull the freshly updated global weights into the local model
                    agent.update_model()

                    # Periodically sync the target network and save a checkpoint
                    if global_step % agent.update_target_rate == 0:
                        global_update_model()

                if dead:
                    history = self._restart_history(next_state)
                else:
                    history = next_history

                if done:
                    # Record the training statistics of the finished episode
                    score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
                    score_max = score if score > score_max else score_max

                    if self.myNum == 0:
                        with open('./log.csv', 'a') as log_file:
                            log_file.write(self._format_log_row(
                                e, score, score_max, score_avg,
                                len(global_memory), agent.epsilon,
                                agent.avg_q_max / float(step),
                                agent.avg_loss / float(step)))

                    agent.avg_q_max, agent.avg_loss = 0, 0

In [10]:
class DQNAgent:
    """Per-worker acting agent that keeps a local copy of the global Q-network.

    Args:
        action_size: Number of discrete actions the agent chooses from.
        state_size: Shape passed to the local ``DQN``; defaults to ``(84, 84, 4)``.
        myNum: Worker index; rendering is enabled only when it equals 0.
    """

    def __init__(self, action_size, state_size=(84, 84, 4), myNum=0):
        # Local network, immediately synced from the global one.
        # NOTE(review): this sync runs before either network has processed an
        # input — confirm the weights already exist here, otherwise the first
        # copy may transfer nothing.
        self.model = DQN(action_size, state_size)
        self.update_model()

        self.state_size = state_size
        self.action_size = action_size

        # Exploration schedule: epsilon goes from epsilon_start towards
        # epsilon_end in steps of epsilon_decay_step
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.02
        self.exploration_steps = 1000000.
        self.epsilon_decay_step = (
            (self.epsilon_start - self.epsilon_end) / self.exploration_steps)

        # Episode / training settings read by the worker loop
        self.no_op_steps = 30
        self.train_start = 50000
        self.update_target_rate = 10000

        # Running sums that the worker logs and resets after each episode
        self.avg_q_max, self.avg_loss = 0, 0

        # Only worker 0 renders the environment
        self.render = bool(myNum == 0)

    def update_model(self):
        """Overwrite the local model's weights with those of ``global_model``."""
        global_weights = global_model.get_weights()
        self.model.set_weights(global_weights)

    def get_action(self, history):
        """Pick an action epsilon-greedily.

        Args:
            history: Stacked frames; divided by 255 and cast to ``float32``
                before being fed to the model.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            argmax of the model's Q-values for the first batch entry.
        """
        scaled = np.float32(history / 255.0)
        explore = np.random.rand() <= self.epsilon
        if explore:
            return random.randrange(self.action_size)
        return np.argmax(self.model(scaled)[0])

    def append_sample(self, history, action, reward, next_history, dead):
        """Append one transition tuple to the shared ``global_memory``."""
        transition = (history, action, reward, next_history, dead)
        global_memory.append(transition)

In [11]:
# Notebook entry point: build the coordinating agent and launch the workers.
if __name__ == "__main__":
    # Constructing the agent also runs global_update_model() once.
    global_agent = Global_DQNAgent(action_size=3)
    # train() starts the Runner threads and returns without joining them.
    global_agent.train()

global_model_update
Start worker #0
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
global_model_update
glob