Author : educational_ai ©educational_ai

# Step7 first DQN

# 7 まずは、DQNをやってみよう

 いよいよ、機械学習をします。boardのデータをそのまま使うのではなくて、特殊な計算をして、特徴量を使ってやることにします。全体を見て、動くのを確認してから、細かいところを説明していきます。

まずは、基本的な部分を作っていきます。

# Library

In [1]:
import os

import numpy as np
import random
import dezero_emb as dezero
from puyopuyo import *
from collections import deque


dezero_embは、機械学習のためのフレームワークです。

# CFG_ML
fallblock.pyにも同じように、設定ファイルを作っています。機械学習に関する設定のみ入れます

save_pathが存在しないと、モデルの保存ができないので、save_pathを作ります。

In [2]:
class CFG_ML:
  lr = 1e-3
  buffer_size = 30000
  num_epochs = 300
  final_epsilon = 1e-3
  initial_epsilon = 1
  num_decay_epochs = 200
  batch_size = 512
  gamma = 0.99
  save_interval = 100
  saved_path = "trained_models"
  sync_interval = 10

if not os.path.isdir(CFG_ML.saved_path):
    os.makedirs(CFG_ML.saved_path)


# deep_q_network

In [3]:
class DQNet(dezero.Models.Model):
  def __init__(self):
    super().__init__()
    self.l1 = dezero.L.Linear(128)
    self.l2 = dezero.L.Linear(128)
    self.l3 = dezero.L.Linear(1)

  def forward(self, x):
    x = dezero.F.relu(self.l1(x))
    x = dezero.F.relu(self.l2(x))
    x = self.l3(x)
    return x

# Replay buffer

In [4]:
class ReplayBuffer:
  def __init__(self, buffer_size, batch_size):
    self.buffer = deque(maxlen=buffer_size)
    self.batch_size = batch_size

  def add(self, state, action, reward, next_state, done):
    data = (state, action, reward, next_state, done)
    self.buffer.append(data)

  def __len__(self):
    return len(self.buffer)

  def get_batch(self):
    data = random.sample(self.buffer, self.batch_size)

    state = np.stack([x[0] for x in data])
    action = np.array([x[1] for x in data])
    reward = np.array([x[2] for x in data])
    next_state = np.stack([x[3] for x in data])
    done = np.array([x[4] for x in data]).astype(np.int32)
    return state, action, reward, next_state, done


# DQN agent

In [7]:
import copy

class DQNAgent:
  def __init__(self):
    self.epsilon = CFG_ML.initial_epsilon
    self.action_size = 2

    self.replay_buffer = ReplayBuffer(CFG_ML.buffer_size, CFG_ML.batch_size)
    self.qnet = DQNet()
    self.qnet_target = DQNet()
    self.optimizer = dezero.optimizers.Adam(CFG_ML.lr)
    self.optimizer.setup(self.qnet)

  def __call__(self, board, puyo):
    action_list = utils.create_action_list(board)

    next_boards = []
    next_reward =[]
    action =(2, 1)
    if len(action_list):
      for action in action_list:
        next_board, reward, done = utils.next_board(board, puyo, action)
        if not done:
          next_boards.append(next_board)
          next_reward.append(reward)
      
      next_boards = np.stack(next_boards)
      predictions = self.eval2(next_boards)
      
      next_reward =np.array(next_reward)[:, np.newaxis]
      predictions += dezero.Variable(next_reward)
      index = predictions.data.argmax()
      action = action_list[index]
    return action


  def boardtostate(self, board):
    cont_b = 2 ** np.arange(CFG.Width,dtype=np.int32)
    b1 = np.zeros(CFG.Height * CFG.Width,dtype = np.int32).reshape(CFG.Height , CFG.Width)
    b1[board == 1] = 1
    b2 = np.zeros(CFG.Height * CFG.Width,dtype = np.int32).reshape(CFG.Height , CFG.Width)
    b2[board == 2] = 1
    b3 = np.zeros(CFG.Height * CFG.Width,dtype = np.int32).reshape(CFG.Height , CFG.Width)
    b3[board == 3] = 1
    b4 = np.zeros(CFG.Height * CFG.Width,dtype = np.int32).reshape(CFG.Height , CFG.Width)
    b4[board == 4] = 1
    board_list =np.concatenate([b1,b2,b3,b4])
    state =  board_list.dot(cont_b)      
    return state

  def eval(self, board):
    state = self.boardtostate(board)      
    return self.qnet_target(state)

  def eval2(self, boards):
    states = []
    for i in range(boards.shape[0]):
      state = self.boardtostate(boards[i])
      states.append(state)
    states = np.stack(states)      
    return self.qnet_target(states)


  def update(self, board, action, reward, next_board, done):
    state =  self.boardtostate(board) 
    next_state =  self.boardtostate(next_board)      
    
    self.replay_buffer.add(state, action, reward, next_state, done)
    if not done:
      return
    if len(self.replay_buffer) < CFG_ML.batch_size:
      return
    state, action, reward, next_state, done = self.replay_buffer.get_batch()

    qs = self.qnet(state)
    next_qs = self.qnet_target(next_state)
    reward =reward[:,np.newaxis]
    done =done[:,np.newaxis]
    target = reward + (1 - done) * CFG_ML.gamma * next_qs

    self.qnet.cleargrads()
    loss = dezero.F.mean_squared_error(qs, target)
    loss.backward()
    self.optimizer.update()


  def sync_qnet(self):
    self.qnet_target = copy.deepcopy(self.qnet)

  def save_model(self,filename):
    self.qnet.save_weights(filename)

  def load_model(self,filename):
    self.qnet.load_weights(filename)
    self.qnet_target.load_weights(filename)


# Train

In [8]:
np.random.seed(seed=123)
env = EnvPuyopuyo()
agent = DQNAgent()


for epochs in range(CFG_ML.num_epochs):
  epsilon = CFG_ML.final_epsilon + (max(CFG_ML.num_decay_epochs - epochs, 0) * (
            CFG_ML.initial_epsilon - CFG_ML.final_epsilon) / CFG_ML.num_decay_epochs)

  board, puyo = env.reset()
  done = False
  final_score = 0
  final_pieces = 0

  while not done:
    u = random.random()
    random_action = u <= epsilon

    action = agent(board, puyo)

    if random_action:
      action_list = utils.create_action_list(board)
      if len(action_list):
        index = random.randint(0, len(action_list) - 1)
        action = action_list[index]

    next_board, puyo, reward, done = env.step(action)
    agent.update(board, action, reward, next_board, done)
    board = next_board
    final_score += reward
    final_pieces += 2
  
  if epochs % CFG_ML.sync_interval == 0:
    agent.sync_qnet()

  print("Epoch: {}/{}, Score: {}, pieces {}".format(
    epochs,
    CFG_ML.num_epochs,
    final_score,
    final_pieces))

  if epochs > 0 and epochs % CFG_ML.save_interval == 0:
    agent.save_model("{}/puyopuyo_{}".format(CFG_ML.saved_path, epochs))

agent.save_model("{}/puyopuyo".format(CFG_ML.saved_path))



Epoch: 0/300, Score: 3420, pieces 108
Epoch: 1/300, Score: 2300, pieces 78
Epoch: 2/300, Score: 5510, pieces 134
Epoch: 3/300, Score: 1120, pieces 48
Epoch: 4/300, Score: 1500, pieces 96
Epoch: 5/300, Score: 3510, pieces 106
Epoch: 6/300, Score: 6310, pieces 140
Epoch: 7/300, Score: 3270, pieces 86
Epoch: 8/300, Score: 4340, pieces 86
Epoch: 9/300, Score: 5040, pieces 110
Epoch: 10/300, Score: 2960, pieces 112
Epoch: 11/300, Score: 6290, pieces 124
Epoch: 12/300, Score: 26370, pieces 242
Epoch: 13/300, Score: 3670, pieces 124
Epoch: 14/300, Score: 9250, pieces 132
Epoch: 15/300, Score: 4240, pieces 122
Epoch: 16/300, Score: 24840, pieces 218
Epoch: 17/300, Score: 3030, pieces 106
Epoch: 18/300, Score: 7990, pieces 124
Epoch: 19/300, Score: 8930, pieces 114
Epoch: 20/300, Score: 38280, pieces 294
Epoch: 21/300, Score: 1750, pieces 62
Epoch: 22/300, Score: 2400, pieces 74
Epoch: 23/300, Score: 7900, pieces 154
Epoch: 24/300, Score: 10720, pieces 158
Epoch: 25/300, Score: 28100, pieces 16

19分でした。