In [26]:
import numpy as np
import random
import copy
import datetime
import platform
import torch
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
from mlagents_envs.environment import UnityEnvironment, ActionTuple
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel

In [27]:
state_size = [3*2, 64, 84]
action_size = 4

load_model = False
train_model = True

batch_size = 32
mem_maxlen = 10000
discount_factor = 0.9
learning_rate = 0.00025

run_step = 50000 if train_model else 0
test_step = 5000
train_start_step = 5000
target_update_step = 500

print_interval = 10
save_interval = 100

epsilon_eval = 0.05
epsilon_init = 1.0 if train_model else epsilon_eval
epsilon_min = 0.1
explore_step = run_step * 0.8
epsilon_delta = (epsilon_init - epsilon_min) / explore_step if train_model else 0

VISUAL_OBS = 0
GOAL_OBS = 1
VECTOR_OBS = 2
OBS = VISUAL_OBS

In [28]:
game = 'grid'
os_name = platform.system()
if os_name == 'Windows':
    env_name = f'../ML_Agents_Project/env/{game}'

date_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
save_path = f'./saved_models/{game}/DQN/{date_time}'
load_path = f'./saved_models/{game}/DQN/202302211540'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(os_name)

cuda
Windows


In [29]:
class DQN(torch.nn.Module):
    def __init__(self, **kwargs):
        super(DQN, self).__init__(**kwargs)
        self.conv1 = torch.nn.Conv2d(in_channels=state_size[0], out_channels=32, kernel_size=8, stride=4)
        dim1 = ((state_size[1] - 8) // 4 + 1, (state_size[2] - 8) // 4 + 1)
        self.conv2 = torch.nn.Conv2d(
            in_channels=32, out_channels=64, kernel_size=4, stride=2)
        dim2 = ((dim1[0] - 4) // 2 + 1, (dim1[1] - 4) // 2 + 1)
        self.conv3 = torch.nn.Conv2d(
            in_channels=64, out_channels=64, kernel_size=3, stride=1)
        dim3 = ((dim2[0] - 3) // 1 + 1, (dim2[1] - 3) // 1 + 1)

        self.flat = torch.nn.Flatten()
        self.fc1 = torch.nn.Linear(64*dim3[0] * dim3[1], 512)
        self.q = torch.nn.Linear(512, action_size)

    def forward(self, x):
        x = x.permute(0, 3, 1, 2)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.flat(x)
        x = F.relu(self.fc1(x))
        return self.q(x)


In [30]:
class DQNAgent:
    def __init__(self):
        self.network = DQN().to(device)
        self.target_network = copy.deepcopy(self.network)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=learning_rate, )
        self.memory = deque(maxlen=mem_maxlen)
        self.epsilon = epsilon_init
        self.writer = SummaryWriter(save_path)

        if load_model == True:
            print(f"... Load Model from {load_path} / ckpt...")
            checkpoint = torch.load(load_path+'/ckpt', map_location=device)
            self.network.load_state_dict(checkpoint['network'])
            self.target_network.load_state_dict(checkpoint['network'])
            self.optimizer.load_state_dict(checkpoint['optimizer'])

    def get_action(self, state, training=True):
        # 네트워크 모드 설정
        self.network.trian(training)
        epsilon = self.epsilon if training else epsilon_eval

        # 랜덤하게 행동 결정
        if epsilon > random.random():
            action = np.random.randint(0, action_size, size=(state.shape[0], 1))
        
        # 네트워크 연산에 따라 행동 결정
        else:
            q = self.network(torch.FloatTensor(state).to(device))
            action = torch.argmax(q, axis=1, keepdim=True).data.cpu().numpy()
        return action

    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self):
        batch = random.sample(self.memory, batch_size)
        state = np.stack([b[0] for b in batch], axis=0)
        action = np.stack([b[1] for b in batch], axis=0)
        reward = np.stack([b[2] for b in batch], axis=0)
        next_state = np.stack([b[3] for b in batch], axis=0)
        done = np.stack([b[4] for b in batch], axis=0)

        state, action, reward, next_state, done = map(lambda x:torch.FloatTensor(x).to(device), [state, action, reward, next_state, done])

        eye = torch.eye(action_size).to(device)
        one_hot_action = eye[action.view(-1).long()]
        q = (self.network(state) * one_hot_action).sum(1, keepdims=True)

        with torch.no_grad():
            next_q = self.target_network(next_state)
            target_q = reward + next_q.max(1, keepdims=True).values * ((1-done) * discount_factor)

            loss = F.smooth_l1_loss(q, target_q)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            self.epsilon = max(epsilon_min, self.epsilon - epsilon_delta)

            return loss.item()
    
    def update_target(self):
        self.target_network.load_state_dict(self.network.state_dict())

    def save_model(self):
        print(f'... save model to {save_path}/ckpt ...')
        torch.save({'network' : self.network.state_dict(), 'optimizer' : self.optimizer.state_dict()}, save_path+'/ckpt')

    def writer_summray(self, score, loss, epsilon, step):
        self.writer.add_scalar('run/score', score, step)
        self.writer.add_scalar('model/loss', loss, step)
        self.writer.add_scalar('model/epsilon', epsilon, step)



In [31]:
engine_configuration_channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name=env_name, side_channels=[
                       engine_configuration_channel])
env.reset()




UnityTimeOutException: The Unity environment took too long to respond. Make sure that :
	 The environment does not need user interaction to launch
	 The Agents' Behavior Parameters > Behavior Type is set to "Default"
	 The environment and the Python interface have compatible versions.

NameError: name 'env_name' is not defined