In [121]:
import torch
import gym
import torch.nn as nn
from collections import namedtuple
import torch.nn.functional as F
import numpy as np
import random
import matplotlib.pyplot as plt
import os
from Make_Env import make_env
# Allow more than one OpenMP runtime to be loaded into the process; this is the
# usual workaround for the "libiomp5 already initialized" abort seen when
# several of the packages above ship their own copy (unsupported by Intel).
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Expose only the GPU with index 1 to CUDA.
# NOTE(review): on a machine with a single GPU (index 0) this presumably hides
# it, so torch.cuda.is_available() would be False — confirm the intended device.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'


In [122]:
# One replay-memory record. The field order is the positional order expected
# by `buffer.store(state, action, reward, done, next_state)`.
transition = namedtuple(
    "transition",
    ["state", "action", "reward", "done", "next_state"],
)

In [123]:
class buffer(object):
    """Fixed-capacity replay memory that overwrites its oldest entry once full."""

    def __init__(self, size):
        self.size = size    # maximum number of stored transitions
        self.buffer = []    # backing list; grows until it holds `size` entries
        self.pos = 0        # index of the slot the next store() call writes to

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

    def store(self, *args):
        """Build a `transition` from `args` and write it at the current slot."""
        # Grow the list by one placeholder until capacity is reached; after
        # that the write below simply replaces an existing entry.
        if self.size > len(self.buffer):
            self.buffer.append(None)
        slot = self.pos
        self.buffer[slot] = transition(*args)
        # Advance and wrap around so the next write hits the oldest entry.
        self.pos = (slot + 1) % self.size

    def sample(self, batch):
        """Return `batch` distinct stored entries chosen uniformly at random."""
        picked = random.sample(self.buffer, batch)
        return picked

In [124]:
# Shape trace for an 84 x 84 input with 4 channels:
#   input            : 84 * 84 * 4
#   conv1 (k=8, s=4) : 20 * 20 * 32
#   conv2 (k=4, s=2) :  9 *  9 * 64
#   conv3 (k=3, s=1) :  7 *  7 * 64
class Net(nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        input_n: number of input channels.
        action_n: number of outputs (one value per action).

    The first linear layer is sized for a 64 x 7 x 7 feature map, i.e. for
    84 x 84 inputs (see the shape trace above).
    """

    def __init__(self, input_n = 4, action_n = 4):
        super().__init__()
        # (attribute name, in_channels, out_channels, kernel_size, stride).
        # Attribute names and creation order are kept stable because they
        # determine the state_dict keys used by saved checkpoints.
        conv_specs = (
            ("conv1", input_n, 32, 8, 4),
            ("conv2", 32, 64, 4, 2),
            ("conv3", 64, 64, 3, 1),
        )
        for attr, c_in, c_out, kernel, step in conv_specs:
            block = nn.Sequential(
                nn.Conv2d(in_channels = c_in,
                          out_channels = c_out,
                          kernel_size = kernel,
                          stride = step),
                nn.ReLU(),
            )
            setattr(self, attr, block)
        self.fc = self._dense(64 * 7 * 7, 512)
        self.output = self._dense(512, action_n)

    @staticmethod
    def _dense(n_in, n_out):
        """Linear layer with N(0, 1) weights and every bias set to 0.1."""
        layer = nn.Linear(n_in, n_out)
        nn.init.normal_(layer.weight, mean=0, std=1)
        nn.init.constant_(layer.bias, 0.1)
        return layer

    def forward(self, s):
        """Return one value per action for each sample in the batch `s`."""
        # Rescale the input; for values in [0, 255] this lands in [-1, 1).
        # NOTE(review): assumes `s` holds raw 8-bit pixel values — confirm.
        scaled = s.float() / 128 - 1
        features = self.conv3(self.conv2(self.conv1(scaled)))
        # Flatten everything except the batch dimension.
        flat = features.reshape(features.size(0), -1)
        hidden = F.leaky_relu(self.fc(flat))
        return self.output(hidden)

In [125]:
class Agent(object):
    """DQN agent: online network, target network, replay buffer and training loop.

    Args:
        lr: learning rate of the Adam optimiser for the online network.
        buffer_size: capacity of the replay buffer; learning only starts once
            the buffer is completely full.
        T: number of learning steps between target-network synchronisations.
        batch: number of transitions sampled per learning step.
        epsilon: initial exploration probability.
        gamma: discount factor.
        load: when True, restore both networks from the checkpoint files.
    """

    # Checkpoint files written by train() and read back when load=True.
    EVAL_CKPT = 'dqn_evaluate_net.pth'
    TARGET_CKPT = 'dqn_target_net.pth'

    def __init__(self, lr = 0.00025, buffer_size = 100000, T = 100, batch = 32, epsilon = 1.0, gamma = 0.99, load = False):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(self.device)
        self.buffer = buffer(buffer_size)
        self.t = 0          # episodes started so far; drives the epsilon schedule
        self.T_count = 0    # learning steps performed so far
        self.T = T
        self.epsilon = epsilon
        self.batch = batch
        self.gamma = gamma
        self.eval_net = Net().to(self.device)
        self.target_net = Net().to(self.device)
        self.loss_function = nn.SmoothL1Loss()
        self.flag = 0       # 0: filling buffer, 1: learning, 2: epsilon below 0.05
        self.decay = 0      # 1 once epsilon has been refreshed in the current episode
        self.opt = torch.optim.Adam(self.eval_net.parameters(), lr = lr)
        if load:
            # map_location lets checkpoints saved on a GPU load on a CPU-only host.
            self.eval_net.load_state_dict(torch.load(self.EVAL_CKPT, map_location=self.device))
            self.target_net.load_state_dict(torch.load(self.TARGET_CKPT, map_location=self.device))
        else:
            # Start the target network as an exact copy of the online network
            # rather than as an unrelated random initialisation.
            self.target_net.load_state_dict(self.eval_net.state_dict())

    def choose_action(self, s):
        """Pick an action epsilon-greedily for the single-state batch `s`.

        Returns:
            A (1, 1) long tensor on `self.device` holding the action index.
        """
        s = s.to(self.device)
        # Refresh epsilon at most once per episode (train() clears `decay`).
        if self.flag == 1 and self.decay == 0:
            self.epsilon = np.exp(-self.t / 100000)
            self.decay = 1
        # NOTE(review): once flag becomes 2 the refresh above never runs again,
        # so epsilon stays just under 0.05 and the 0.005 floor below is only
        # reachable through the constructor argument — confirm this is intended.
        if self.epsilon < 0.05 and self.flag == 1:
            print('......ready......')
            self.flag = 2
        if self.epsilon < 0.005:
            self.epsilon = 0.005
        if np.random.uniform() < self.epsilon:
            # 4 matches the default `action_n` of Net.
            return torch.tensor([[random.randrange(4)]], device=self.device, dtype=torch.long)
        # Greedy action; no autograd graph is needed for action selection.
        with torch.no_grad():
            return self.eval_net(s).max(1)[1].view(1, 1)

    def get_s(self, s):
        """Convert an H x W x C observation into a (1, C, H, W) tensor."""
        s = np.array(s)
        s = s.transpose((2, 0, 1))
        s = torch.from_numpy(s)
        return s.unsqueeze(0)

    def learn(self):
        """Run one optimisation step on a batch sampled from the replay buffer."""
        if self.flag == 0:
            print("......learning......")
            self.flag = 1
        transitions = self.buffer.sample(self.batch)
        # Fields arrive in storage order: state, action, reward, done, next_state.
        states, actions, rewards, dones, next_states = zip(*transitions)
        # Build each batch tensor in one call with an explicit dtype so the
        # target keeps the network's float32 precision whatever the env returns.
        b_a = torch.tensor([int(a) for a in actions], dtype=torch.long, device=self.device).unsqueeze(1)
        b_r = torch.tensor([float(r) for r in rewards], dtype=torch.float32, device=self.device)
        b_d = torch.tensor([bool(d) for d in dones], dtype=torch.bool, device=self.device)
        b_s = torch.cat(states).to(self.device)
        b_s_ = torch.cat(next_states).to(self.device)
        # Value of the action actually taken in each sampled state.
        Q_s = self.eval_net(b_s).gather(1, b_a).squeeze(-1)
        with torch.no_grad():
            Q_s_ = self.target_net(b_s_).max(1)[0]
            # Terminal transitions do not bootstrap from the next state.
            Q_target = torch.where(b_d, b_r, b_r + self.gamma * Q_s_)
        # Prediction first, target second, so gradients flow through `input`.
        loss = self.loss_function(Q_s, Q_target)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()

    def _save_checkpoint(self):
        """Write both networks' weights to the checkpoint files."""
        torch.save(self.eval_net.state_dict(), self.EVAL_CKPT)
        torch.save(self.target_net.state_dict(), self.TARGET_CKPT)

    @staticmethod
    def _plot_series(values, ylabel, path, **plot_kwargs):
        """Plot `values`, save the figure to `path`, then display it."""
        plt.plot(values, **plot_kwargs)
        plt.ylabel(ylabel)
        # Save before show(): show() may release the figure, after which
        # savefig() would write an empty canvas.
        plt.savefig(path)
        plt.show()
        plt.close()

    def train(self, env, epoch = 400000):
        """Interact with `env` for up to `epoch` episodes, learning as it goes.

        Training stops early once a recorded score exceeds 100. Checkpoints
        are written every 200 episodes and once more when training ends; two
        score plots are saved at the end.
        """
        score = []
        max_score = []
        m = 0
        epi_score = 0
        reward = 0
        for i in range(epoch):
            s = self.get_s(env.reset())
            self.t += 1
            self.decay = 0
            while True:
                # Hand the environment and the buffer a plain int action.
                action = self.choose_action(s).item()
                s_, r, done, info = env.step(action)
                epi_score += r
                s_ = self.get_s(s_)
                self.buffer.store(s, action, r, done, s_)
                # Learning only starts once the replay buffer is full.
                if len(self.buffer) >= self.buffer.size:
                    self.learn()
                    self.T_count += 1
                    if self.T_count % self.T == 0:
                        self.target_net.load_state_dict(self.eval_net.state_dict())
                s = s_
                if done:
                    break
            # The running score is only recorded and reset every fifth episode.
            # NOTE(review): presumably five env episodes make up one 5-life game
            # when make_env ends an episode per lost life — confirm.
            if i % 5 == 0:
                if epi_score > m:
                    m = epi_score
                score.append(epi_score)
                reward += epi_score
                epi_score = 0
            if m > 100:
                break
            if i % 100 == 0:
                # 100 episodes contribute 100 / 5 = 20 recorded scores.
                print('episode: {} average score : {} max : {}'.format(i, reward / 20, m))
                max_score.append(m)
                m = 0
                reward = 0
            if i % 200 == 0:
                self._save_checkpoint()

        # Persist the final weights so an early stop does not lose progress
        # made since the last periodic checkpoint.
        self._save_checkpoint()
        self._plot_series(max_score, 'Max', 'dqn_max_score.jpg')
        self._plot_series(score, 'episode_score', 'dqn_score.jpg', marker = 'o', markevery = 10)
            


In [None]:
# Seed every random source the agent draws from, not just the environment,
# so a Restart & Run All is as repeatable as the libraries allow.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = make_env(gym.make('Breakout-v4'))
env.seed(SEED)

# Resume from the saved weights only when both checkpoint files exist, so a
# first run on a clean checkout trains from scratch instead of crashing.
checkpoint_files = ('dqn_evaluate_net.pth', 'dqn_target_net.pth')
agent = Agent(load = all(os.path.exists(path) for path in checkpoint_files))
agent.train(env)