In [1]:
import os
import random
from collections import deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class DQN(nn.Module):
    """Small fully connected Q-network: state -> one Q-value per action.

    Architecture: Linear(state_size, 24) -> ReLU -> Linear(24, 24) -> ReLU
    -> Linear(24, action_size).

    Args:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions (width of the output layer).
    """

    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        # Neural Net Layers
        self.fc1 = nn.Linear(state_size, 24)
        self.fc2 = nn.Linear(24, 24)
        self.out = nn.Linear(24, action_size)
        # Start the output layer with tiny uniform weights so that the initial
        # Q-values are close to zero for every action.
        torch.nn.init.uniform_(self.out.weight, -1e-3, 1e-3)

    @property
    def device(self):
        """Device the network's parameters currently live on.

        Derived from the parameters instead of being copied from a module
        level global, so the class works on a fresh kernel regardless of cell
        order and the value stays correct after ``.to(...)``.
        """
        return next(self.parameters()).device

    def forward(self, x):
        """Return Q-values for a batch of states.

        Args:
            x: Float tensor whose last dimension is ``state_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``action_size``.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

In [3]:
# Hyper-parameters
class DQNAgent:
    """DQN agent with an experience-replay buffer and a target network.

    Args:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions.
        device: ``torch.device`` on which the networks and the stored
            transitions are kept.
    """

    def __init__(self, state_size, action_size, device):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        # Hyper-parameters for learning
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01

        # Experience Replay
        self.batch_size = 64
        self.train_start = 1000
        self.buffer_length = 2000
        self.memory = deque(maxlen=self.buffer_length)

        # Neural Network Architecture
        self.model = DQN(self.state_size, self.action_size).to(self.device)
        self.target_model = DQN(self.state_size, self.action_size).to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Start with the target network identical to the online network.
        self.update_target_model()

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer.

        Args:
            state: Tensor of shape ``(1, state_size)`` (stored as given, moved
                to ``self.device``).
            action: Integer tensor of shape ``(1, 1)``.
            reward: Scalar reward.
            next_state: Array-like observation with ``state_size`` entries.
            done: Truthy if the episode ended with this transition.
        """
        # Go through a single float32 ndarray: building a tensor from a Python
        # list of ndarrays is very slow. ``torch.tensor`` always copies, so
        # the stored transition never aliases a buffer owned by the caller.
        next_state_t = torch.tensor(
            np.asarray(next_state, dtype=np.float32), device=self.device
        ).unsqueeze(0)
        self.memory.append((
            state.to(self.device),
            action.to(self.device),
            torch.tensor([float(reward)], dtype=torch.float32, device=self.device),
            next_state_t,
            torch.tensor([int(done)], dtype=torch.long, device=self.device),
        ))

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        Args:
            state: Float tensor of shape ``(1, state_size)``.

        Returns:
            Integer tensor of shape ``(1, 1)`` on ``self.device`` holding the
            chosen action index.
        """
        # Exploration: uniform random action, created on the same device as
        # the greedy branch so callers always get a consistent tensor.
        if np.random.rand() <= self.epsilon:
            return torch.tensor([[random.randrange(self.action_size)]],
                                dtype=torch.long, device=self.device)
        # Exploitation: no gradient is needed to pick an argmax.
        with torch.no_grad():
            return self.model(state).max(1)[1].view(1, 1)

    def train_model(self):
        """Run one gradient step on a mini-batch sampled from replay memory.

        Returns:
            The batch loss as a Python float, or ``0.0`` when the buffer does
            not yet hold enough transitions to train.
        """
        # Training Condition - Memory Size. Also require a full batch so that
        # random.sample cannot raise when train_start < batch_size.
        if len(self.memory) < max(self.train_start, self.batch_size):
            return 0.0

        # Decaying Exploration Ratio, clamped so it never undershoots the floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # Sampling from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones = zip(*mini_batch)

        states = torch.cat(batch_states)            # (B, state_size)
        actions = torch.cat(batch_actions)          # (B, 1)
        rewards = torch.cat(batch_rewards)          # (B,)
        next_states = torch.cat(batch_next_states)  # (B, state_size)
        dones = torch.cat(batch_dones)              # (B,)

        # squeeze(1) rather than squeeze(): keeps the batch dimension even
        # when batch_size == 1, so q and target always have the same shape.
        q = self.model(states).gather(1, actions).squeeze(1)
        with torch.no_grad():
            max_q = self.target_model(next_states).max(1)[0]
        # Bootstrap from the target network only for non-terminal transitions.
        not_done = 1.0 - dones.float()
        target = rewards + not_done * self.discount_factor * max_q
        loss = F.mse_loss(q, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

In [4]:
%matplotlib tk

ENV_NAME = 'CartPole-v1'
EPISODES = 1000
SAVE_DIR = './save_model'   # plots, weights and training curves are written here
SOLVED_SCORE = 400          # stop once the running average score exceeds this
# if gpu is to be used
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("DEVICE : ", device)


def _reset_env(env):
    """Reset ``env`` and return only the observation.

    Handles both reset conventions: a bare observation, or an
    ``(observation, info)`` tuple.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)``.

    Handles both step conventions: the 4-tuple ``(obs, reward, done, info)``
    and the 5-tuple ``(obs, reward, terminated, truncated, info)``.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = result
    return next_state, reward, bool(done)


def _plot_progress(fig, axes, episodes, scores, epsilons, losses, path):
    """Redraw the three training curves on ``axes`` and save ``fig`` to ``path``."""
    panels = ((scores, 'average score'), (epsilons, 'epsilon'), (losses, 'losses'))
    for ax, (values, label) in zip(axes, panels):
        # Clear first: otherwise every episode stacks another line on the axes.
        ax.clear()
        ax.plot(episodes, values, 'b')
        ax.set_xlabel('episode')
        ax.set_ylabel(label)
        # Explicit True: a bare grid() call toggles visibility on each call.
        ax.grid(True)
    axes[0].set_title('cartpole DQN TORCH')
    fig.savefig(path)


if __name__ == "__main__":
    env = gym.make(ENV_NAME)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    print('Env Name : ',ENV_NAME)
    print('States {}, Actions {}'
            .format(state_size, action_size))

    agent = DQNAgent(state_size, action_size, device)

    # Make sure the output directory exists before anything is saved into it.
    os.makedirs(SAVE_DIR, exist_ok=True)

    scores, episodes, epsilons, losses = [], [], [], []
    score_avg = 0

    end = False

    # One figure reused for the whole run; it is redrawn after every episode.
    fig, axes = plt.subplots(3, 1)

    try:
        for e in range(EPISODES):
            done = False
            score = 0
            loss_list = []

            state = _reset_env(env)

            while not done:
                #env.render()

                # Interact with env.
                # Convert via one float32 ndarray (fast path) and copy, since
                # this tensor is stored in the replay buffer.
                state = torch.tensor(np.asarray(state, dtype=np.float32),
                                     device=device).unsqueeze(0)
                action = agent.choose_action(state)
                next_state, reward, done = _step_env(env, action.item())
                agent.remember(state, action, reward, next_state, done)
                loss = agent.train_model()
                state = next_state

                score += reward
                loss_list.append(loss)

            # ---- end of episode ----
            # Sync the target network once per episode.
            agent.update_target_model()

            # Exponential moving average of the score (seeded by the first score).
            score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
            print('epi: {:3d} | score avg {:3.2f} | mem length: {:4d} | epsilon: {:.4f}'
                  .format(e, score_avg, len(agent.memory), agent.epsilon))

            episodes.append(e)
            scores.append(score_avg)
            epsilons.append(agent.epsilon)
            losses.append(np.mean(loss_list))
            _plot_progress(fig, axes, episodes, scores, epsilons, losses,
                           os.path.join(SAVE_DIR, 'cartpole_Tdqn.png'))

            if score_avg > SOLVED_SCORE:
                # Solved: persist the weights and the training curves, then stop.
                torch.save(agent.model.state_dict(), os.path.join(SAVE_DIR, 'cartpole_Tdqn'))
                np.save(os.path.join(SAVE_DIR, 'cartpole_Tdqn_epi'), episodes)
                np.save(os.path.join(SAVE_DIR, 'cartpole_Tdqn_score'), scores)
                np.save(os.path.join(SAVE_DIR, 'cartpole_Tdqn_loss'), losses)
                print("End")
                end = True
                break
    finally:
        # Release the environment however the loop ends: solved, episode
        # budget exhausted, or an exception / interrupt.
        env.close()

DEVICE :  cuda
Env Name :  CartPole-v1
States 4, Actions 2
epi:   0 | score avg 23.00 | mem length:   23 | epsilon: 1.0000
epi:   1 | score avg 22.00 | mem length:   36 | epsilon: 1.0000
epi:   2 | score avg 23.20 | mem length:   70 | epsilon: 1.0000
epi:   3 | score avg 24.18 | mem length:  103 | epsilon: 1.0000
epi:   4 | score avg 22.96 | mem length:  115 | epsilon: 1.0000
epi:   5 | score avg 22.97 | mem length:  138 | epsilon: 1.0000
epi:   6 | score avg 23.07 | mem length:  162 | epsilon: 1.0000
epi:   7 | score avg 23.36 | mem length:  188 | epsilon: 1.0000
epi:   8 | score avg 22.73 | mem length:  205 | epsilon: 1.0000
epi:   9 | score avg 21.55 | mem length:  216 | epsilon: 1.0000
epi:  10 | score avg 22.20 | mem length:  244 | epsilon: 1.0000
epi:  11 | score avg 24.38 | mem length:  288 | epsilon: 1.0000
epi:  12 | score avg 24.04 | mem length:  309 | epsilon: 1.0000
epi:  13 | score avg 22.64 | mem length:  319 | epsilon: 1.0000
epi:  14 | score avg 22.07 | mem length:  336

In [5]:
# import gym
# import collections
# import random

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim

# #Hyperparameters
# learning_rate = 0.0001
# gamma         = 0.999
# buffer_limit  = 2000
# batch_size    = 64

# class ReplayBuffer():
#     def __init__(self):
#         self.buffer = collections.deque(maxlen=buffer_limit)
    
#     def put(self, transition):
#         self.buffer.append(transition)
    
#     def sample(self, n):
#         mini_batch = random.sample(self.buffer, n)
#         s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
#         for transition in mini_batch:
#             s, a, r, s_prime, done_mask = transition
#             s_lst.append(s)
#             a_lst.append([a])
#             r_lst.append([r])
#             s_prime_lst.append(s_prime)
#             done_mask_lst.append([done_mask])

#         return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
#                torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
#                torch.tensor(done_mask_lst)
    
#     def size(self):
#         return len(self.buffer)

# class Qnet(nn.Module):
#     def __init__(self):
#         super(Qnet, self).__init__()
#         self.fc1 = nn.Linear(4, 24)
#         self.fc2 = nn.Linear(24, 24)
#         self.fc3 = nn.Linear(24, 2)

#     def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
#         x = self.fc3(x)
#         return x
      
#     def sample_action(self, obs, epsilon):
#         out = self.forward(obs)
#         coin = random.random()
#         if coin < epsilon:
#             return random.randint(0,1)
#         else : 
#             return out.argmax().item()
            
# def train(q, q_target, memory, optimizer):
#     for i in range(10):
#         s,a,r,s_prime,done_mask = memory.sample(batch_size)

#         q_out = q(s)
#         q_a = q_out.gather(1,a)
#         max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
#         target = r + gamma * max_q_prime * done_mask
#         loss = F.mse_loss(q_a, target)
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

# def main():
#     env = gym.make('CartPole-v1')
#     q = Qnet()
#     q_target = Qnet()
#     q_target.load_state_dict(q.state_dict())
#     memory = ReplayBuffer()

#     print_interval = 1
#     score = 0.0  
#     optimizer = optim.Adam(q.parameters(), lr=learning_rate)
#     score_avg = 0.0
#     for n_epi in range(10000):
#         epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
#         s = env.reset()
#         done = False

#         while not done:
#             a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
#             s_prime, r, done, info = env.step(a)
#             done_mask = 0.0 if done else 1.0
#             memory.put((s,a,r/100.0,s_prime, done_mask))
#             s = s_prime

#             score += r
#             score_avg = 0.9 * score_avg + 0.1 * score if score_avg != 0 else score
#             if done:
#                 break
            
#         if memory.size()>1000:
#             train(q, q_target, memory, optimizer)
#         if n_epi%print_interval==0 and n_epi!=0:
#             q_target.load_state_dict(q.state_dict())
#             print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
#                                                             n_epi, score/print_interval, memory.size(), epsilon*100))
#             score = 0.0
            
#         if score_avg > 400:
#             torch.save(q_target.state_dict(),'./save_model/cartpole_Tdqn')
#             break
#     env.close()


# if __name__ == '__main__':
#     main()