In [None]:
# torch imports
import torch
import torch.nn as nn
import torch.optim as optim

import numpy as np
import gym

from google.colab import drive
drive.mount('/content/gdrive')

! pip install tensorboardX
# summary writer
from tensorboardX import SummaryWriter

from collections import namedtuple
from collections import deque

import time

# Width of the first hidden layer of the Q-network.
LAYER_SIZE = 50

# `is_available` has to be *called*: the bare function object is always truthy,
# which would select "cuda" even on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint file and video directory on the mounted Google Drive.
model_save_name = 'DQN_Cartpole.pt'
path = f"/content/gdrive/MyDrive/{model_save_name}"
video_path = "/content/gdrive/MyDrive/Video"

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network mapping an observation vector to one value per action.

    The stack is Linear -> LeakyReLU repeated over the hidden widths
    (LAYER_SIZE, 30, 40), followed by a final Linear producing `output_size`
    values. The layers live in `self.nn`, so state-dict keys are `nn.<index>.*`.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        widths = [input_size, LAYER_SIZE, 30, 40]
        stack = []
        # One Linear + activation pair per consecutive pair of widths.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.LeakyReLU())
        # Output head: no activation, raw values per action.
        stack.append(nn.Linear(widths[-1], output_size))
        self.nn = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the layer stack and return the result."""
        return self.nn(x)

In [None]:
# ---- Training configuration ----

ENV_NAME = "CartPole-v1"

# Discount factor applied to the bootstrapped next-state value in the loss.
GAMMA = 1

# Replay buffer: capacity, number of stored transitions required before
# training starts, and minibatch size drawn per optimisation step.
REPLAY_SIZE = 3000
REPLAY_START_SIZE = 3000
BATCH_SIZE = 200

# Optimiser learning rate.
LEARNING_RATE = 1e-4

# Number of frames between copies of the online weights into the target net.
SYNC_TARGET_FRAMES = 2000

# Epsilon-greedy exploration schedule.
EPSILON_START = 0.3
EPSILON_FINAL = 0.05
EPSILON_DECAY_LAST_FRAME = 100_000

# NOTE(review): not read by any code visible in this notebook - confirm before removing.
MEAN_REWARD_BOUND = 19.5

# One replay-buffer record: a single environment transition.
Experience = namedtuple(
    'Experience',
    ['state', 'action', 'reward', 'is_done', 'new_state'],
)

class ExperienceBuffer:
    """Bounded FIFO store of transitions with uniform sampling without replacement."""

    def __init__(self, capacity):
        # A deque with `maxlen` discards its oldest entry once it is full.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Add one transition at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions and return them column-wise.

        Returns a 5-tuple of NumPy arrays
        (states, actions, rewards, dones, new_states); rewards are float32 and
        dones are uint8. `np.random.choice(..., replace=False)` raises
        ValueError when `batch_size` exceeds the number of stored transitions.
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        chosen = [self.buffer[position] for position in picked]
        # Transpose the list of 5-field records into five per-field tuples.
        states, actions, rewards, dones, new_states = zip(*chosen)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(new_states),
        )

In [None]:
class Agent:
    """Epsilon-greedy actor that fills a replay buffer and computes the DQN loss.

    The environment is used through the classic Gym interface visible in this
    class: `reset()` returns the observation and `step(action)` returns a
    4-tuple `(new_state, reward, is_done, info)`.
    """

    def __init__(self, env, exp_buffer):
        """Store the environment and replay buffer, then start the first episode."""
        self.env = env
        self.buffer = exp_buffer
        self._reset()

    def _reset(self):
        """Begin a new episode and clear the running episode return."""
        # Use the agent's own environment, not whatever global happens to be
        # called `env` in the notebook.
        self.state = self.env.reset()
        self.total_reward = 0.0

    def play_step(self, net, epsilon, device="cpu"):
        """Take one environment step and record the transition.

        With probability `epsilon` a random action is sampled; otherwise the
        action with the highest value predicted by `net` is taken.

        Returns:
            The total reward of the episode if this step ended it, else None.
        """
        done_reward = None
        if np.random.random() < epsilon:
            action = self.env.action_space.sample()
        else:
            # np.asarray instead of np.array(..., copy=False): building an array
            # from a list always copies, which NumPy 2 rejects for copy=False.
            state_a = np.asarray([self.state])
            state_v = torch.tensor(state_a, dtype=torch.float32).to(device)
            # Action selection needs no gradients; skip building the graph.
            with torch.no_grad():
                q_vals_v = net(state_v)
            _, acts_v = torch.max(q_vals_v, dim=1)
            action = int(acts_v.item())

        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward
        exp = Experience(self.state, action, reward, is_done, new_state)
        self.buffer.append(exp)
        self.state = new_state
        if is_done:
            done_reward = self.total_reward
            self._reset()
        return done_reward

    def calc_loss(self, batch, net, tgt_net, device='cpu', gamma=None):
        """Mean-squared TD error of `net` against targets from `tgt_net`.

        Args:
            batch: (states, actions, rewards, dones, new_states) sequences of
                equal length, as returned by the replay buffer's `sample`.
            net: online network whose predictions receive gradients.
            tgt_net: target network; only evaluated, never differentiated.
            device: device the batch tensors are moved to.
            gamma: discount factor; defaults to the module-level GAMMA.

        Returns:
            Scalar loss tensor: MSE between Q(s, a) and r + gamma * max_a' Q_tgt(s', a'),
            with the bootstrap term zeroed for terminal transitions.
        """
        if gamma is None:
            gamma = GAMMA
        states, actions, rewards, dones, new_states = batch

        # Move the batch to the training device.
        states_v = torch.tensor(states, dtype=torch.float32).to(device)
        actions_v = torch.tensor(actions, dtype=torch.int64).to(device)
        rewards_v = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states_v = torch.tensor(new_states, dtype=torch.float32).to(device)
        # Boolean mask: indexing with uint8 masks is deprecated in PyTorch.
        done_mask = torch.tensor(dones, dtype=torch.bool).to(device)

        # Q-value of the action that was actually taken in each state.
        state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

        # Targets are constants with respect to the optimisation. The previous
        # bare `next_state_values.detach()` discarded its result and so did
        # nothing; evaluating under no_grad keeps gradients out of tgt_net.
        with torch.no_grad():
            next_state_values = tgt_net(next_states_v).max(1)[0]
            # No bootstrapping from states that ended the episode.
            next_state_values[done_mask] = 0.0

        expected_state_action_values = next_state_values * gamma + rewards_v
        return nn.MSELoss()(state_action_values, expected_state_action_values)

In [None]:
if __name__ == "__main__":
    # Training entry point: build the online/target networks, optionally resume
    # from the checkpoint at `path`, then run the DQN loop until the mean
    # episode reward reaches the solved bound or the cell is interrupted.
    import os  # local import: only needed for the checkpoint existence check

    env = gym.make(ENV_NAME)
    obs_size = env.observation_space.shape[0]
    n_actions = env.action_space.n
    net = DQN(obs_size, n_actions).to(device)
    tgt_net = DQN(obs_size, n_actions).to(device)

    # Resume from a saved checkpoint when one exists; otherwise start from the
    # freshly initialised weights instead of crashing. map_location lets a
    # checkpoint saved on GPU load on a CPU-only runtime.
    if os.path.exists(path):
        net.load_state_dict(torch.load(path, map_location=device))
    else:
        print("No checkpoint found at %s, training from scratch" % path)
    # Both networks start from identical weights.
    tgt_net.load_state_dict(net.state_dict())

    # CartPole-v1 episodes are cut off after 500 steps at +1 reward each, so
    # the previous bound of 800 could never be reached and the loop had no way
    # to finish. Use the environment's registered threshold (475 for
    # CartPole-v1), falling back to that value if the spec does not carry one.
    solved_reward = getattr(env.spec, "reward_threshold", None)
    if solved_reward is None:
        solved_reward = 475.0

    writer = SummaryWriter(comment="-" + ENV_NAME)
    print(net)
    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    epsilon = EPSILON_START

    optimiser = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    total_rewards = []
    frame_idx = 0
    ts_frame = 0
    ts = time.time()
    best_mean_reward = None

    try:
        while True:
            frame_idx += 1
            # Epsilon drops by 1 / EPSILON_DECAY_LAST_FRAME per frame, floored
            # at EPSILON_FINAL.
            epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
            reward = agent.play_step(net, epsilon, device)

            # Only collect experience until the buffer holds enough transitions;
            # episodes finishing during this warm-up are not logged.
            if len(buffer) < REPLAY_START_SIZE:
                continue

            if frame_idx % SYNC_TARGET_FRAMES == 0:
                tgt_net.load_state_dict(net.state_dict())

            # `reward` is the episode return when an episode just ended, else None.
            if reward is not None:
                total_rewards.append(reward)
                speed = (frame_idx - ts_frame) / (time.time() - ts)
                ts_frame = frame_idx
                ts = time.time()
                mean_reward = np.mean(total_rewards[-50:])
                print("%d: done %d games, mean reward %.3f, eps %.2f, speed %.2f f/s" % (
                    frame_idx, len(total_rewards), mean_reward, epsilon,
                    speed
                ))

                # TensorBoard scalars. The "reward_100" tag is kept for
                # continuity with earlier runs; the value is a 50-episode mean.
                writer.add_scalar("epsilon", epsilon, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)
                writer.add_scalar("reward_100", mean_reward, frame_idx)
                writer.add_scalar("reward", reward, frame_idx)

                # Checkpoint whenever the running mean improves.
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(net.state_dict(), path)
                    if best_mean_reward is not None:
                        print("Best mean reward updated %.3f -> %.3f, model saved" % (best_mean_reward, mean_reward))
                    best_mean_reward = mean_reward
                if mean_reward > solved_reward:
                    print("Solved in %d frames!" % frame_idx)
                    break

            # One gradient step on a minibatch sampled from the replay buffer.
            optimiser.zero_grad()
            batch = buffer.sample(BATCH_SIZE)
            loss_t = agent.calc_loss(batch, net, tgt_net, device)
            loss_t.backward()
            optimiser.step()
    finally:
        # Flush and close the event file even when the cell is interrupted.
        writer.close()

DQN(
  (nn): Sequential(
    (0): Linear(in_features=4, out_features=50, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=50, out_features=30, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=30, out_features=40, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=40, out_features=2, bias=True)
  )
)




3209: done 1 games, mean reward 324.000, eps 0.27, speed 1687.93 f/s
3224: done 2 games, mean reward 169.500, eps 0.27, speed 220.04 f/s
3570: done 3 games, mean reward 228.333, eps 0.26, speed 238.41 f/s
3983: done 4 games, mean reward 274.500, eps 0.26, speed 239.95 f/s
4100: done 5 games, mean reward 243.000, eps 0.26, speed 228.36 f/s
4113: done 6 games, mean reward 204.667, eps 0.26, speed 226.26 f/s
4495: done 7 games, mean reward 230.000, eps 0.26, speed 237.23 f/s
4827: done 8 games, mean reward 242.750, eps 0.25, speed 237.08 f/s
5175: done 9 games, mean reward 254.444, eps 0.25, speed 236.31 f/s
5519: done 10 games, mean reward 263.400, eps 0.24, speed 234.99 f/s
5875: done 11 games, mean reward 271.818, eps 0.24, speed 237.93 f/s
6211: done 12 games, mean reward 277.167, eps 0.24, speed 232.94 f/s
6520: done 13 games, mean reward 279.615, eps 0.23, speed 238.00 f/s
6814: done 14 games, mean reward 280.643, eps 0.23, speed 238.04 f/s
6936: done 15 games, mean reward 270.067, 

KeyboardInterrupt: ignored