In [11]:
import pandas as pd
import numpy as np
import unicodedata
import string
import re
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from collections import Counter
import pickle
import random
import pdb
from torch.utils.data import DataLoader
import logging
import itertools
import unicodedata
import string
import re
import random
import argparse
from torch import optim
import time
import os
from torch.utils.data import BatchSampler
from torch.utils.data import SequentialSampler
from torch.utils.data import Sampler
from torch.optim.lr_scheduler import ReduceLROnPlateau
import matplotlib.pyplot as plt
import math
import copy
import seaborn
import datetime
from logger import Logger

from torch.optim import RMSprop
import shutil
from statistics import mean

from gym_wrappers import MainGymWrapper
import gym

In [111]:
class ConvolutionalNeuralNetwork(nn.Module):
    def __init__(self, input_shape, action_space):
        super(ConvolutionalNeuralNetwork, self).__init__()
        self.input_shape = input_shape
        self.action_space = action_space
        self.conv1 = nn.Conv2d(in_channels=input_shape[0], out_channels=32, kernel_size=8, stride=(4,4))
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64,kernel_size=4,stride=(2,2))
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3)
        
        o_size = 3136
        self.linear1 = nn.Linear(in_features=o_size, out_features=512)
        self.linear2 = nn.Linear(in_features=512, out_features=self.action_space)
        
    def forward(self, x, batch_size=None):
        x = torch.from_numpy(x).float()
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        out = F.relu(self.conv3(out))

        # out = (bs, nc, x, y)
        if batch_size==None:
            batch_size = out.size(0)
#         print(out.size())
        out_flat = out.view(batch_size, -1)
        
        out_flat = F.relu(self.linear1(out_flat))
        
        out_flat = self.linear2(out_flat)
        return out_flat

In [112]:
game_name = "Breakout-v0"
game_mode = 'ddqn_training'
render = False
total_step_limit = 5000000
total_run_limit = None
clip = True

FRAMES_IN_OBSERVATION = 4
FRAME_SIZE = 84
INPUT_SHAPE = (FRAMES_IN_OBSERVATION, FRAME_SIZE, FRAME_SIZE)

In [113]:
env_name = game_name + "Deterministic-v4" 
game_name, env_name

('Breakout-v0', 'Breakout-v0Deterministic-v4')

In [114]:
env = MainGymWrapper.wrap(gym.make(game_name))

In [115]:
GAMMA = 0.99
MEMORY_SIZE = 900000
BATCH_SIZE = 32
TRAINING_FREQUENCY = 4
TARGET_NETWORK_UPDATE_FREQUENCY = 40000
MODEL_PERSISTENCE_UPDATE_FREQUENCY = 10000
REPLAY_START_SIZE = 50000

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.1
EXPLORATION_TEST = 0.02
EXPLORATION_STEPS = 850000
EXPLORATION_DECAY = (EXPLORATION_MAX-EXPLORATION_MIN)/EXPLORATION_STEPS

In [130]:
class DDQNTrainer:
    def __init__(self, game_name, input_shape, action_space):
        
        self.input_shape = input_shape
        self.game_name = game_name
        self.action_space = action_space
        
        self.game_mode = "DDQN training"
        self.model_path = "./output/neural_nets/" + game_name + "/ddqn/" + self._get_date() + "/model.h5"
        self.logger_path = "./output/logs/" + game_name + "/ddqn/training/" + self._get_date() + "/"
        self.logger = Logger(self.game_name + " " + self.game_mode, self.logger_path)
        
        if os.path.exists(os.path.dirname(self.model_path)):
            shutil.rmtree(os.path.dirname(self.model_path), ignore_errors=True)
        os.makedirs(os.path.dirname(self.model_path))

        self.ddqn = ConvolutionalNeuralNetwork(self.input_shape, action_space)
        
        self.criteria = nn.CrossEntropyLoss()
        self.optimizer = RMSprop(self.ddqn.parameters(), lr=0.00025, alpha = 0.95, eps=0.01)
        
        if os.path.isfile(self.model_path):
            self.ddqn.load_weights(self.model_path)
        
        self.ddqn_target = ConvolutionalNeuralNetwork(self.input_shape, action_space)
#         .model
        self._reset_target_network()
        self.epsilon = EXPLORATION_MAX
        self.memory = []
    
    def _save_model(self):
        self.ddqn.save_weights(self.model_path)
        
    def _get_date(self):
        return str(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M'))
        
    def save_run(self, score, step, run):
        self.logger.add_score(score)
        self.logger.add_step(step)
        self.logger.add_run(run)

    def move(self, state):
        if np.random.rand() < self.epsilon or len(self.memory) < REPLAY_START_SIZE:
            return random.randrange(self.action_space)
        
        q_values = self.ddqn(np.expand_dims(np.asarray(state).astype(np.float64), axis=0), batch_size=1)
        return np.argmax(q_values[0])

    def remember(self, current_state, action, reward, next_state, terminal):
        self.memory.append({"current_state": current_state,
                            "action": action,
                            "reward": reward,
                            "next_state": next_state,
                            "terminal": terminal})
        if len(self.memory) > MEMORY_SIZE:
            self.memory.pop(0)

    def step_update(self, total_step):
        if len(self.memory) < REPLAY_START_SIZE:
            return

        if total_step % TRAINING_FREQUENCY == 0:
            loss, accuracy, average_max_q = self._train()
            self.logger.add_loss(loss)
            self.logger.add_accuracy(accuracy)
            self.logger.add_q(average_max_q)

        self._update_epsilon()

        if total_step % MODEL_PERSISTENCE_UPDATE_FREQUENCY == 0:
            self._save_model()

        if total_step % TARGET_NETWORK_UPDATE_FREQUENCY == 0:
            self._reset_target_network()
            print('{{"metric": "epsilon", "value": {}}}'.format(self.epsilon))
            print('{{"metric": "total_step", "value": {}}}'.format(total_step))

    def _train(self):
        
        self.optimizer.zero_grad()
        batch = np.asarray(random.sample(self.memory, BATCH_SIZE))
        if len(batch) < BATCH_SIZE:
            return

        current_states = []
        q_values = []
        max_q_values = []

        for entry in batch:
            current_state = np.expand_dims(np.asarray(entry["current_state"]).astype(np.float64), axis=0)
            current_states.append(current_state)
            next_state = np.expand_dims(np.asarray(entry["next_state"]).astype(np.float64), axis=0)
            
            next_state_prediction = self.ddqn_target(next_state).detach().numpy().ravel()
            
            next_q_value = np.max(next_state_prediction)
            q = list(self.ddqn(current_state)[0].detach().numpy())
            
            if entry["terminal"]:
                q[entry["action"]] = entry["reward"]
            else:
                q[entry["action"]] = entry["reward"] + GAMMA * next_q_value
            q_values.append(q)
            max_q_values.append(np.max(q))
        
        model_out = self.ddqn(np.asarray(current_states).squeeze())
        
#         fit = self.ddqn.fit(np.asarray(current_states).squeeze(),
#                             np.asarray(q_values).squeeze(),
#                             batch_size=BATCH_SIZE,
#                             verbose=0)

        #TODO : Check proper input to loss function
        loss = self.criteria(model_out, q_values)
        loss.backward()
        self.optimizer.step()
    

#         loss = fit.history["loss"][0]
#         accuracy = fit.history["acc"][0]
        return loss, accuracy, mean(max_q_values)

    def _update_epsilon(self):
        self.epsilon -= EXPLORATION_DECAY
        self.epsilon = max(EXPLORATION_MIN, self.epsilon)

    def _reset_target_network(self):
        self.ddqn_target.load_state_dict(self.ddqn.state_dict())

In [131]:
def main_loop(game_model, env, render, total_step_limit, total_run_limit, clip):
    run = 0
    total_step = 0
    while True:
        if total_run_limit is not None and run >= total_run_limit:
            print("Reached total run limit of: " + str(total_run_limit))
            exit(0)

        run += 1
        current_state = env.reset()
        step = 0
        score = 0
        while True:
            if total_step >= total_step_limit:
                print("Reached total step limit of: " + str(total_step_limit))
                exit(0)
            total_step += 1
            step += 1

            if render:
                env.render()

            action = game_model.move(current_state)
            next_state, reward, terminal, info = env.step(action)
            if clip:
                np.sign(reward)
            score += reward
            game_model.remember(current_state, action, reward, next_state, terminal)
            current_state = next_state

            game_model.step_update(total_step)

            if terminal:
                game_model.save_run(score, step, run)
                break

In [132]:
game_model = DDQNTrainer(game_mode, INPUT_SHAPE, env.action_space.n)
main_loop(game_model, env, render, total_step_limit, total_run_limit, clip)

score: (min: 0.0, avg: 1.4, max: 3.0
{"metric": "score", "value": 1.4}
step: (min: 174, avg: 252.4, max: 366
{"metric": "step", "value": 252.4}
{"metric": "run", "value": 10}
score: (min: 0.0, avg: 0.9, max: 4.0
{"metric": "score", "value": 0.9}
step: (min: 168, avg: 220.8, max: 342
{"metric": "step", "value": 220.8}
{"metric": "run", "value": 20}
score: (min: 0.0, avg: 1.5, max: 4.0
{"metric": "score", "value": 1.5}
step: (min: 161, avg: 251.9, max: 385
{"metric": "step", "value": 251.9}
{"metric": "run", "value": 30}
score: (min: 0.0, avg: 0.8, max: 2.0
{"metric": "score", "value": 0.8}
step: (min: 167, avg: 221.9, max: 298
{"metric": "step", "value": 221.9}
{"metric": "run", "value": 40}
score: (min: 0.0, avg: 1.8, max: 4.0
{"metric": "score", "value": 1.8}
step: (min: 164, avg: 264.9, max: 397
{"metric": "step", "value": 264.9}
{"metric": "run", "value": 50}
score: (min: 0.0, avg: 1.4, max: 4.0
{"metric": "score", "value": 1.4}
step: (min: 169, avg: 252.3, max: 401
{"metric": "step

AttributeError: 'list' object has no attribute 'size'