In [1]:
!nvidia-smi

/bin/sh: 1: nvidia-smi: not found


In [2]:
import warnings ; warnings.filterwarnings('ignore')
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from IPython.display import display
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from itertools import cycle, count
from textwrap import wrap

import matplotlib
import subprocess
import os.path
import tempfile
import random
import base64
import pprint
import glob
import time
import json
import sys
import gym
import io
import os
import gc

from gym import wrappers
from subprocess import check_output
from IPython.display import HTML

# Notebook-wide constants.
# NOTE(review): only SEEDS is referenced in the visible cells; the others are
# presumably consumed by training/reporting code that is not shown here.
LEAVE_PRINT_EVERY_N_SECS = 120
# Terminal escape sequence; going by the name it is used to erase the current line.
ERASE_LINE = '\x1b[2K'
EPS = 1e-6
# Calling BEEP() asks the shell to printf the bell character.
BEEP = lambda: os.system("printf '\a'")
# Relative path '../results' built with the platform's path separator.
RESULTS_DIR = os.path.join('..', 'results')
# Seeds iterated over by the experiment loop at the bottom of the notebook.
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline

In [3]:
# Global plotting configuration: pick the 'fivethirtyeight' style sheet, then
# override figure size and font sizes through matplotlib's rcParams.
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
pylab.rcParams.update(params)
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [4]:
torch.cuda.is_available()

False

In [5]:
def get_make_env_fn(**kargs):
    """Return an environment factory plus the keyword arguments captured for it.

    Args:
        **kargs: Arbitrary keyword arguments. They are not interpreted here;
            they are handed back unchanged so the caller can later invoke
            ``make_env_fn(**kargs)``.

    Returns:
        tuple: ``(make_env_fn, kargs)`` where ``make_env_fn`` builds a gym
        environment (see its own docstring) and ``kargs`` is the dict of
        keyword arguments passed to this function.
    """
    def make_env_fn(env_name, seed=None, render=None, record=False,
                    unwrapped=False, monitor_mode=None,
                    inner_wrappers=None, outer_wrappers=None):
        """Create a gym environment and apply the requested wrappers.

        Args:
            env_name: Id passed to ``gym.make``.
            seed: If not None, passed to ``env.seed``.
            render: If truthy, ``gym.make`` is first tried with ``render=render``.
            record: Value returned by the Monitor's ``video_callable`` for
                every episode index.
            unwrapped: If truthy, use ``env.unwrapped`` instead of ``env``.
            monitor_mode: If truthy, wrap the env in ``wrappers.Monitor`` with
                this mode, writing to a fresh temporary directory.
            inner_wrappers: Optional iterable of callables applied before the
                Monitor wrapper.
            outer_wrappers: Optional iterable of callables applied after the
                Monitor wrapper.

        Returns:
            The (possibly wrapped) environment.
        """
        env = None
        if render:
            # Best effort: fall back to the plain constructor if the
            # environment rejects the `render` keyword. Catch Exception only,
            # so KeyboardInterrupt / SystemExit still propagate.
            try:
                env = gym.make(env_name, render=render)
            except Exception:
                env = None
        if env is None:
            env = gym.make(env_name)
        if seed is not None:
            env.seed(seed)
        if unwrapped:
            env = env.unwrapped
        for wrapper in inner_wrappers or ():
            env = wrapper(env)
        if monitor_mode:
            # Create the output directory only when a Monitor is attached, so
            # plain environments do not leave empty temp directories behind.
            mdir = tempfile.mkdtemp()
            env = wrappers.Monitor(
                env, mdir, force=True,
                mode=monitor_mode,
                video_callable=lambda e_idx: record)
        for wrapper in outer_wrappers or ():
            env = wrapper(env)
        return env
    return make_env_fn, kargs

In [6]:
def get_videos_html(env_videos, title, max_n_videos=5):
    """Build an HTML snippet embedding a selection of recorded videos.

    Args:
        env_videos: Sequence of ``(video_path, meta_path)`` pairs. Each meta
            file is JSON and must contain an ``'episode_id'`` key.
        title: Text placed in the leading ``<h2>`` heading.
        max_n_videos: Upper bound on the number of videos embedded. When more
            are available, evenly spaced ones (first and last included) are
            picked; when only one is embedded, it is the last one.

    Returns:
        str or None: The HTML string with every video inlined as base64, or
        None when ``env_videos`` is empty.
    """
    videos = np.array(env_videos)
    if len(videos) == 0:
        return None

    n_videos = max(1, min(max_n_videos, len(videos)))
    idxs = np.linspace(0, len(videos) - 1, n_videos).astype(int) if n_videos > 1 else [-1,]
    videos = videos[idxs,...]

    # Headings are closed properly (the markup used to emit '<h2>' / '<h3/>'
    # where closing tags were intended).
    html_tag = """
        <h3>{0}</h3>
        <video width="960" height="540" controls>
            <source src="data:video/mp4;base64,{1}" type="video/mp4" />
        </video>"""

    strm = '<h2>{}</h2>'.format(title)
    for video_path, meta_path in videos:
        # Read-only binary mode inside a context manager: no write permission
        # is required and the handle is closed promptly.
        with open(video_path, 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())

        with open(meta_path) as data_file:
            meta = json.load(data_file)

        strm += html_tag.format('Episode ' + str(meta['episode_id']), encoded.decode('ascii'))
    return strm

In [7]:
def get_gif_html(env_videos, title, subtitle_eps=None, max_n_videos=4):
    """Build an HTML snippet embedding recorded videos as animated GIFs.

    For each selected video a ``.gif`` with the same base name is looked up
    next to it. If it does not exist yet, it is produced by piping frames from
    ``ffmpeg`` into ImageMagick's ``convert`` (both must be on PATH).

    Args:
        env_videos: Sequence of ``(video_path, meta_path)`` pairs. Each meta
            file is JSON and must contain an ``'episode_id'`` key.
        title: Text placed in the leading ``<h2>`` heading.
        subtitle_eps: Optional indexable. When given, each sub-heading reads
            ``'Episode ' + str(subtitle_eps[episode_id])``; otherwise it reads
            ``'Trial ' + str(episode_id)``.
        max_n_videos: Upper bound on the number of GIFs embedded. When more
            are available, evenly spaced ones (first and last included) are
            picked; when only one is embedded, it is the last one.

    Returns:
        str or None: The HTML string with every GIF inlined as base64, or
        None when ``env_videos`` is empty.
    """
    videos = np.array(env_videos)
    if len(videos) == 0:
        return None

    n_videos = max(1, min(max_n_videos, len(videos)))
    idxs = np.linspace(0, len(videos) - 1, n_videos).astype(int) if n_videos > 1 else [-1,]
    videos = videos[idxs,...]

    # Headings are closed properly (the markup used to emit '<h2>' / '<h3/>'
    # where closing tags were intended).
    html_tag = """
        <h3>{0}</h3>
        <img src="data:image/gif;base64,{1}" />"""

    strm = '<h2>{}</h2>'.format(title)
    for video_path, meta_path in videos:
        basename = os.path.splitext(video_path)[0]
        gif_path = basename + '.gif'
        if not os.path.exists(gif_path):
            # ffmpeg decodes the video to a stream of PPM frames on stdout and
            # convert assembles them into an optimized looping GIF. The
            # context manager closes the pipe and waits for ffmpeg to exit.
            with subprocess.Popen(
                    ('ffmpeg',
                     '-i', video_path,
                     '-r', '7',
                     '-f', 'image2pipe',
                     '-vcodec', 'ppm',
                     '-crf', '20',
                     '-vf', 'scale=512:-1',
                     '-'),
                    stdout=subprocess.PIPE) as ps:
                subprocess.check_output(
                    ('convert',
                     '-coalesce',
                     '-delay', '7',
                     '-loop', '0',
                     '-fuzz', '2%',
                     '+dither',
                     '-deconstruct',
                     '-layers', 'Optimize',
                     '-', gif_path),
                    stdin=ps.stdout)

        # Read-only binary mode inside a context manager so the handle is closed.
        with open(gif_path, 'rb') as gif_file:
            encoded = base64.b64encode(gif_file.read())

        with open(meta_path) as data_file:
            meta = json.load(data_file)

        if subtitle_eps is None:
            prefix = 'Trial '
            sufix = str(meta['episode_id'])
        else:
            prefix = 'Episode '
            sufix = str(subtitle_eps[meta['episode_id']])
        strm += html_tag.format(prefix + sufix, encoded.decode('ascii'))
    return strm

# Dueling DDQN

In [9]:
class FCQ(nn.Module):
    """Fully connected Q-network: one output per action.

    Args:
        input_dim: Size of the input (state) vector.
        output_dim: Number of outputs (one Q-value per action).
        hidden_dims: Sizes of the hidden layers; the first entry is the output
            size of the input layer.
        activation_fc: Activation applied after the input layer and after
            every hidden layer (not after the output layer).
    """
    def __init__(self,
                 input_dim,
                 output_dim,
                 hidden_dims = (32, 32),
                 activation_fc = F.relu):
        # The constructor used to be a bare `pass`, leaving every attribute
        # that forward/_format/load rely on undefined. Layers and device are
        # set up the same way as in FCDuelingQ.
        super(FCQ, self).__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i + 1])
            self.hidden_layers.append(hidden_layer)
        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)

    def _format(self, state):
        """Return `state` as a tensor; non-tensor input gets a leading batch axis."""
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x,
                             device=self.device,
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        """Compute the network output for `state` (tensor or array-like)."""
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))

        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))
        x = self.output_layer(x)
        return x

    def numpy_float_to_device(self, variable):
        """Convert a NumPy array to a float32 tensor on this model's device."""
        variable = torch.from_numpy(variable).float().to(self.device)
        return variable

    def load(self, experiences):
        """Convert a 5-tuple of NumPy arrays to tensors on this model's device.

        The element at index 1 is converted to int64 (it is used as action
        indices); the other four are converted to float32. The order of the
        tuple is preserved.
        """
        first, actions, third, fourth, is_terminals = experiences

        first = torch.from_numpy(first).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        third = torch.from_numpy(third).float().to(self.device)
        fourth = torch.from_numpy(fourth).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return first, actions, third, fourth, is_terminals


In [8]:
class GreedyStrategy():
    """Action selection that always picks the highest-valued action."""
    def __init__(self):
        # Kept for interface parity with the exploratory strategies.
        self.exploratory_action_taken = False

    def select_action(self, model, state):
        """Return the index of the largest value in `model(state)`.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        with torch.no_grad():
            # `.squeeze()` was misspelled as `.sqeeze()`, which raised
            # AttributeError on every call.
            q_values = model(state).cpu().detach().data.numpy().squeeze()
        return np.argmax(q_values)

In [10]:
class EGreedyStrategy():
    """Epsilon-greedy action selection with a constant epsilon.

    Args:
        epsilon: Probability threshold; a uniformly random action is drawn
            whenever a uniform sample in [0, 1) is not greater than it.
    """
    def __init__(self, epsilon = 0.1):
        self.epsilon = epsilon
        # True when the most recently selected action differed from the greedy one.
        self.exploratory_action_taken = None

    def select_action(self, model, state):
        """Select an action for `state` and return its index.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        self.exploratory_action_taken = False

        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()
        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(len(q_values))

        self.exploratory_action_taken = action != np.argmax(q_values)
        # The method used to fall off the end and return None, so the chosen
        # action was never delivered to the caller.
        return action
        

In [11]:
class EGreedyLinearStrategy():
    """Epsilon-greedy action selection with a linearly decaying epsilon.

    Epsilon starts at `init_epsilon` and decreases linearly with the number of
    selections made, reaching `min_epsilon` after `max_steps` selections and
    staying there.

    Args:
        init_epsilon: Starting epsilon.
        min_epsilon: Floor for epsilon.
        max_steps: Number of selections over which epsilon decays.
    """
    def __init__(self, init_epsilon=1.0, min_epsilon=0.1, max_steps=20000):
        self.t = 0
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.min_epsilon = min_epsilon
        self.max_steps = max_steps
        # True when the most recently selected action differed from the greedy one.
        self.exploratory_action_taken = None

    def _epsilon_update(self):
        """Return epsilon for the current step `t`, then advance `t` by one."""
        epsilon = 1 - self.t / self.max_steps
        epsilon = (self.init_epsilon - self.min_epsilon) * epsilon + self.min_epsilon
        epsilon = np.clip(epsilon, self.min_epsilon, self.init_epsilon)
        self.t += 1
        return epsilon

    def select_action(self, model, state):
        """Select an action for `state`, update epsilon, and return the action index.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        self.exploratory_action_taken = False
        with torch.no_grad():
            # A stray '.ipynb_checkpoints/' fragment had been pasted into this
            # call chain, making the line fail at runtime.
            q_values = model(state).cpu().detach().data.numpy().squeeze()

        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(len(q_values))

        self.epsilon = self._epsilon_update()
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action

In [12]:
# NOTE(review): EGreedyLinearStrategy is also defined in the preceding cell;
# this later definition shadows it. Consider keeping only one copy.
class EGreedyLinearStrategy():
    """Epsilon-greedy action selection with a linearly decaying epsilon.

    Epsilon starts at `init_epsilon` and decreases linearly with the number of
    selections made, reaching `min_epsilon` after `max_steps` selections and
    staying there.

    Args:
        init_epsilon: Starting epsilon.
        min_epsilon: Floor for epsilon.
        max_steps: Number of selections over which epsilon decays.
    """
    def __init__(self, init_epsilon=1.0, min_epsilon=0.1, max_steps=20000):
        self.t = 0
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.min_epsilon = min_epsilon
        self.max_steps = max_steps
        # True when the most recently selected action differed from the greedy one.
        self.exploratory_action_taken = None

    def _epsilon_update(self):
        """Return epsilon for the current step `t`, then advance `t` by one."""
        epsilon = 1 - self.t / self.max_steps
        epsilon = (self.init_epsilon - self.min_epsilon) * epsilon + self.min_epsilon
        epsilon = np.clip(epsilon, self.min_epsilon, self.init_epsilon)
        self.t += 1
        return epsilon

    def select_action(self, model, state):
        """Select an action for `state`, update epsilon, and return the action index.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        self.exploratory_action_taken = False
        with torch.no_grad():
            # A stray '.ipynb_checkpoints/' fragment had been pasted into this
            # call chain, making the line fail at runtime.
            q_values = model(state).cpu().detach().data.numpy().squeeze()

        if np.random.rand() > self.epsilon:
            action = np.argmax(q_values)
        else:
            action = np.random.randint(len(q_values))

        self.epsilon = self._epsilon_update()
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action
class EGreedyExpStrategy():
    """Epsilon-greedy action selection driven by a precomputed decay table.

    A table of `decay_steps` epsilon values is built once in the constructor.
    Every selection reads the entry for the current step; once the table is
    exhausted, `min_epsilon` is used.

    Args:
        init_epsilon: Epsilon used for the very first selection.
        min_epsilon: Epsilon used after `decay_steps` selections.
        decay_steps: Length of the precomputed table.
    """
    def __init__(self, init_epsilon=1.0, min_epsilon=0.1, decay_steps=20000):
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.decay_steps = decay_steps
        self.min_epsilon = min_epsilon
        # Unit-scale decay curve, then rescaled into the [min, init] band.
        unit_curve = 0.01 / np.logspace(-2, 0, decay_steps, endpoint=False) - 0.01
        self.epsilons = unit_curve
        self.epsilons = self.epsilons * (init_epsilon - min_epsilon) + min_epsilon
        self.t = 0
        # True when the most recently selected action differed from the greedy one.
        self.exploratory_action_taken = None

    def _epsilon_update(self):
        """Set epsilon for step `t`, advance `t` by one, and return epsilon."""
        if self.t >= self.decay_steps:
            self.epsilon = self.min_epsilon
        else:
            self.epsilon = self.epsilons[self.t]
        self.t += 1
        return self.epsilon

    def select_action(self, model, state):
        """Select an action for `state`, advance the schedule, and return the index.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        self.exploratory_action_taken = False
        with torch.no_grad():
            q_values = model(state).detach().cpu().data.numpy().squeeze()

        act_greedily = np.random.rand() > self.epsilon
        if not act_greedily:
            action = np.random.randint(len(q_values))
        else:
            action = np.argmax(q_values)

        self._epsilon_update()
        greedy_action = np.argmax(q_values)
        self.exploratory_action_taken = action != greedy_action
        return action

In [13]:
class SoftMaxStrategy():
    """Sample actions from a softmax over action values with a decaying temperature.

    The temperature falls linearly from `init_temp` to `min_temp` over
    `max_steps * exploration_ratio` selections and is clipped to that range.

    Args:
        init_temp: Starting temperature.
        min_temp: Floor for the temperature.
        exploration_ratio: Fraction of `max_steps` over which the temperature decays.
        max_steps: Reference number of steps for the schedule.
    """
    def __init__(self,
                 init_temp=1.0,
                 min_temp=0.3,
                 exploration_ratio=0.8,
                 max_steps=25000):
        self.t = 0
        self.init_temp = init_temp
        self.exploration_ratio = exploration_ratio
        self.min_temp = min_temp
        self.max_steps = max_steps
        # True when the most recently sampled action differed from the greedy one.
        self.exploratory_action_taken = None

    def _update_temp(self):
        """Return the temperature for step `t`, then advance `t` by one."""
        remaining = 1 - self.t / (self.max_steps * self.exploration_ratio)
        unclipped = (self.init_temp - self.min_temp) * remaining + self.min_temp
        self.t += 1
        return np.clip(unclipped, self.min_temp, self.init_temp)

    def select_action(self, model, state):
        """Sample an action for `state` from the softmax distribution and return its index.

        Args:
            model: Callable returning a tensor of action values for `state`.
            state: Input forwarded to `model` unchanged.
        """
        self.exploratory_action_taken = False
        temperature = self._update_temp()

        with torch.no_grad():
            q_values = model(state).cpu().detach().data.numpy().squeeze()
            logits = q_values / temperature
            # Subtract the maximum before exponentiating for numerical stability.
            logits = logits - logits.max()
            weights = np.exp(logits)
            probs = weights / np.sum(weights)
            assert np.isclose(probs.sum(), 1.0)

        candidates = np.arange(len(probs))
        action = np.random.choice(candidates, size=1, p=probs)[0]
        self.exploratory_action_taken = action != np.argmax(q_values)
        return action

In [15]:
class ReplayBuffer():
    """Fixed-capacity circular buffer of 5-tuples with uniform random sampling.

    Each stored sample is a 5-tuple; its fields are kept in five parallel
    object arrays. Once `max_size` samples have been stored, new samples
    overwrite the oldest ones.

    Args:
        max_size: Capacity of the buffer.
        batch_size: Default number of samples returned by `sample`.
    """
    def __init__(self,
                 max_size = 10_000,
                 batch_size = 64):
        # Object-dtype arrays so each slot can hold an arbitrary Python/NumPy value.
        self.ss_mem = np.empty(shape = (max_size), dtype = np.ndarray)
        self.as_mem = np.empty(shape = (max_size), dtype = np.ndarray)
        self.rs_mem = np.empty(shape = (max_size), dtype = np.ndarray)
        self.ps_mem = np.empty(shape = (max_size), dtype = np.ndarray)
        self.ds_mem = np.empty(shape = (max_size), dtype = np.ndarray)

        self.max_size = max_size
        self.batch_size = batch_size
        self._idx = 0   # next slot to write
        self.size = 0   # number of valid slots

    def store(self, sample):
        """Write a 5-tuple into the next slot, overwriting the oldest entry when full."""
        s, a, r, p, d = sample
        self.ss_mem[self._idx] = s
        self.as_mem[self._idx] = a
        self.rs_mem[self._idx] = r
        self.ps_mem[self._idx] = p
        self.ds_mem[self._idx] = d

        # Advance the write cursor, wrapping around at capacity.
        self._idx = (self._idx + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size = None):
        """Draw `batch_size` distinct stored samples uniformly at random.

        Args:
            batch_size: Number of samples; defaults to the value given at
                construction.

        Returns:
            tuple: Five arrays, one per field and in storage order, each built
            with `np.vstack` so the first axis indexes the drawn samples.

        Raises:
            ValueError: Raised by `np.random.choice` when fewer than
                `batch_size` samples are stored (sampling is without replacement).
        """
        # Identity comparison is the correct None check; `== None` defers to
        # the operand's __eq__.
        if batch_size is None:
            batch_size = self.batch_size
        idxs = np.random.choice(
            self.size,
            batch_size, replace= False
        )

        experiences = (
            np.vstack(self.ss_mem[idxs]),
            np.vstack(self.as_mem[idxs]),
            np.vstack(self.rs_mem[idxs]),
            np.vstack(self.ps_mem[idxs]),
            np.vstack(self.ds_mem[idxs])
        )
        return experiences

    def __len__(self):
        """Number of samples currently stored."""
        return self.size

In [16]:
class FCDuelingQ(nn.Module):
    """Fully connected dueling Q-network.

    A shared trunk feeds two heads: a scalar value head and a per-action head.
    The output is ``value + (advantage - mean(advantage))``, the mean being
    taken over the action axis.

    Args:
        input_dim: Size of the input (state) vector.
        output_dim: Number of actions.
        hidden_dims: Sizes of the hidden layers; the first entry is the output
            size of the input layer.
        activation_fc: Activation applied after the input layer and after
            every hidden layer.
    """
    def __init__(self,
                 input_dim,
                 output_dim,
                 hidden_dims = (32, 32),
                 activation_fc = F.relu):
        super(FCDuelingQ, self).__init__()
        self.activation_fc = activation_fc

        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()

        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i+1])
            self.hidden_layers.append(hidden_layer)
        # Two heads on top of the shared trunk.
        self.output_value = nn.Linear(hidden_dims[-1], 1)
        self.output_layer = nn.Linear(hidden_dims[-1], output_dim)

        device = "cpu"
        if torch.cuda.is_available():
            device = "cuda:0"
        self.device = torch.device(device)
        self.to(self.device)

    def _format(self, state):
        """Return `state` as a tensor; non-tensor input gets a leading batch axis."""
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype = torch.float32, device = self.device)
            # Only raw (non-tensor) input is given a batch axis; tensors are
            # passed through unchanged so already-batched input keeps its
            # (batch, features) layout, as in FCQ._format.
            x = x.unsqueeze(0)
        # The missing `return` made forward() receive None.
        return x

    def forward(self, state):
        """Compute Q-values for `state` (tensor or array-like)."""
        x = self._format(state)
        x = self.activation_fc(self.input_layer(x))

        for hidden_layer in self.hidden_layers:
            x = self.activation_fc(hidden_layer(x))

        a = self.output_layer(x)
        v = self.output_value(x).expand_as(a)
        # Centre the advantages over the action axis before adding the value.
        q = v + a - a.mean(1, keepdim = True).expand_as(a)
        return q

    def numpy_float_to_device(self, variable):
        """Convert a NumPy array to a float32 tensor on this model's device."""
        variable = torch.from_numpy(variable).float().to(self.device)
        return variable

    def load(self, experiences):
        """Convert a 5-tuple of NumPy arrays to tensors on this model's device.

        The element at index 1 is converted to int64 (it is used as action
        indices); the other four are converted to float32. The order of the
        tuple is preserved.
        """
        first, actions, third, fourth, is_terminals = experiences
        first = torch.from_numpy(first).float().to(self.device)
        actions = torch.from_numpy(actions).long().to(self.device)
        third = torch.from_numpy(third).float().to(self.device)
        fourth = torch.from_numpy(fourth).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return first, actions, third, fourth, is_terminals

In [None]:
class DuelingDDQN():
    """Agent skeleton for double-DQN learning with a (dueling) value network.

    The constructor only stores factories and hyper-parameters.
    NOTE(review): `optimize_model` and `interaction_step` read
    `self.online_model`, `self.target_model`, `self.value_optimizer`,
    `self.gamma` and `self.training_strategy`, none of which are assigned in
    the visible code. They must be set before those methods are called,
    presumably by a training routine that has not been written yet.

    Args:
        replay_buffer_fn: Factory for the replay buffer.
        value_model_fn: Factory for the value network.
        value_optimizer_fn: Factory for the optimizer.
        value_optimizer_lr: Learning rate handed to the optimizer factory.
        max_gradient_norm: Maximum gradient norm used for clipping.
        training_strategy_fn: Factory for the action-selection strategy used in training.
        evaluation_strategy_fn: Factory for the strategy used in evaluation.
        n_warmup_batches: Stored as-is; not used by the visible methods.
        update_target_every_steps: Stored as-is; not used by the visible methods.
        tau: Stored as-is; not used by the visible methods.
    """
    def __init__(self,
                 replay_buffer_fn,
                 value_model_fn,
                 value_optimizer_fn,
                 value_optimizer_lr,
                 max_gradient_norm,
                 training_strategy_fn,
                 evaluation_strategy_fn,
                 n_warmup_batches,
                 update_target_every_steps,
                 tau):
        self.replay_buffer_fn = replay_buffer_fn
        self.value_model_fn = value_model_fn
        self.value_optimizer_fn = value_optimizer_fn
        self.value_optimizer_lr = value_optimizer_lr
        self.max_gradient_norm = max_gradient_norm
        self.training_strategy_fn = training_strategy_fn
        # This used to assign the attribute to itself, raising AttributeError
        # on construction.
        self.evaluation_strategy_fn = evaluation_strategy_fn
        # Attribute name corrected from the misspelled `n_warup_batches`.
        self.n_warmup_batches = n_warmup_batches
        self.update_target_every_steps = update_target_every_steps
        self.tau = tau

    def optimize_model(self, experiences):
        """Run one gradient step on the online model from a batch of experiences.

        Args:
            experiences: 5-tuple of tensors
                ``(states, actions, rewards, next_states, is_terminals)``.
                `actions` must be an int64 column usable with ``gather(1, ...)``.
        """
        states, actions, rewards, next_states, is_terminals = experiences

        # Double-DQN target: the online model picks the next action, the
        # target model evaluates it.
        argmax_a_q_sp = self.online_model(next_states).max(1)[1]
        q_sp = self.target_model(next_states).detach()
        max_a_q_sp = q_sp.gather(1, argmax_a_q_sp.unsqueeze(1))
        target_q_sa = rewards + (self.gamma * max_a_q_sp * (1 - is_terminals))

        q_sa = self.online_model(states).gather(1, actions)

        td_error = q_sa - target_q_sa

        value_loss = td_error.pow(2).mul(0.5).mean()
        self.value_optimizer.zero_grad()
        value_loss.backward()

        # `clip_grad_norm` (no trailing underscore) is deprecated in favour of
        # the in-place `clip_grad_norm_`.
        torch.nn.utils.clip_grad_norm_(self.online_model.parameters(),
                                       self.max_gradient_norm)
        self.value_optimizer.step()

    def interaction_step(self, state, env):
        """Select an action with the training strategy and step the environment.

        NOTE(review): this method looks unfinished. The result of `env.step`
        is neither stored nor returned, so the method returns None.
        """
        action = self.training_strategy.select_action(self.online_model, state)
        next_state, reward, is_terminal, info = env.step(action)
        

    




In [None]:
# Experiment set-up for the Dueling DDQN runs: result containers, environment
# settings, and the factories / hyper-parameters matching DuelingDDQN's
# constructor arguments.
dueling_ddqn_results = []

dueling_ddqn_agents, best_dueling_ddqn_agent_key, best_eval_score = {}, None, float('-inf')

# NOTE(review): the loop body only rebuilds `environment_settings` on every
# iteration and never uses `seed`. Everything below the loop is at top level
# and runs once after it finishes. Confirm whether the configuration (and a
# training call) was meant to live inside the loop.
for seed in SEEDS: 
    environment_settings = {
        'env_name': 'CartPole-v1',
        'gamma': 1.00,
        'max_minutes': 20,
        'max_episodes': 10000,
        'goal_mean_100_reward': 475
    }
    

# Factories take no arguments beyond what is shown, so each call produces a
# fresh object.
value_model_fn = lambda nS, nA: FCDuelingQ(nS, nA, hidden_dims=(512, 128))
value_optimizer_fn = lambda net, lr: optim.RMSprop(net.parameters(), lr = lr)
value_optimizer_lr = 0.0005 
max_gradient_norm = float('inf')

training_strategy_fn =  lambda: EGreedyExpStrategy(init_epsilon= 1.0, min_epsilon= 0.3, decay_steps= 20_000)
evaluation_strategy_fn = lambda: GreedyStrategy()

replay_buffer_fn = lambda: ReplayBuffer(max_size = 50_000, batch_size = 64) 
n_warmup_batches = 5 
update_target_every_steps = 1 
tau = 0.1 

# Positional unpacking: relies on the insertion order of the keys in
# `environment_settings` matching the names on the left-hand side.
env_name, gamma, max_minutes, \
    max_episodes, goal_mean_100_reward = environment_settings.values()