# Open AI Atari Games. Reinforcement Learning with PyTorch, deep Learning
## By Nasrudin Bin Salim
### Requirements: Python 2.7. Linux Environment/UNIX Environment
    Please Install Pytorch
    OpenAI Gym
    Open AI Universe
    cv2

### Imports

In [2]:
from __future__ import print_function, division
import numpy as np

import json
import logging
from cv2 import resize
from skimage.color import rgb2gray
import os
os.environ["OMP_NUM_THREADS"] = "1" #should be set to 1 to prevent conflicts
import argparse
import math
import time



from torch.autograd import Variable


### Import OpenAI Universe environment and gym
### Import Pytorch for model

In [3]:
import universe

In [4]:
import gym
import pandas as pd
from universe import vectorized
from universe.wrappers import Unvectorize, Vectorize

from gym.spaces.box import Box
from gym.configuration import undo_logger_setup

import torch
from torch.multiprocessing import Process
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Variable


import torch.optim as optim

#from skimage.transform import resize
#from scipy.misc import imresize as resize


# Helper Functions Section

### Logging function

In [5]:
def setup_logger(logger_name, log_file, level=logging.INFO):
    l = logging.getLogger(logger_name)
    formatter = logging.Formatter('%(asctime)s : %(message)s')
    fileHandler = logging.FileHandler(log_file, mode='w')
    fileHandler.setFormatter(formatter)
    streamHandler = logging.StreamHandler()
    streamHandler.setFormatter(formatter)

    l.setLevel(level)
    l.addHandler(fileHandler)
    l.addHandler(streamHandler)

### Read Json Object

In [6]:
def read_config(file_path):
    """Read JSON config."""
    json_object = json.load(open(file_path, 'r'))
    return json_object

### Normalize column

In [7]:
def norm_col_init(weights, std=1.0):
    x = torch.randn(weights.size())
    x *= std / torch.sqrt((x**2).sum(1, keepdim=True))
    return x

### Share grads between 2 models
#### More on this later

In [8]:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad

### Weights

In [9]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = np.prod(weight_shape[1:4])
        fan_out = np.prod(weight_shape[2:4]) * weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)
    elif classname.find('Linear') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = weight_shape[1]
        fan_out = weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)

# Environment, setting up the openAI and Universe 

### Create the atari environment function

In [10]:
def atari_env(env_id, env_conf):
    env = gym.make(env_id)
    if len(env.observation_space.shape) > 1:
        env = Vectorize(env)
        env = AtariRescale(env, env_conf)
        env = NormalizedEnv(env)
        env = Unvectorize(env)
        
    return env

### Create a frame for environment

In [11]:
def _process_frame(frame, conf):
    frame = frame[conf["crop1"]:conf["crop2"] + 160, :160]
    frame = resize(rgb2gray(frame), (80, conf["dimension2"]))
    frame = resize(frame, (80, 80))
    frame = np.reshape(frame, [1, 80, 80])
    return frame

### Atari rescale class

In [12]:
class AtariRescale(vectorized.ObservationWrapper):
    def __init__(self, env, env_conf):
        super(AtariRescale, self).__init__(env)
        self.observation_space = Box(0.0, 1.0, [1, 80, 80])
        self.conf = env_conf

    def _observation(self, observation_n):
        return [
            _process_frame(observation, self.conf)
            for observation in observation_n
        ]

### Normalized environment class, where we can move from one state and observation to another

In [13]:
class NormalizedEnv(vectorized.ObservationWrapper):
    def __init__(self, env=None):
        super(NormalizedEnv, self).__init__(env)
        self.state_mean = 0
        self.state_std = 0
        self.alpha = 0.9999
        self.num_steps = 0

    def _observation(self, observation_n):
        for observation in observation_n:
            self.num_steps += 1
            self.state_mean = self.state_mean * self.alpha + \
                observation.mean() * (1 - self.alpha)
            self.state_std = self.state_std * self.alpha + \
                observation.std() * (1 - self.alpha)

        unbiased_mean = self.state_mean / (1 - pow(self.alpha, self.num_steps))
        unbiased_std = self.state_std / (1 - pow(self.alpha, self.num_steps))

        return [(observation - unbiased_mean) / (unbiased_std + 1e-8)
                for observation in observation_n]

# Model

Using Google DeepMind's Idea. 

    Research Paper: https://arxiv.org/pdf/1602.01783.pdf
    Asynchronous Advantage Actor-Critic (A3C)


The A3C algorithm was released by Google’s DeepMind group earlier this year, and it made a splash by… essentially obsoleting DQN. It was faster, simpler, more robust, and able to achieve much better scores on the standard battery of Deep RL tasks. On top of all that it could work in continuous as well as discrete action spaces. Given this, it has become the go-to Deep RL algorithm for new challenging problems with complex state and action spaces


    
<a href= "https://medium.com/emergent-future/simple-reinforcement-learning-with-tensorflow-part-8-asynchronous-actor-critic-agents-a3c-c88f72a5e9f2" >Medium Article explaining A3c reinforcement learning </a>

## The Actor-Critic Structure
<img src = "img/A3CStructure.png">

## Many workers training and learning concurrently, and then updates global network with gradients
### Process Flow
<img src = "img/A3CProcessFlow.png">
    
### Long Short Term Memory Recurrent Neural Nets
    
## Implementing in Pytorch, with a class and on LSTM

In [14]:
class A3Clstm(torch.nn.Module):
    def __init__(self, num_inputs, action_space):
        super(A3Clstm, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 5, stride=1, padding=2)
        self.maxp1 = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 32, 5, stride=1, padding=1)
        self.maxp2 = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(32, 64, 4, stride=1, padding=1)
        self.maxp3 = nn.MaxPool2d(2, 2)
        self.conv4 = nn.Conv2d(64, 64, 3, stride=1, padding=1)
        self.maxp4 = nn.MaxPool2d(2, 2)

        self.lstm = nn.LSTMCell(1024, 512)
        num_outputs = action_space.n
        self.critic_linear = nn.Linear(512, 1)
        self.actor_linear = nn.Linear(512, num_outputs)

        self.apply(weights_init)
        self.actor_linear.weight.data = norm_col_init(
            self.actor_linear.weight.data, 0.01)
        self.actor_linear.bias.data.fill_(0)
        self.critic_linear.weight.data = norm_col_init(
            self.critic_linear.weight.data, 1.0)
        self.critic_linear.bias.data.fill_(0)

        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)

        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        x = F.relu(self.maxp1(self.conv1(inputs)))
        x = F.relu(self.maxp2(self.conv2(x)))
        x = F.relu(self.maxp3(self.conv3(x)))
        x = F.relu(self.maxp4(self.conv4(x)))

        x = x.view(x.size(0), -1)

        hx, cx = self.lstm(x, (hx, cx))

        x = hx

        return self.critic_linear(x), self.actor_linear(x), (hx, cx)


# The player Agent
## (Reinforcement Learning agent to interact with the env)

In [15]:
class Agent(object):
    def __init__(self, model, env, args, state):
        self.model = model
        self.env = env
        self.current_life = 0
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True
        self.info = None
        self.reward = 0

    def action_train(self):
        if self.done:
            self.cx = Variable(torch.zeros(1, 512))
            self.hx = Variable(torch.zeros(1, 512))
        else:
            self.cx = Variable(self.cx.data)
            self.hx = Variable(self.hx.data)
        value, logit, (self.hx, self.cx) = self.model((Variable(self.state.unsqueeze(0)), (self.hx, self.cx)))
        prob = F.softmax(logit)
        log_prob = F.log_softmax(logit)
        entropy = -(log_prob * prob).sum(1)
        self.entropies.append(entropy)
        action = prob.multinomial().data
        log_prob = log_prob.gather(1, Variable(action))
        state, self.reward, self.done, self.info = self.env.step(action.numpy())
        self.state = torch.from_numpy(state).float()
        self.eps_len += 1
        self.done = self.done or self.eps_len >= self.args['M']
        self.reward = max(min(self.reward, 1), -1)
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.rewards.append(self.reward)
        return self

    def action_test(self):
        if self.done:
            self.cx = Variable(torch.zeros(1, 512), volatile=True)
            self.hx = Variable(torch.zeros(1, 512), volatile=True)
        else:
            self.cx = Variable(self.cx.data, volatile=True)
            self.hx = Variable(self.hx.data, volatile=True)
        value, logit, (self.hx, self.cx) = self.model((Variable(self.state.unsqueeze(0), volatile=True), (self.hx, self.cx)))
        prob = F.softmax(logit)
        action = prob.max(1)[1].data.numpy()
        state, self.reward, self.done, self.info = self.env.step(action[0])
        self.state = torch.from_numpy(state).float()
        self.eps_len += 1
        self.done = self.done or self.eps_len >= self.args['M']
        return self

    def check_state(self):
        if self.current_life > self.info['ale.lives']:
            self.done = True
        self.current_life = self.info['ale.lives']
        return self

    def clear_actions(self):
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        return self


# Shared Memory and optimization algorithims
## As Part of the A3C Network, multiple workers will be working together to update a global network

## RMSprop

RMSprop is an unpublished, adaptive learning rate method proposed by Geoff Hinton in Lecture 6e of his Coursera Class.

RMSprop and Adadelta have both been developed independently around the same time stemming from the need to resolve Adagrad's radically diminishing learning rates. RMSprop in fact is identical to the first update vector of Adadelta 

RMSprop as well divides the learning rate by an exponentially decaying average of squared gradients. Hinton suggests γ
to be set to 0.9, while a good default value for the learning rate η is 0.001.

In [16]:
class SharedRMSprop(optim.RMSprop):
    """Implements RMSprop algorithm with shared states.
    """

    def __init__(self,
                 params,
                 lr=7e-4,
                 alpha=0.99,
                 eps=0.1,
                 weight_decay=0,
                 momentum=0,
                 centered=False):
        super(SharedRMSprop, self).__init__(params, lr, alpha, eps,
                                            weight_decay, momentum, centered)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['grad_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['square_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['momentum_buffer'] = p.data.new().resize_as_(
                    p.data).zero_()

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['square_avg'].share_memory_()
                state['step'].share_memory_()
                state['grad_avg'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                square_avg = state['square_avg']
                alpha = group['alpha']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                square_avg.mul_(alpha).addcmul_(1 - alpha, grad, grad)

                if group['centered']:
                    grad_avg = state['grad_avg']
                    grad_avg.mul_(alpha).add_(1 - alpha, grad)
                    avg = square_avg.addcmul(
                        -1, grad_avg, grad_avg).sqrt().add_(group['eps'])
                else:
                    avg = square_avg.sqrt().add_(group['eps'])

                if group['momentum'] > 0:
                    buf = state['momentum_buffer']
                    buf.mul_(group['momentum']).addcdiv_(grad, avg)
                    p.data.add_(-group['lr'], buf)
                else:
                    p.data.addcdiv_(-group['lr'], grad, avg)

        return loss

## Adaptive Moment Estimation (Adam) 
is another method that computes adaptive learning rates for each parameter. In addition to storing an exponentially decaying average of past squared gradients vt like Adadelta and RMSprop, Adam also keeps an exponentially decaying average of past gradients mt, similar to momentum:

Adam (short for Adaptive Moment Estimation) is an update to the RMSProp optimizer. In this optimization algorithm, running averages of both the gradients and the second moments of the gradients are used.

In [17]:
class SharedAdam(optim.Adam):
    """Implements Adam algorithm with shared states.
    """

    def __init__(self,
                 params,
                 lr=1e-3,
                 betas=(0.9, 0.999),
                 eps=1e-3,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1**state['step'][0]
                bias_correction2 = 1 - beta2**state['step'][0]
                step_size = group['lr'] * \
                    math.sqrt(bias_correction2) / bias_correction1

                p.data.addcdiv_(-step_size, exp_avg, denom)

        return loss


sample_lr = [
    0.0001, 0.00009, 0.00008, 0.00007, 0.00006, 0.00005, 0.00004, 0.00003,
    0.00002, 0.00001, 0.000009, 0.000008, 0.000007, 0.000006, 0.000005,
    0.000004, 0.000003, 0.000002, 0.000001
]

## Adam but only with shared Lr

In [18]:
class SharedLrSchedAdam(optim.Adam):
    """Implements Adam algorithm with shared states.
    """

    def __init__(self,
                 params,
                 lr=1e-3,
                 betas=(0.9, 0.999),
                 eps=1e-3,
                 weight_decay=0):
        super(SharedLrSchedAdam, self).__init__(params, lr, betas, eps,
                                                weight_decay)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        lr = sample_lr[int(state['step'][0] // 40000000)]
        group['lr'] = lr

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                if group['weight_decay'] != 0:
                    grad = grad.add(group['weight_decay'], p.data)

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1**state['step'][0]
                bias_correction2 = 1 - beta2**state['step'][0]
                step_size = group['lr'] * \
                    math.sqrt(bias_correction2) / bias_correction1
                p.data.addcdiv_(-step_size, exp_avg, denom)

        return loss

# Functions to run the model on the environment

# Test
## Function To test the model on a game/environ

In [19]:
def test(args, shared_model, env_conf,render=False):
    log = {}
    setup_logger('{}_log'.format(args['ENV']),
                 r'{0}{1}_log'.format(args['LG'], args['ENV']))
    log['{}_log'.format(args['ENV'])] = logging.getLogger(
        '{}_log'.format(args['ENV']))
    d_args = args
    for k in d_args.keys():
        log['{}_log'.format(args['ENV'])].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args['seed'])
    env = atari_env(args['ENV'], env_conf)
    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.eval()

    while True:
        if player.done:
            player.model.load_state_dict(shared_model.state_dict())
        if render:
            env.render()
        player.action_test()
        reward_sum += player.reward

        if player.done:
            num_tests += 1
            player.current_life = 0
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args['ENV'])].info(
                "Time {0}, episode reward {1}, episode length {2}, reward mean {3:.4f}".
                format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean))

            if reward_sum > args['SSL']:
                player.model.load_state_dict(shared_model.state_dict())
                state_to_save = player.model.state_dict()
                torch.save(state_to_save, '{0}{1}.dat'.format(
                    args['SMD'], args['ENV']))

            reward_sum = 0
            player.eps_len = 0
            state = player.env.reset()
            time.sleep(60)
            player.state = torch.from_numpy(state).float()


# Train
## Function to Train the model with an optimizer algorithim on an environment

In [20]:
def train(rank, args, shared_model, optimizer, env_conf):

    torch.manual_seed(args['seed'] + rank)
    env = atari_env(args['ENV'], env_conf)
    if optimizer is None:
        if args['OPT'] == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args['LR'])
        if args['OPT'] == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args['LR'])

    env.seed(args['seed'] + rank)
    player = Agent(None, env, args, None)
    player.model = A3Clstm(
        player.env.observation_space.shape[0], player.env.action_space)
    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    player.model.train()

    while True:
        player.model.load_state_dict(shared_model.state_dict())
        for step in range(args['NS']):
            player.action_train()
            if args['CL']:
                player.check_state()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            player.current_life = 0
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _, _ = player.model(
                (Variable(player.state.unsqueeze(0)), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args['G'] * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimataion
            delta_t = player.rewards[i] + args['G'] * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args['G'] * args['T'] + delta_t

            policy_loss = policy_loss - \
                player.log_probs[i] * \
                Variable(gae) - 0.01 * player.entropies[i]

        optimizer.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        torch.nn.utils.clip_grad_norm(player.model.parameters(), 40)
        ensure_shared_grads(player.model, shared_model)
        optimizer.step()
        player.clear_actions()


# Putting it altogether

## Input Desription

##### Parameter: LR
    Type: float
    Description: Learning Rate
##### Parameter: G
    Type=float,
    Description: discount factor for rewards (default: 0.99)
##### Parameter: T
    Type=float,
    Description: parameter for GAE (default: 1.00)
##### Parameter:seed
    Type: int
    Descrition: random seed (default: 42)
##### Parameter:W
    Type=int,
    Description: how many training processes to use (default: 5)
##### Parameter: NS
    Type=int,
    Description: number of forward steps in A3C (default: 20)
##### Parameter: M
    Type=int,
    Description: maximum length of an episode (default: 10000)
##### Parameter: ENV
    Description: environment to train on (default: Pong-v0)
##### Parameter: EC
    Description: environment to crop and resize info (default: config.json)
##### Parameter: SO
    Description: use an optimizer without shared statistics.(default: True)
##### Parameter: L
    Description: load a trained model, (default: False)
##### Parameter: SSL
    Type=int,
    Description: reward score test evaluation must get higher than to save model (default:20)
##### Parameter: OPT
    Description: shares optimizer choice of Adam, LrSchedAdam or RMSprop (default: Adam)
##### Parameter: CL
    Description: end of life is end of training episode.(default: False)
##### Parameter: LMD
    Description: folder to load trained models from (default: 'trained_models/')
##### Parameter: SMD
    Description: folder to save trained models (default: '/trained_models/')
##### Parameter: LG
    Description: folder to save logs (default: '/logs/')

## List of Games, pick one here and then edit the environment accordingly
    Choose an Atari game and it has to be a 4D Tensor Game 
    Or if you don't know what that means, just guess and check

In [21]:
gym.envs.registry.all()

[EnvSpec(flashgames.UrbanMicroRacers-v0),
 EnvSpec(flashgames.PlopPlopLite-v0),
 EnvSpec(DoubleDunk-ramDeterministic-v4),
 EnvSpec(flashgames.Sieger2LevelPack-v0),
 EnvSpec(DoubleDunk-ramDeterministic-v0),
 EnvSpec(gym-core.Krull-v0),
 EnvSpec(gym-core.Krull-v3),
 EnvSpec(Pooyan-ram-v4),
 EnvSpec(Pooyan-ram-v0),
 EnvSpec(flashgames.GonAndMon-v0),
 EnvSpec(flashgames.NeonRaceLvl4-v0),
 EnvSpec(flashgames.Hash-v0),
 EnvSpec(gym-core.JourneyEscapeSlow-v3),
 EnvSpec(gym-core.JourneyEscapeSlow-v0),
 EnvSpec(flashgames.FlashBombs-v0),
 EnvSpec(gym-core.JamesbondDeterministicSlow-v0),
 EnvSpec(VentureNoFrameskip-v0),
 EnvSpec(Centipede-v0),
 EnvSpec(Centipede-v4),
 EnvSpec(flashgames.Crumbs2-v0),
 EnvSpec(flashgames.CosmoGravity2-v0),
 EnvSpec(gym-core.Zaxxon30FPS-v0),
 EnvSpec(gym-core.Zaxxon30FPS-v3),
 EnvSpec(flashgames.ZombiesAndDonuts-v0),
 EnvSpec(Frostbite-ramNoFrameskip-v0),
 EnvSpec(Frostbite-ramNoFrameskip-v4),
 EnvSpec(IceHockey-ramNoFrameskip-v4),
 EnvSpec(flashgames.SpacePunkRace

# Running an Environment, Training and simulating(more below)
    1. L(Load)  is set to False because I have no training data for that particular game.
    Once trained, a training data is provided, then set L to True.
    2. Set SO to True so that it can accumualative learn among all workers.
### Interrupt the Kernal to Stop training or stop testing.
    
## Note: Important to run all cells above but don't run everything below this
### The cells below are in sections, choose 1 section to run. E.g if you want to train, just run the Training section. or if you want to play pacman, just run the cells in the PacMan Section ( From input to render)
    
### Training Notes

*It is important to limit number of worker threads to number of cpu cores available
More than one thread per cpu core available is detrimental in training speed and effectiveness*

## Training Section

## Input Dictionary

In [58]:
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":7,"NS":100,"M":10000,"ENV":'MsPacman-v0',
         "EC":'./config.json',"SO":True,"L":False,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./trained_models/',"SMD":"./trained_models/","LG":'./logs/', "seed":42,"config":"Default"
        }

In [59]:
undo_logger_setup()

torch.set_default_tensor_type('torch.FloatTensor')
torch.manual_seed(args['seed'])

setup_json = read_config(args['EC'])
env_conf = setup_json[args['config']]

for i in setup_json.keys():
    if i in args['ENV']:
        env_conf = setup_json[i]
env = atari_env(args['ENV'], env_conf)

shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
if args['L']:
    saved_state = torch.load(
        '{0}{1}.dat'.format(args['LMD'], args['ENV']))
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()



if args['SO']:
    if args['OPT'] == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'LrSchedAdam':
        optimizer = SharedLrSchedAdam(
            shared_model.parameters(), lr=args['LR'])
    optimizer.share_memory()
else:
    optimizer = None


## Run This to Train

In [60]:
processes = []

p = Process(target=test, args=(args, shared_model, env_conf))
p.start()
processes.append(p)
time.sleep(0.1)
for rank in range(0, args['W']):
    p = Process(
        target=train, args=(rank, args, shared_model, optimizer, env_conf))
    p.start()
    processes.append(p)
    time.sleep(0.1)
for p in processes:
    time.sleep(0.1)
    p.join()

2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,660 : OPT: Adam
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,704 : LG: ./logs/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,709 : SMD: ./trained_models/
2017-10-07 14:38:01,713 : ENV: MsPacman-v0
2017-10-07 14:38:01,713 : ENV: MsPacman-v0
2017-10-07 14:38:01,713 : ENV: MsPacman-v0
2017-10-07

KeyboardInterrupt: 

# Playing the Atari Games Section
## The best part
### WEEEEEEEEEEEEe
#### If it gives tensorflow errors, just run the cells again and somehow it works the 2nd time. This is because we don't have the full Share optimizer data generated yet
### Load model is disabled on default so that you can observe how it learns through iteration,  Set L to True if you want to load the trained models and see how well it performs

## Playing PacMan (10000 episodes)

### Input Parameters

In [55]:
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":7,"NS":20,"M":10000,"ENV":'MsPacman-v0',
         "EC":'./config.json',"SO":True,"L":False,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./trained_models/',"SMD":"./trained_models/","LG":'./logs/', "seed":42,"config":"MsPacman"
        }


In [56]:
undo_logger_setup()

torch.set_default_tensor_type('torch.FloatTensor')
torch.manual_seed(args['seed'])

setup_json = read_config(args['EC'])
env_conf = setup_json[args['config']]

for i in setup_json.keys():
    if i in args['ENV']:
        env_conf = setup_json[i]
env = atari_env(args['ENV'], env_conf)

shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
if args['L']:
    saved_state = torch.load(
        '{0}{1}.dat'.format(args['LMD'], args['ENV']))
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()



if args['SO']:
    if args['OPT'] == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'LrSchedAdam':
        optimizer = SharedLrSchedAdam(
            shared_model.parameters(), lr=args['LR'])
    optimizer.share_memory()
else:
    optimizer = None


### Run this to Start

In [57]:
test(args, shared_model, env_conf,render=True)

2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,808 : OPT: Adam
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,850 : LG: ./logs/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,856 : SMD: ./trained_models/
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017-10-07 14:37:26,861 : ENV: MsPacman-v0
2017

KeyboardInterrupt: 

## Playing BeamRider (4000 episodes)

### Input

In [43]:
args = {'LR': 0.0001, "G":0.99, "T":1.00,"W":7,"NS":20,"M":4000,"ENV":'BeamRider-v0',
         "EC":'./config.json',"SO":True,"L":False,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./trained_models/',"SMD":"./trained_models/","LG":'./logs/', "seed":42,"config":"BeamRider"
        }


In [44]:
undo_logger_setup()

torch.set_default_tensor_type('torch.FloatTensor')
torch.manual_seed(args['seed'])

setup_json = read_config(args['EC'])
env_conf = setup_json[args['config']]

for i in setup_json.keys():
    if i in args['ENV']:
        env_conf = setup_json[i]
env = atari_env(args['ENV'], env_conf)

shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
if args['L']:
    saved_state = torch.load(
        '{0}{1}.dat'.format(args['LMD'], args['ENV']))
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()



if args['SO']:
    if args['OPT'] == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'LrSchedAdam':
        optimizer = SharedLrSchedAdam(
            shared_model.parameters(), lr=args['LR'])
    optimizer.share_memory()
else:
    optimizer = None


### Run this to Start

In [45]:
test(args, shared_model, env_conf,render=True)

2017-10-07 14:32:34,412 : OPT: Adam
2017-10-07 14:32:34,412 : OPT: Adam
2017-10-07 14:32:34,449 : LG: ./logs/
2017-10-07 14:32:34,449 : LG: ./logs/
2017-10-07 14:32:34,451 : SMD: ./trained_models/
2017-10-07 14:32:34,451 : SMD: ./trained_models/
2017-10-07 14:32:34,453 : ENV: BeamRider-v0
2017-10-07 14:32:34,453 : ENV: BeamRider-v0
2017-10-07 14:32:34,454 : G: 0.99
2017-10-07 14:32:34,454 : G: 0.99
2017-10-07 14:32:34,456 : CL: False
2017-10-07 14:32:34,456 : CL: False
2017-10-07 14:32:34,458 : config: BeamRider
2017-10-07 14:32:34,458 : config: BeamRider
2017-10-07 14:32:34,460 : M: 4000
2017-10-07 14:32:34,460 : M: 4000
2017-10-07 14:32:34,461 : L: False
2017-10-07 14:32:34,461 : L: False
2017-10-07 14:32:34,463 : EC: ./config.json
2017-10-07 14:32:34,463 : EC: ./config.json
2017-10-07 14:32:34,465 : SSL: 20
2017-10-07 14:32:34,465 : SSL: 20
2017-10-07 14:32:34,467 : seed: 42
2017-10-07 14:32:34,467 : seed: 42
2017-10-07 14:32:34,468 : LR: 0.0001
2017-10-07 14:32:34,468 : LR: 0.0001


KeyboardInterrupt: 

## Playing Breakout (3000 episodes)

### Input

In [46]:
args = {'LR': 0.0001, "G":0.99, "T":1.00, "S":1,"W":7,"NS":20,"M":3000,"ENV":'Breakout-v0',
         "EC":'./config.json',"SO":True,"L":False,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./trained_models/',"SMD":"./trained_models/","LG":'./logs/', "seed":42,"config":"Breakout"
        }


In [47]:
undo_logger_setup()

torch.set_default_tensor_type('torch.FloatTensor')
torch.manual_seed(args['seed'])

setup_json = read_config(args['EC'])
env_conf = setup_json[args['config']]

for i in setup_json.keys():
    if i in args['ENV']:
        env_conf = setup_json[i]
env = atari_env(args['ENV'], env_conf)

shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
if args['L']:
    saved_state = torch.load(
        '{0}{1}.dat'.format(args['LMD'], args['ENV']))
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()



if args['SO']:
    if args['OPT'] == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'LrSchedAdam':
        optimizer = SharedLrSchedAdam(
            shared_model.parameters(), lr=args['LR'])
    optimizer.share_memory()
else:
    optimizer = None


### Run this to Start

In [48]:
test(args, shared_model, env_conf,render=True)

2017-10-07 14:32:50,550 : OPT: Adam
2017-10-07 14:32:50,551 : LG: ./logs/
2017-10-07 14:32:50,551 : SMD: ./trained_models/
2017-10-07 14:32:50,552 : ENV: Breakout-v0
2017-10-07 14:32:50,553 : G: 0.99
2017-10-07 14:32:50,554 : CL: False
2017-10-07 14:32:50,555 : config: Breakout
2017-10-07 14:32:50,555 : M: 3000
2017-10-07 14:32:50,556 : L: False
2017-10-07 14:32:50,557 : EC: ./config.json
2017-10-07 14:32:50,558 : SSL: 20
2017-10-07 14:32:50,558 : S: 1
2017-10-07 14:32:50,559 : seed: 42
2017-10-07 14:32:50,560 : LR: 0.0001
2017-10-07 14:32:50,561 : T: 1.0
2017-10-07 14:32:50,561 : W: 7
2017-10-07 14:32:50,562 : SO: True
2017-10-07 14:32:50,563 : NS: 20
2017-10-07 14:32:50,564 : LMD: ./trained_models/
2017-10-07 14:33:00,461 : Time 00h 00m 09s, episode reward 0.0, episode length 163, reward mean 0.0000


KeyboardInterrupt: 

## Playing SpaceInvader (10000 episodes)

### Input

In [None]:
args = {'LR': 0.0001, "G":0.99, "T":1.00, "S":1,"W":7,"NS":20,"M":10000,"ENV":'SpaceInvaders-v0',
         "EC":'./config.json',"SO":True,"L":False,"SSL":20, "OPT":"Adam","CL":False,
         "LMD":'./trained_models/',"SMD":"./trained_models/","LG":'./logs/', "seed":42,"config":"SpaceInvaders"
        }


In [None]:
undo_logger_setup()

torch.set_default_tensor_type('torch.FloatTensor')
torch.manual_seed(args['seed'])

setup_json = read_config(args['EC'])
env_conf = setup_json[args['config']]

for i in setup_json.keys():
    if i in args['ENV']:
        env_conf = setup_json[i]
env = atari_env(args['ENV'], env_conf)

shared_model = A3Clstm(env.observation_space.shape[0], env.action_space)
if args['L']:
    saved_state = torch.load(
        '{0}{1}.dat'.format(args['LMD'], args['ENV']))
    shared_model.load_state_dict(saved_state)
shared_model.share_memory()



if args['SO']:
    if args['OPT'] == 'RMSprop':
        optimizer = SharedRMSprop(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'Adam':
        optimizer = SharedAdam(shared_model.parameters(), lr=args['LR'])
    if args['OPT'] == 'LrSchedAdam':
        optimizer = SharedLrSchedAdam(
            shared_model.parameters(), lr=args['LR'])
    optimizer.share_memory()
else:
    optimizer = None


### Run this to Start

In [None]:
test(args, shared_model, env_conf,render=True)