In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.nn.init import kaiming_uniform_#
from matplotlib import pyplot as plt
import time

In [2]:
import numpy as np
import gym
from tqdm import tqdm
import random as rand
from itertools import count
from torch.distributions import Categorical
# Base random seed for the notebook.
# NOTE(review): no cell visible here reads this name (the environment is seeded
# with a literal 0 further down) — confirm it is actually used.
main_seed = 0

In [3]:
import torch.multiprocessing as mp 

In [6]:
# Exploratory check: ask torch.multiprocessing which tensor-sharing strategies
# it reports for this machine (the result is shown as the cell output).
mp.get_all_sharing_strategies()

{'file_system'}

In [5]:
# Exploratory check: create five pipes and print each pair of connection ends.
# The pipes are not stored, so nothing here is reused by later cells.
for rank in range(5):
    print(mp.Pipe())

(<multiprocessing.connection.PipeConnection object at 0x0000027D69527F48>, <multiprocessing.connection.PipeConnection object at 0x0000027D6952D648>)
(<multiprocessing.connection.PipeConnection object at 0x0000027D6952D648>, <multiprocessing.connection.PipeConnection object at 0x0000027D6952D4C8>)
(<multiprocessing.connection.PipeConnection object at 0x0000027D6952D648>, <multiprocessing.connection.PipeConnection object at 0x0000027D6952D6C8>)
(<multiprocessing.connection.PipeConnection object at 0x0000027D6952D648>, <multiprocessing.connection.PipeConnection object at 0x0000027D6952D088>)
(<multiprocessing.connection.PipeConnection object at 0x0000027D6952D648>, <multiprocessing.connection.PipeConnection object at 0x0000027D6952D4C8>)


In [3]:
# Create the CartPole-v0 environment used by the rest of the notebook and seed it with 0.
env = gym.make("CartPole-v0")
env.seed(0)
# Bare expression so the notebook displays the environment's repr as the cell output.
env

<TimeLimit<CartPoleEnv<CartPole-v0>>>

In [4]:
class linearApproximator_A2C(nn.Module):
    """Fully connected actor-critic network with a shared trunk and two heads.

    The trunk is ``fc1`` followed by every layer in ``hidden_layers``, each
    followed by a ReLU. ``policy_output_layer`` maps the trunk output to action
    logits and ``state_value_output_layer`` maps it to the state-value estimate.

    Args:
        state_shape: Number of input features (``in_features`` of ``fc1``).
        policy_outputs: Number of logits produced by the policy head.
        state_value_output: Width of the value head output.
        hidden_dims: Widths of the trunk layers; must contain at least one entry.
    """

    def __init__(self,state_shape,policy_outputs, state_value_output=1, hidden_dims=(32,32)):
        super(linearApproximator_A2C, self).__init__()
        self.input_size = state_shape
        self.policy_outputs = policy_outputs
        self.state_value_output = state_value_output
        # The device is chosen once at construction; inputs are moved to it in forward().
        self.device = torch.device("cuda" if torch.cuda.is_available()
                                   else "cpu")

        self.fc1 = nn.Linear(self.input_size, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i + 1])
            self.hidden_layers.append(hidden_layer)

        self.policy_output_layer = nn.Linear(hidden_dims[-1], self.policy_outputs)
        self.state_value_output_layer = nn.Linear(hidden_dims[-1], self.state_value_output)
        self.to(self.device)

    def forward(self, state_shape):
        """Run the network and return ``(logits, state_value)``.

        Args:
            state_shape: The input state(s). Despite the name this is the state
                data itself; anything that is not a tensor is converted to a
                float32 tensor before being moved to ``self.device``.

        Returns:
            A tuple ``(logits, state_value)`` holding the outputs of the policy
            head and of the value head for the given input.
        """
        if not isinstance(state_shape, torch.Tensor):
            state_shape = torch.tensor(state_shape, dtype=torch.float32)
        state_shape = state_shape.to(self.device)

        x = self.fc1(state_shape)
        x = F.relu(x)

        for hidden_layer in self.hidden_layers:
            x = F.relu(hidden_layer(x))

        logits = self.policy_output_layer(x)  # unnormalised action preferences
        state_value = self.state_value_output_layer(x)  # predicted state value
        return logits, state_value

    def full_pass(self, state):
        """Sample one action for ``state`` and return the quantities used for training.

        Args:
            state: A single state as a tensor or anything ``torch.tensor`` accepts
                (list, NumPy array, ...). ``action.item()`` below requires exactly
                one sampled action, so a batch of several states is not supported.

        Returns:
            A 5-tuple ``(action, log_prob_action, entropy, logits, state_value)``:
            the sampled action as a Python number, the log-probability of that
            action and the distribution entropy (each with a trailing dimension
            added), and the raw outputs of both heads.
        """
        # Fix: the original tested and assigned an unbound local ``state_shape``
        # (UnboundLocalError) and moved the input to an undefined global
        # ``device``; the ``state`` argument and ``self.device`` are used instead.
        if not isinstance(state, torch.Tensor):
            state = torch.tensor(state, dtype=torch.float32)
        state = state.float().to(self.device)

        logits, state_value = self.forward(state)
        distribution = Categorical(logits=logits)
        action = distribution.sample()  # draw an action from the policy
        # Log-probability of the sampled action, kept attached to the graph.
        log_prob_action = distribution.log_prob(action).unsqueeze(-1)
        entropy = distribution.entropy().unsqueeze(-1)
        return action.item(), log_prob_action, entropy, logits, state_value

In [6]:
class linearApproximator_FCV(nn.Module):
    """Fully connected network that maps a state to ``outputs`` values.

    Layout: ``fc1`` -> ReLU -> each layer of ``hidden_layers`` followed by ReLU
    -> ``output_layer`` (no activation on the output).
    """

    def __init__(self,state_shape,outputs,hidden_dims=(32,32)):
        super().__init__()
        self.input_size, self.out = state_shape, outputs
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")

        # Layers are created in forward order: input projection, hidden stack, head.
        self.fc1 = nn.Linear(self.input_size, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for in_width, out_width in zip(hidden_dims[:-1], hidden_dims[1:]):
            self.hidden_layers.append(nn.Linear(in_width, out_width))
        self.output_layer = nn.Linear(hidden_dims[-1], self.out)

        self.to(self.device)

    def forward(self, state_shape):
        """Return the network output for the given state(s).

        ``state_shape`` is the state data itself; non-tensor inputs are turned
        into a float32 tensor, and the input is moved to ``self.device``.
        """
        features = state_shape
        if not isinstance(features, torch.Tensor):
            features = torch.tensor(features, dtype=torch.float32)
        features = features.to(self.device)

        activations = F.relu(self.fc1(features))
        for layer in self.hidden_layers:
            activations = F.relu(layer(activations))

        # Output of the final linear layer, returned without an activation.
        return self.output_layer(activations)

In [5]:
def select_action(action_model, state):
    """Sample an action from ``action_model`` for ``state``.

    Args:
        action_model: Object exposing ``full_pass(state)`` whose result starts
            with ``(action, log_prob_action, entropy)``.
        state: The state handed through to ``full_pass`` unchanged.

    Returns:
        The tuple ``(action, log_prob_action, entropy)``.
    """
    # Fix: ``full_pass`` returns five values (action, log-prob, entropy, logits,
    # state value); unpacking into four names raised ValueError. Only the first
    # three are needed here, so any trailing values are discarded.
    action, log_prob_action, entropy, *_ = action_model.full_pass(state)
    return action, log_prob_action, entropy

In [4]:
def Multiprocess_env(env, local_env_seed, A2C_network):
    """Run episodes in ``env`` for a fixed wall-clock budget, optimizing as it goes.

    Each step samples an action with ``select_action`` and stores the reward,
    state, log-probability and entropy. When the episode ends or ``max_steps``
    steps have been collected, the stored rollout is handed to
    ``optimize_model`` and the buffers are cleared.

    Besides its arguments this function relies on module-level names that are
    not defined inside it: ``select_action``, ``optimize_model``, ``max_steps``,
    ``gamma``, ``entropy_beta``, ``gae_tau``, ``A2C_value_network``,
    ``A2C_policy_network``, ``A2C_policy_optimizer`` and ``A2C_value_optimizer``.

    Args:
        env: Environment with the classic Gym API (``seed``, ``reset`` and a
            ``step`` returning ``(next_state, reward, done, info)``).
        local_env_seed: Seed passed to ``env.seed``.
        A2C_network: Model used to select actions.

    Returns:
        List with the accumulated reward of every episode that finished before
        the time budget ran out.
    """
    env.seed(local_env_seed)
    abort_after = 60  # wall-clock budget in seconds
    start = time.time()
    terminate_env = False
    ovr_rewards = []

    while not terminate_env:
        state = env.reset()
        acc_rewards = 0

        n_steps = 0  # step index at which the last rollout was flushed
        # Fix: this buffer was created as ``reward_state`` but used everywhere
        # as ``reward_store``, which raised NameError on the first step.
        reward_store = []
        state_store = []
        log_pa = []
        entropy_store = []

        for step in count(start=1):
            # Fix: act with the network that was passed in; the parameter was
            # previously ignored in favour of a global.
            action, log_prob, entropy = select_action(A2C_network, state)
            next_state, reward, done, info = env.step(action)
            acc_rewards += reward
            # An episode cut short by the TimeLimit wrapper is not a failure.
            is_truncated = 'TimeLimit.truncated' in info and \
                info['TimeLimit.truncated']
            is_failure = done and not is_truncated

            reward_store.append(reward)
            state_store.append(state)
            log_pa.append(log_prob)
            entropy_store.append(entropy)
            state = next_state

            # ``>=`` rather than ``==`` so that a rollout skipped for being too
            # short is retried on the next step instead of never firing again.
            if done or step - n_steps >= max_steps:
                if is_failure:
                    reward_store[-1] = 0.0
                # Fix: a too-short rollout used to ``continue``, which skipped
                # the ``done`` handling below and stepped a finished episode.
                if len(reward_store) >= 2:
                    # NOTE(review): the value network, policy network and both
                    # optimizers still come from module globals — confirm they
                    # refer to the same model as ``A2C_network``.
                    optimize_model(reward_store, log_pa,
                                   state_store, entropy_store, A2C_value_network,
                                   A2C_policy_network, A2C_policy_optimizer,
                                   A2C_value_optimizer, gamma, entropy_beta,
                                   gae_tau)
                    reward_store = []
                    state_store = []
                    log_pa = []
                    entropy_store = []
                    n_steps = step

            # Fix: elapsed time is measured on every step (it used to be read
            # once per episode) and the flag the outer loop tests is the one
            # that gets set (it used to set an unrelated ``terminate_worker``).
            timed_out = time.time() - start >= abort_after
            if done:
                ovr_rewards.append(acc_rewards)
            if timed_out:
                terminate_env = True
            if done or timed_out:
                break
    return ovr_rewards

In [None]:
def A2C_(env,
        n_envs,
        gamma=0.99):
    
    obs_space = len(env.reset())
    action_space = len(env.action_space.high)
    A2C_network = linearApproximator_A2C(obs_space, action_space)
    A2C_network_optimizer = torch.optim.Adam(A2C_network.parameters(),lr=0.0008, weight_decay = 0.01)
    
    for seed in range(n_envs):
        #get trajectory from environment seed using model
        
        #optimize model
        