In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import ipympl
import glob, os
import gym
import math
import random
from itertools import count

In [2]:
from model import Actor, Critic, OrnsteinUhlenbeckActionNoise
from memory import Transition, ReplayMemory
from train import optimize_model_AC

In [3]:
# Build the continuous-action Mountain Car task, then print one summary line
# for the action space and one for the observation space (size/space, high, low).
env = gym.make('MountainCarContinuous-v0')
for space_summary in (
    (env.action_space, env.action_space.high, env.action_space.low),
    (env.observation_space.shape[0], env.observation_space.high, env.observation_space.low),
):
    print(*space_summary)

Box(1,) [1.] [-1.]
2 [0.6  0.07] [-1.2  -0.07]


In [4]:
# Hyperparameters and learning components shared by the training cells below.
BATCH_SIZE = 32
GAMMA = 0.999   # discount factor applied when turning episode rewards into returns
LRA=0.0001      #LEARNING RATE ACTOR
LRC=0.001       #LEARNING RATE CRITIC
# epsilon / epsilon_decay are only referenced by the (currently commented-out)
# exploration schedule in the actor-driven training cell.
epsilon = 1
epsilon_decay = 1./1000000
noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(1))
print(noise())
# The second constructor argument is presumably the action dimension
# (env.action_space is Box(1,)) -- TODO confirm against model.py.
actor = Actor(env.observation_space.shape[0], 1)
critic = Critic(env.observation_space.shape[0], 1)

# Each optimizer uses the learning rate named for its own network: the critic
# (Q) optimizer takes LRC and the actor (policy) optimizer takes LRA. These two
# were previously swapped relative to the constants' documented meaning.
q_optimizer = optim.Adam(critic.parameters(), lr=LRC)
policy_optimizer= optim.Adam(actor.parameters(), lr=LRA)

# 10000 is presumably the replay buffer capacity -- verify against memory.py.
memory = ReplayMemory(10000)
loss_fn = nn.MSELoss().type(torch.FloatTensor)

[0.02876802]


In [None]:
# Warm-up phase: episodes are driven purely by scaled Ornstein-Uhlenbeck noise
# (the actor is never queried here). Each finished episode is written to the
# replay memory as (state, action, discounted return) samples.
num_episodes = 100
step_counter = 0
successes = 0
for i_episode in range(num_episodes):
    # Initialize the environment and the per-episode buffers
    state = torch.from_numpy(env.reset()).type(torch.FloatTensor)
    states = []
    rewards = []
    actions = []
    counter = 0

    for t in count():
        action = torch.from_numpy(10*noise()).type(torch.FloatTensor)
        next_state, reward, done, _ = env.step(action.detach().numpy())

        # Shaped reward replaces the environment's own: -1 for every step, and
        # +100 on a terminal step that reaches the goal. This must be decided
        # BEFORE the reward is appended, otherwise the +100 is never recorded.
        reached_goal = bool(done and next_state[0] >= env.goal_position)
        if reached_goal:
            successes += 1
        reward = torch.tensor([100. if reached_goal else -1.]).type(torch.FloatTensor)

        states.append(state)
        rewards.append(reward)
        actions.append(action.detach())

        if not done:
            next_state = torch.from_numpy(next_state).type(torch.FloatTensor)
            # Move to the next state
            state = next_state
        else:
            next_state = None

        counter += 1
        step_counter += 1
        # Run one optimization call after every environment step
        optimize_model_AC(actor, critic, q_optimizer, policy_optimizer, 
                              memory, loss_fn, BATCH_SIZE)
        if done:
            # Discounted return for every step, built backwards:
            #   G_t = r_t + GAMMA * G_{t+1},  with G after the last step = 0.
            # Equivalent to summing r_i * GAMMA**i from each step onward, but
            # avoids repeated division by GAMMA.
            discounted_rewards = []
            running_return = torch.zeros(1)
            for step_reward in reversed(rewards):
                running_return = step_reward + GAMMA*running_return
                discounted_rewards.append(running_return)
            discounted_rewards.reverse()
            # Store the transitions in memory with their discounted returns,
            # matching what the actor-driven training cell stores.
            for i in range(len(rewards)):
                memory.push(states[i], actions[i], discounted_rewards[i])
            break
print(successes/num_episodes*100)
print('Complete')
env.close()

In [None]:
# Evaluation: run the current actor (no exploration noise added) for 100
# rendered episodes and count how many finish at or beyond the goal position.
successes = 0
for i_episode in range(100):
    state = torch.from_numpy(env.reset()).float()
    for t in count():
        env.render()
        # Query the policy and step the environment with its action.
        action = actor(state)
        next_state, reward, done, _ = env.step(action.detach().numpy())
        reward = torch.tensor([reward]).float()

        # The new observation becomes the current state, terminal or not.
        state = next_state = torch.from_numpy(next_state).float()

        if not done:
            continue
        if state[0] >= env.goal_position:
            successes += 1
        break
print(successes)
print('Complete')
env.close()

In [None]:
# Actor-driven training: the policy chooses every action (the exploration
# schedule is currently disabled) and each finished episode is stored in the
# replay memory as (state, action, discounted return) samples.
num_episodes = 100
step_counter = 0
successes = 0
for i_episode in range(num_episodes):
    # Fresh episode: reset the environment and the per-episode buffers.
    state = torch.from_numpy(env.reset()).float()
    states, rewards, actions = [], [], []
    counter = 0

    for t in count():
        action = actor(state)
        # Disabled exploration schedule, kept for reference:
        #epsilon -=  epsilon_decay
        #action += torch.from_numpy(10*noise()*max(0, epsilon)).type(torch.FloatTensor)
        next_state, reward, done, _ = env.step(action.detach().numpy())
        reward = torch.tensor([reward]).float()
        #env.render()

        states.append(state)
        rewards.append(reward)
        actions.append(action.detach())

        if done:
            # Terminal step: count a success when the car is at or past the goal.
            if next_state[0] >= env.goal_position:
                successes += 1
            next_state = None
        else:
            # Advance to the observation just returned by the environment.
            next_state = torch.from_numpy(next_state).float()
            state = next_state

        counter += 1
        step_counter += 1
        # Run one optimization call after every environment step.
        optimize_model_AC(actor, critic, q_optimizer, policy_optimizer, 
                              memory, loss_fn, BATCH_SIZE)
        if not done:
            continue

        # Episode finished. Entry 0 is the discounted sum of all rewards; each
        # later entry drops the previous step's reward and divides by GAMMA.
        discounted_rewards = [0]
        for i, step_reward in enumerate(rewards):
            discounted_rewards[0] += step_reward*(GAMMA**(i))
        for i in range(1, len(rewards)):
            discounted_rewards.append((discounted_rewards[i-1] - rewards[i-1])/GAMMA)
        # Store every step of the episode together with its discounted return.
        for i in range(len(rewards)):
            memory.push(states[i], actions[i], discounted_rewards[i])
        break
print(successes/num_episodes*100)
print('Complete')
env.close()

In [None]:
# Evaluation after the actor-driven training cell: run the actor (no
# exploration noise added) for 100 rendered episodes and count how many finish
# at or beyond the goal position.
successes = 0
for i_episode in range(100):
    state = torch.from_numpy(env.reset()).float()
    for t in count():
        env.render()
        # Query the policy and step the environment with its action.
        action = actor(state)
        next_state, reward, done, _ = env.step(action.detach().numpy())
        reward = torch.tensor([reward]).float()

        # The new observation becomes the current state, terminal or not.
        state = next_state = torch.from_numpy(next_state).float()

        if not done:
            continue
        if state[0] >= env.goal_position:
            successes += 1
        break
print(successes)
print('Complete')
env.close()