In [43]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import time

# Import and initialize Mountain Car Environment
# reset() is the last expression of the cell, so the initial observation it
# returns is displayed as the cell output.
env = gym.make('MountainCar-v0')
env.reset()

array([-0.4903313,  0.       ], dtype=float32)

In [44]:
# Define Q-learning function
def _discretize(state, low, scale):
    """Map a continuous observation onto integer Q-table indices.

    The observation is shifted so that `low` maps to index 0, multiplied
    element-wise by `scale` and rounded to the nearest integer.
    """
    return np.round((state - low) * scale, 0).astype(int)


def QLearning(env, learning, discount, epsilon, min_eps, episodes,
              render_last=10, render_delay=0.05):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    The environment is driven through the classic gym API: ``reset()``
    returns an observation and ``step(action)`` returns the 4-tuple
    ``(observation, reward, done, info)``. Observations are expected to have
    two components, which are discretized with 10 and 100 bins per unit
    respectively.

    Parameters
    ----------
    env : environment exposing ``observation_space.low/high``,
        ``action_space.n``, ``reset``, ``step``, ``render`` and ``close``.
        It is closed before this function returns.
    learning : float
        Step size applied to the temporal-difference error.
    discount : float
        Discount factor applied to the best next-state value.
    epsilon : float
        Initial probability of taking a random action.
    min_eps : float
        Floor for epsilon; epsilon decays linearly towards it over
        `episodes` episodes and is never pushed below it.
    episodes : int
        Number of training episodes. ``0`` trains nothing and returns ``[]``.
    render_last : int, optional
        Render every step of the final `render_last` episodes (default 10,
        matching the previous hard-coded behavior). Use 0 to disable.
    render_delay : float, optional
        Seconds to sleep after each rendered frame (default 0.05).

    Returns
    -------
    list
        Mean total reward of each consecutive block of 100 episodes. Episodes
        after the last complete block of 100 are not reported.
    """
    low = env.observation_space.low
    high = env.observation_space.high
    scale = np.array([10, 100])

    # Determine size of discretized state space (highest index + 1)
    num_states = _discretize(high, low, scale) + 1

    # Initialize Q table with small random values
    Q = np.random.uniform(low=-1, high=1,
                          size=(num_states[0], num_states[1],
                                env.action_space.n))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Per-episode linear reduction in epsilon; guard against episodes == 0,
    # which previously raised ZeroDivisionError.
    reduction = (epsilon - min_eps) / episodes if episodes > 0 else 0.0

    # Run Q learning algorithm
    for i in range(episodes):
        done = False
        tot_reward = 0
        state_adj = _discretize(env.reset(), low, scale)

        # Only the final `render_last` episodes are rendered.
        render_episode = i >= episodes - render_last

        while not done:
            if render_episode:
                env.render()
                if render_delay > 0:
                    time.sleep(render_delay)

            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward
            state2, reward, done, _ = env.step(action)
            state2_adj = _discretize(state2, low, scale)

            if done and state2[0] >= 0.5:
                # Episode ended with the first observation component at or
                # past 0.5 (presumably the MountainCar goal position): there
                # is no future value to bootstrap from.
                Q[state_adj[0], state_adj[1], action] = reward
            else:
                # Standard Q-learning update towards the bootstrapped target
                target = reward + discount * np.max(Q[state2_adj[0],
                                                      state2_adj[1]])
                delta = learning * (target -
                                    Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon, clamping so rounding error cannot push it below
        # the requested minimum.
        if epsilon > min_eps:
            epsilon = max(min_eps, epsilon - reduction)

        # Track rewards
        reward_list.append(tot_reward)

        if (i + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []

        # Every multiple of 500 is also a multiple of 100, so ave_reward is
        # always freshly computed here.
        if (i + 1) % 500 == 0:
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))

    env.close()

    return ave_reward_list

# Train the tabular agent; each hyper-parameter is named at the call site so
# the numbers are self-describing.
rewards = QLearning(
    env,
    learning=0.05,
    discount=0.9,
    epsilon=0.95,
    min_eps=0,
    episodes=8000,
)

Episode 500 Average Reward: -200.0
Episode 1000 Average Reward: -200.0
Episode 1500 Average Reward: -200.0
Episode 2000 Average Reward: -200.0
Episode 2500 Average Reward: -200.0
Episode 3000 Average Reward: -200.0
Episode 3500 Average Reward: -200.0
Episode 4000 Average Reward: -199.89
Episode 4500 Average Reward: -199.35
Episode 5000 Average Reward: -199.09
Episode 5500 Average Reward: -198.99
Episode 6000 Average Reward: -184.52
Episode 6500 Average Reward: -199.7
Episode 7000 Average Reward: -185.12
Episode 7500 Average Reward: -196.49
Episode 8000 Average Reward: -156.25


## Plot figure

In [45]:
# Plot Rewards (cell currently disabled; uncomment to save the learning curve
# to rewards.jpg).
# NOTE: QLearning appends one averaged value per 100 episodes, so the x-axis
# step is 100 episodes per point (the earlier factor of 50 mislabeled it).
# plt.plot(100*(np.arange(len(rewards)) + 1), rewards)
# plt.xlabel('Episodes')
# plt.ylabel('Average Reward')
# plt.title('Average Reward vs Episodes')
# plt.savefig('rewards.jpg')     
# plt.close()  

In [46]:
import argparse
import logging

parser = argparse.ArgumentParser()

# Every option is a plain "--flag value" pair, so they are declared as a
# (flag, type, default) table and registered in order.
_OPTIONS = (
    ('--env',           str,   'MountainCar-v0'),
    ('--seed',          int,   0),
    ('--gpu',           int,   0),
    ('--outdir',        str,   'results'),
    ('--beta',          float, 1e-4),
    ('--batchsize',     int,   10),
    ('--steps',         int,   1000),
    ('--lr',            float, 1e-3),
    ('--logger-level',  int,   logging.DEBUG),
    ('--eval-interval', int,   10 ** 4),
    ('--eval-n-runs',   int,   100),
)
for _flag, _type, _default in _OPTIONS:
    parser.add_argument(_flag, type=_type, default=_default)
# Keep the notebook namespace limited to the names the original cell defined.
del _OPTIONS, _flag, _type, _default

# parse_known_args() collects unrecognized arguments in `unknown` instead of
# exiting with an error.
args, unknown = parser.parse_known_args()

logging.basicConfig(level=args.logger_level)

## Environment

In [47]:
# ENVIRONMENT
import gym
import chainerrl

# Compose the environment inside-out: the raw gym environment, wrapped to cast
# observations to float32 (the model uses float32), wrapped again with
# ChainerRL's Render wrapper.
env = chainerrl.wrappers.Render(
    chainerrl.wrappers.CastObservationToFloat32(
        gym.make(args.env)
    )
)
# Seed ChainerRL with the configured seed and GPU id, then the environment.
chainerrl.misc.set_random_seed(args.seed, gpus=(args.gpu,))
env.seed(args.seed)

[0]

## NN

In [48]:
import chainer

# The environment's spaces fix the policy network's input and output sizes.
obs_space, action_space = env.observation_space, env.action_space

# Softmax policy: 2 hidden layers of 200 channels with leaky-ReLU activations.
model = chainerrl.policies.FCSoftmaxPolicy(
    obs_space.low.size,
    action_space.n,
    n_hidden_channels=200, n_hidden_layers=2,
    nonlinearity=chainer.functions.leaky_relu,
)

# A non-negative --gpu selects that device and moves the model onto it.
if args.gpu >= 0:
    chainer.cuda.get_device_from_id(args.gpu).use()
    model.to_gpu(args.gpu)

# Adam with the configured learning rate, plus gradient clipping (threshold 1).
opt = chainer.optimizers.Adam(alpha=args.lr)
opt.setup(model)
opt.add_hook(chainer.optimizer.GradientClipping(1))

## Agent

In [49]:
# REINFORCE agent built from the policy model and its optimizer; beta and
# batchsize are taken from the parsed command-line options.
agent = chainerrl.agents.REINFORCE(
    model, opt, beta=args.beta, batchsize=args.batchsize
)

## Evaluate