In [None]:
#pip install gym
#pip install torch
#pip install torchvision
#pip install matplotlib


In [None]:
import gym
import tensorflow
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


In [None]:
# Pick the torch device first: CUDA when torch reports it available, else CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Put pyplot into interactive mode.
plt.ion()

# Check whether the active matplotlib backend name contains 'inline'; only in
# that case import IPython's display module.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display


In [None]:
# Smoke test: drive MountainCar with random actions for 1000 rendered steps.
env = gym.make('MountainCar-v0')
env.reset()
try:
    for _ in range(1000):
        env.render()
        step_result = env.step(env.action_space.sample())  # take a random action
        # Everything between the reward (index 1) and the trailing info dict is
        # an end-of-episode flag: `done` in the 4-tuple API, or
        # `terminated`/`truncated` in the 5-tuple API. Stepping an environment
        # that has already finished is not valid, so start a new episode.
        if any(step_result[2:-1]):
            env.reset()
finally:
    # Always release the render window, even if rendering/stepping raised.
    env.close()

In [None]:
def getHeight(x_position):
    """Evaluate ``sin(3 * x_position) * 0.45 + 0.55``.

    Works on scalars and numpy arrays alike (element-wise via ``np.sin``).
    Presumably this is the track height at a given x position -- the formula
    itself is all this function establishes.
    """
    amplitude, offset = .45, .55
    return np.sin(3 * x_position) * amplitude + offset

In [None]:
def newreward(pos):
    """Shaped reward computed from a position value.

    Returns 2 when ``pos`` is at or beyond 0.5. Otherwise returns
    ``(pos + 1.2) / 1.8 - 1``, which increases linearly with ``pos`` and
    equals -1 at ``pos == -1.2``.
    """
    goal_reached = pos >= 0.5
    return 2 if goal_reached else (pos + 1.2) / 1.8 - 1

In [None]:
# Define Q-learning function
def QLearning(env, learning, epsilon, min_eps, episodes):
    """Train a tabular Q-learning agent on an environment with 2-D observations.

    Observations are mapped onto an integer grid by scaling their offset from
    ``env.observation_space.low`` by ``[10, 50]`` and rounding. Non-terminal
    updates use the shaped reward ``newreward(next_position)`` and apply no
    discounting to the bootstrapped value.

    Parameters
    ----------
    env : gym-style environment
        Must provide ``observation_space.low``/``high``, ``action_space.n``,
        ``reset()``, ``step(action)``, ``render()`` and ``close()``. Both the
        ``(obs, reward, done, info)`` and the
        ``(obs, reward, terminated, truncated, info)`` step conventions are
        accepted, as is ``reset()`` returning either ``obs`` or ``(obs, info)``.
    learning : float
        Step size applied to the temporal-difference error.
    epsilon : float
        Initial exploration probability. The same value is reused as the
        per-episode multiplicative decay factor.
    min_eps : float
        Epsilon is only decayed while it is greater than this value.
    episodes : int
        Number of episodes to run.

    Returns
    -------
    ave_reward_list : list
        Mean shaped episode return for each completed block of 100 episodes.
    Q : numpy.ndarray
        Learned table of shape ``(n_bins_0, n_bins_1, env.action_space.n)``.
    Qinit : numpy.ndarray
        Copy of the table as it was randomly initialised.

    Side effects: renders the first and last five episodes, prints progress
    every 100 episodes and on the first episode that reaches position >= 0.5,
    and closes ``env`` before returning.
    """
    grid_scale = np.array([10, 50])
    obs_low = env.observation_space.low

    def discretize(observation):
        # Map a continuous observation to integer grid indices.
        return np.round((observation - obs_low) * grid_scale, 0).astype(int)

    # Determine size of discretized state space
    num_states = (env.observation_space.high - obs_low) * grid_scale
    num_states = np.round(num_states, 0).astype(int) + 1

    # Initialize Q table
    Q = np.random.uniform(low=-1, high=0,
                          size=(num_states[0], num_states[1],
                                env.action_space.n))
    Qinit = np.copy(Q)

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # The initial epsilon doubles as the per-episode decay factor
    decay = epsilon

    # Keep track of first success (sentinel: no episode has succeeded yet)
    first = episodes + 1

    # Run Q learning algorithm
    for i in range(episodes):
        done = False
        tot_reward = 0

        state = env.reset()
        if isinstance(state, tuple):
            # Newer gym API: reset() returns (observation, info)
            state = state[0]
        state_adj = discretize(state)

        # Render environment for the first and last few episodes
        render_episode = i >= (episodes - 5) or i < 5

        while not done:
            if render_episode:
                env.render()

            # Determine next action - epsilon greedy strategy
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # Get next state and reward. Entries between the reward and the
            # trailing info dict are the end-of-episode flag(s).
            step_result = env.step(action)
            state2, reward = step_result[0], step_result[1]
            done = bool(any(step_result[2:-1]))

            state2_adj = discretize(state2)
            shaped = newreward(state2[0])

            if done and state2[0] >= 0.5:
                # Terminal success: no bootstrapping from the next state.
                # NOTE(review): this stores the raw environment reward while
                # every other update uses newreward() -- confirm intended.
                Q[state_adj[0], state_adj[1], action] = reward
            else:
                # TD update toward shaped reward + best next-state value,
                # measured against the current estimate Q[s, a].
                td_error = (shaped
                            + np.max(Q[state2_adj[0], state2_adj[1]])
                            - Q[state_adj[0], state_adj[1], action])
                Q[state_adj[0], state_adj[1], action] += learning * td_error

            # Notify on the first episode that reaches the goal position.
            # This must look at the *new* observation; the reset observation
            # never changes during the episode.
            if state2[0] >= 0.5 and i < first:
                first = i
                print('First clear on episode {}'.format(i+1))

            # Update variables
            tot_reward += shaped
            state_adj = state2_adj

        # Decay epsilon
        if epsilon > min_eps:
            epsilon *= decay

        # Track rewards, summarising every 100 episodes
        reward_list.append(tot_reward)

        if (i+1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}' .format(i+1, ave_reward))

    env.close()

    return ave_reward_list, Q, Qinit

In [None]:
# Train the tabular agent: reset the environment, then run 10000 episodes
# with learning rate 0.2, starting epsilon 0.999 and an epsilon floor of 0.
env.reset()
rewards, Qpts, Qinit = QLearning(
    env,
    learning=0.2,
    epsilon=0.999,
    min_eps=0,
    episodes=10000,
)