### Mountain car - all configurations

#### Team: Happy Campers

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Configuration 1 - random agent (learning rate = 0.1, epsilon = 1)

env = gym.make("MountainCar-v0")

# Learning hyperparameters
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes

# Discretisation: 20 buckets along each observation dimension
obs_low = env.observation_space.low
obs_high = env.observation_space.high
DISCRETE_OS_SIZE = [20] * len(obs_high)
discrete_os_win_size = (obs_high - obs_low) / DISCRETE_OS_SIZE

# Exploration settings. With epsilon equal to 1 the agent always acts randomly.
epsilon = 1
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table with one entry per (bucket, bucket, action): 20 x 20 x 3 here.
# Entries start uniformly in [-2, 0); 0 is the value written when the car reaches the flag.
q_table_shape = DISCRETE_OS_SIZE + [env.action_space.n]
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

# Statistics: per-episode rewards plus episode number / average / max / min snapshots
ep_rewards = []
rand_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the recorded average / min / max episode reward of the random agent.
for stat in ("avg", "min", "max"):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[stat], label=stat)
plt.legend(loc=2)  # location code 2 = upper left
plt.show()

In [None]:
# Configuration 1 - greedy agent (learning rate = 0.1, epsilon = 0.5)

# With the epsilon decay commented out in the training loop, epsilon stays
# fixed, i.e. the agent plays plain epsilon-greedy.

# Re-create the environment: the previous cell finished with env.close(), and
# the other greedy-agent cells also start from a fresh gym.make().
env = gym.make("MountainCar-v0")

LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes


# Discretisation: 20 buckets along each observation dimension
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE

# Exploration settings
epsilon = .5  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# payoff = 0 is the payoff when the car reaches the flag
# Initializes the 20x20x3 Q-table: the discretised observation space (20x20) for all three actions
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# For stats
ep_rewards = []  # reward of each episode, in order
greedy_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}  # tracks episode number, average, max, min

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()


In [None]:
# Configuration 1: average reward of the random agent vs. the greedy agent
comparison = (
    (rand_aggr_ep_rewards, "random avg"),
    (greedy_aggr_ep_rewards, "greedy avg"),
)
for stats, curve_label in comparison:
    plt.plot(stats['ep'], stats['avg'], label=curve_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1) vs Greedy Player (Epsilon = .5), LR .1")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)  # location code 2 = upper left
plt.show()


In [None]:
# Configuration 2 - random agent (learning rate = 0.5, epsilon = 1)

# Re-create the environment: the previous training cell finished with
# env.close(), and the greedy-agent cells of configurations 2-4 likewise
# start from a fresh gym.make().
env = gym.make("MountainCar-v0")

LEARNING_RATE = 0.5
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes


# Discretisation: 20 buckets along each observation dimension
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# payoff = 0 is the payoff when the car reaches the flag
# Initializes the 20x20x3 Q-table: the discretised observation space (20x20) for all three actions
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# For stats
ep_rewards = []  # reward of each episode, in order
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}  # tracks episode number, average, max, min

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the recorded average / min / max episode reward of the random agent.
for stat in ("avg", "min", "max"):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[stat], label=stat)
plt.legend(loc=2)  # location code 2 = upper left
plt.show()

In [None]:
# Configuration 2 - greedy player (learning rate = 0.5, epsilon = 0.5)

# With the epsilon decay commented out, epsilon stays fixed, i.e. the agent
# plays plain epsilon-greedy.
env = gym.make("MountainCar-v0")

# Learning hyperparameters
LEARNING_RATE = 0.5
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes

# Discretisation: 20 buckets along each observation dimension
obs_low = env.observation_space.low
obs_high = env.observation_space.high
DISCRETE_OS_SIZE = [20] * len(obs_high)
discrete_os_win_size = (obs_high - obs_low) / DISCRETE_OS_SIZE

# Exploration settings. An epsilon of 1 would make the agent act fully randomly.
epsilon = .5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table with one entry per (bucket, bucket, action): 20 x 20 x 3 here.
# Entries start uniformly in [-2, 0); 0 is the value written when the car reaches the flag.
q_table_shape = DISCRETE_OS_SIZE + [env.action_space.n]
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

# Statistics: per-episode rewards plus episode number / average / max / min snapshots
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

In [None]:
# Configuration 2: average reward of the random agent vs. the greedy agent

comparison = (
    (rand_aggr_ep_rewards, "random avg"),
    (greedy_aggr_ep_rewards, "greedy avg"),
)
for stats, curve_label in comparison:
    plt.plot(stats['ep'], stats['avg'], label=curve_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1)  vs Greedy Player (Epsilon .5), LR .5")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)  # location code 2 = upper left
plt.show()


In [None]:
# Configuration 3 - random player (learning rate = 0.1, epsilon = 1)

# Re-create the environment: the previous training cell finished with
# env.close(), and the greedy-agent cells of configurations 2-4 likewise
# start from a fresh gym.make().
env = gym.make("MountainCar-v0")

LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes


# Discretisation: 20 buckets along each observation dimension
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# payoff = 0 is the payoff when the car reaches the flag
# Initializes the 20x20x3 Q-table: the discretised observation space (20x20) for all three actions
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# For stats
ep_rewards = []  # reward of each episode, in order
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}  # tracks episode number, average, max, min

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the recorded average / min / max episode reward of the random agent.
for stat in ("avg", "min", "max"):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[stat], label=stat)
plt.legend(loc=2)  # location code 2 = upper left
plt.show()

In [None]:
# Configuration 3 - greedy player (learning rate = 0.1, epsilon = 0.1)

# With the epsilon decay commented out, epsilon stays fixed, i.e. the agent
# plays plain epsilon-greedy.
env = gym.make("MountainCar-v0")

# Learning hyperparameters
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes

# Discretisation: 20 buckets along each observation dimension
obs_low = env.observation_space.low
obs_high = env.observation_space.high
DISCRETE_OS_SIZE = [20] * len(obs_high)
discrete_os_win_size = (obs_high - obs_low) / DISCRETE_OS_SIZE

# Exploration settings. An epsilon of 1 would make the agent act fully randomly.
epsilon = .1
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table with one entry per (bucket, bucket, action): 20 x 20 x 3 here.
# Entries start uniformly in [-2, 0); 0 is the value written when the car reaches the flag.
q_table_shape = DISCRETE_OS_SIZE + [env.action_space.n]
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

# Statistics: per-episode rewards plus episode number / average / max / min snapshots
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

In [None]:
# Configuration 3: average reward of the random agent vs. the greedy agent

comparison = (
    (rand_aggr_ep_rewards, "random avg"),
    (greedy_aggr_ep_rewards, "greedy avg"),
)
for stats, curve_label in comparison:
    plt.plot(stats['ep'], stats['avg'], label=curve_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1) vs Greedy Player (Epsilon .1) LR .1")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)  # location code 2 = upper left
plt.show()

In [None]:
# Configuration 4 - random player (learning rate = 0.1, epsilon = 1)

# Re-create the environment: the previous training cell finished with
# env.close(), and the greedy-agent cells of configurations 2-4 likewise
# start from a fresh gym.make().
env = gym.make("MountainCar-v0")

LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500   # render and record stats on every SHOW_EVERY-th episode
STATS_EVERY = 100  # currently unused: stats are aggregated every SHOW_EVERY episodes


# Discretisation: 20 buckets along each observation dimension
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low)/DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# payoff = 0 is the payoff when the car reaches the flag
# Initializes the 20x20x3 Q-table: the discretised observation space (20x20) for all three actions
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# For stats
ep_rewards = []  # reward of each episode, in order
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}  # tracks episode number, average, max, min

def get_discrete_state(state):
    """Map a continuous observation to its bucket indices in the Q-table.

    Args:
        state: Observation array, e.g. as returned by ``env.reset()`` or
            ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension),
        used to look up the Q-values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating cast everywhere.
    return tuple(discrete_state.astype(int))


# Tabular Q-learning: play EPISODES episodes, updating q_table after every step.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every SHOW_EVERY-th episode is announced, rendered and recorded.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: a random action with probability epsilon,
        # otherwise the action with the highest Q-value for this state.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        state_action = discrete_state + (action,)
        if done:
            # Episode ended (for any reason): the entry is overwritten with 0
            # only when the goal position was actually reached.
            if new_state[0] >= env.goal_position:
                q_table[state_action] = 0
        else:
            # Best Q-value reachable from the new state
            max_future_q = np.max(q_table[new_discrete_state])
            # Q-value of the action just taken in the old state
            current_q = q_table[state_action]
            # Blend the old estimate with the one-step lookahead target
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[state_action] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled here; uncomment to decay epsilon
    # while the episode number is inside the decay range.
    # if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #     epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the recorded average / min / max episode reward of the random agent.
for stat in ("avg", "min", "max"):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[stat], label=stat)
plt.legend(loc=4)  # location code 4 = lower right
plt.show()

In [None]:
# Configuration 4 - greedy player (learning rate = 0.1, epsilon = 0.01)

# With the epsilon decay commented out, epsilon stays fixed, i.e. the agent
# plays plain epsilon-greedy.
env = gym.make("MountainCar-v0")

# Learning hyperparameters
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500
STATS_EVERY = 100

# Discretisation: 20 buckets along each observation dimension
obs_low = env.observation_space.low
obs_high = env.observation_space.high
DISCRETE_OS_SIZE = [20] * len(obs_high)
discrete_os_win_size = (obs_high - obs_low) / DISCRETE_OS_SIZE

# Exploration settings. An epsilon of 1 would make the agent act fully randomly.
epsilon = .01
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table with one entry per (bucket, bucket, action): 20 x 20 x 3 here.
# Entries start uniformly in [-2, 0); a payoff of 0 corresponds to the car reaching the flag.
q_table_shape = DISCRETE_OS_SIZE + [env.action_space.n]
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

# Statistics: per-episode rewards plus episode number / average / max / min snapshots
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop for the greedy player (epsilon stays fixed).
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled for this configuration.
    #if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #    epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

In [None]:
# Configuration 4 plot: windowed average reward, random agent vs. greedy player
comparison = (
    ("random avg", rand_aggr_ep_rewards),
    ("greedy avg", greedy_aggr_ep_rewards),
)
for series_label, stats in comparison:
    plt.plot(stats['ep'], stats['avg'], label=series_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1) vs Greedy Player (Epsilon .01) LR .1")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)
plt.show()

In [None]:
# Configuration 5 - random player
#
# Learning rate = .01, epsilon = 1

# Create a fresh environment for this cell instead of relying on the one a
# previous cell created and already closed.
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence
LEARNING_RATE = 0.01
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500
STATS_EVERY = 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop for the random player (epsilon stays fixed at its
# initial value because the decay step below is disabled).
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled for this configuration.
    #if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #    epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the average, minimum and maximum episode reward per reporting window.
for series in ('avg', 'min', 'max'):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[series], label=series)
plt.legend(loc=4)
plt.show()

In [None]:
# Configuration 5 - greedy player
#
# Learning rate = .01, epsilon = .01
#
# The epsilon-decay step in this cell's training loop is commented out, so
# epsilon stays fixed and the agent plays plain epsilon-greedy.
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence
LEARNING_RATE, DISCOUNT = 0.01, 0.95
EPISODES = 10_000
SHOW_EVERY, STATS_EVERY = 500, 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings (epsilon == 1 would make the agent play fully at random)
epsilon = .01
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=[*DISCRETE_OS_SIZE, env.action_space.n])

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop for the greedy player (epsilon stays fixed).
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled for this configuration.
    #if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #    epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

In [None]:
# Configuration 5 plot: windowed average reward, random agent vs. greedy player
comparison = (
    ("random avg", rand_aggr_ep_rewards),
    ("greedy avg", greedy_aggr_ep_rewards),
)
for series_label, stats in comparison:
    plt.plot(stats['ep'], stats['avg'], label=series_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1) vs Greedy Player (Epsilon .01) LR .01")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)
plt.show()

In [None]:
# Configuration 6 - random player
#
# Learning rate = .1, epsilon = 1

# Create a fresh environment for this cell instead of relying on the one a
# previous cell created and already closed.
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence.
# LEARNING_RATE matches the paired greedy cell and the "LR .1" plot title; it
# used to be 0.01, left over from configuration 5. With epsilon fixed at 1 the
# Q-table is never consulted for action selection, so the reward curve of the
# random player does not depend on this value.
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500
STATS_EVERY = 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop for the random player (epsilon stays fixed at its
# initial value because the decay step below is disabled).
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled for this configuration.
    #if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #    epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the average, minimum and maximum episode reward per reporting window.
for series in ('avg', 'min', 'max'):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[series], label=series)
plt.legend(loc=2)
plt.show()

In [None]:
# Configuration 6 - greedy agent
#
# Learning rate = .1, epsilon = .001
#
# The epsilon-decay step in this cell's training loop is commented out, so
# epsilon stays fixed and the agent plays plain epsilon-greedy.
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence
LEARNING_RATE, DISCOUNT = 0.1, 0.95
EPISODES = 10_000
SHOW_EVERY, STATS_EVERY = 500, 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings (epsilon == 1 would make the agent play fully at random)
epsilon = .001
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=[*DISCRETE_OS_SIZE, env.action_space.n])

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop for the greedy agent (epsilon stays fixed).
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Epsilon decay is deliberately disabled for this configuration.
    #if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
    #    epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        # Three decimals: this configuration's epsilon is .001, which a
        # two-decimal format would always display as 0.00.
        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.3f}')
env.close()

In [None]:
# Configuration 6 plot: windowed average reward, random agent vs. greedy player
comparison = (
    ("random avg", rand_aggr_ep_rewards),
    ("greedy avg", greedy_aggr_ep_rewards),
)
for series_label, stats in comparison:
    plt.plot(stats['ep'], stats['avg'], label=series_label)
plt.axis('on')
plt.title("Random Agent (Epsilon = 1) vs Greedy Player (Epsilon .001), LR .1")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)
plt.show()

In [None]:
# Configuration 7 - random agent
#
# Learning rate = .1, epsilon = 1, adding epsilon decay
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 10000
SHOW_EVERY = 500
STATS_EVERY = 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings
epsilon = 1  # Exploration/exploitation parameter. When equal to 1 the agent plays randomly
# The decay parameters must be defined here: this cell's training loop uses
# them, and leaving them commented out either raises NameError or silently
# reuses the decay step computed for an earlier configuration's epsilon.
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
rand_aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))


# Q-learning training loop with linear epsilon decay, starting from the
# fully random agent.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Decay epsilon once per episode while the episode number is inside the
    # decay window. Comment out to avoid epsilon decay.
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        # The window is inclusive at both ends and float error accumulates, so
        # the running value can overshoot; clamp to keep epsilon non-negative.
        epsilon = max(0.0, epsilon - epsilon_decay_value)

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        rand_aggr_ep_rewards['ep'].append(episode)
        rand_aggr_ep_rewards['avg'].append(average_reward)
        rand_aggr_ep_rewards['min'].append(min(recent_rewards))
        rand_aggr_ep_rewards['max'].append(max(recent_rewards))

        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
env.close()

# Plot the average, minimum and maximum episode reward per reporting window.
for series in ('avg', 'min', 'max'):
    plt.plot(rand_aggr_ep_rewards['ep'], rand_aggr_ep_rewards[series], label=series)
plt.legend(loc=2)
plt.show()

In [None]:
# Configuration 7 - greedy player
#
# Learning rate = .1, epsilon = .01, with epsilon decay
#
# Unlike the earlier greedy configurations, this cell's training loop leaves
# the decay step enabled, so epsilon shrinks by epsilon_decay_value per
# episode inside the decay window.
env = gym.make("MountainCar-v0")

# Q-learning hyperparameters and reporting cadence
LEARNING_RATE, DISCOUNT = 0.1, 0.95
EPISODES = 10_000
SHOW_EVERY, STATS_EVERY = 500, 100

# Discretisation: 20 buckets per observation dimension, and the width of one bucket
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE

# Exploration settings (epsilon == 1 would make the agent only explore)
epsilon = .01
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Q-table: one axis per observation dimension plus one axis for the actions,
# initialised uniformly in [-2, 0). Reaching the flag is later written as 0.
q_table = np.random.uniform(low=-2, high=0, size=[*DISCRETE_OS_SIZE, env.action_space.n])

# Statistics: per-episode rewards and per-window episode number / avg / max / min
ep_rewards = []
greedy_aggr_ep_rewards = {key: [] for key in ('ep', 'avg', 'max', 'min')}

def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Args:
        state: Observation array as returned by ``env.reset()`` / ``env.step()``.

    Returns:
        Tuple of integer bucket indices (one per observation dimension), used
        to look up the Q values of the available actions in ``q_table``.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # ``np.int`` was only an alias of the builtin ``int`` and was removed in
    # NumPy 1.24; the builtin performs the same truncating conversion.
    return tuple(discrete_state.astype(int))

# Q-learning training loop for the greedy player with linear epsilon decay.
for episode in range(EPISODES):
    episode_reward = 0
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Every SHOW_EVERY-th episode is announced, rendered and summarised.
    render = episode % SHOW_EVERY == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy choice: explore when the draw is at or below epsilon,
        # otherwise take the best-known action for the current bucket.
        if np.random.random() <= epsilon:
            action = np.random.randint(0, env.action_space.n)
        else:
            action = np.argmax(q_table[discrete_state])

        new_state, reward, done, _ = env.step(action)
        episode_reward += reward
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        q_index = discrete_state + (action,)
        if done:
            # Episode ended: only a finish at/after the goal position touches
            # the table, and that entry is set to 0.
            if new_state[0] >= env.goal_position:
                q_table[q_index] = 0
        else:
            # new_q = (1 - lr) * current_q + lr * (reward + discount * best future q)
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[q_index]
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[q_index] = new_q

        discrete_state = new_discrete_state

    # Decay epsilon once per episode while the episode number is inside the
    # decay window.
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        # The window is inclusive at both ends and float error accumulates, so
        # the running value can overshoot; clamp to keep epsilon non-negative.
        epsilon = max(0.0, epsilon - epsilon_decay_value)

    ep_rewards.append(episode_reward)

    if render:
        # Summarise the most recent SHOW_EVERY episodes (fewer at the start).
        recent_rewards = ep_rewards[-SHOW_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        greedy_aggr_ep_rewards['ep'].append(episode)
        greedy_aggr_ep_rewards['avg'].append(average_reward)
        greedy_aggr_ep_rewards['min'].append(min(recent_rewards))
        greedy_aggr_ep_rewards['max'].append(max(recent_rewards))

        # Four decimals: epsilon starts at .01 here and shrinks in tiny steps,
        # which a two-decimal format would hide.
        print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.4f}')
env.close()

In [None]:
# Configuration 7 plot: windowed average reward, random agent vs. greedy player
comparison = (
    ("random avg", rand_aggr_ep_rewards),
    ("greedy avg", greedy_aggr_ep_rewards),
)
for series_label, stats in comparison:
    plt.plot(stats['ep'], stats['avg'], label=series_label)
plt.axis('on')
plt.title("Random Agent vs Greedy, LR .1, and Epsilon Decay")
plt.xlabel("Episodes")
plt.ylabel("Score")
plt.legend(loc=2)
plt.show()

