# In [103]:
import gym
import numpy as np
import matplotlib.pyplot as plt

# Shared Gym environment; Q_learning below reads this module-level `env`.
env = gym.make("MountainCar-v0")

# Default hyperparameters: learning rate (alpha) and discount factor (gamma).
# NOTE(review): Q_learning takes parameters with these same names, which shadow
# the module-level values inside the function.
LEARNING_RATE = 0.1
DISCOUNT = 0.95

def Q_learning(LEARNING_RATE, DISCOUNT):
    """Train a tabular Q-learning agent on the module-level ``env``.

    Each observation dimension is bucketed into 20 bins and actions are chosen
    epsilon-greedily, with epsilon decayed linearly from 1 during the first
    half of the episodes.

    Assumes the classic Gym API, i.e. ``env.reset()`` returns the observation
    and ``env.step()`` returns ``(observation, reward, done, info)``.

    Args:
        LEARNING_RATE: Step size (alpha) of the Q-value update.
        DISCOUNT: Discount factor (gamma) applied to the best next-state value.

    Returns:
        A tuple ``(aggr_ep_rewards, q_table)``. ``aggr_ep_rewards`` is a dict
        with keys ``'ep'``, ``'avg'``, ``'max'`` and ``'min'`` that receives
        one entry every ``STATS_EVERY`` episodes; ``q_table`` is the learned
        array of shape ``DISCRETE_OS_SIZE + [env.action_space.n]``.
    """
    EPISODES = 10000
    SHOW_EVERY = 1000
    STATS_EVERY = 100

    DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
    discrete_os_win_size = (env.observation_space.high - env.observation_space.low) / DISCRETE_OS_SIZE
    # Highest valid bucket index per observation dimension.
    max_bucket_index = np.array(DISCRETE_OS_SIZE) - 1

    # For stats
    ep_rewards = []
    aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}

    # Exploration settings
    epsilon = 1  # not a constant, going to be decayed
    START_EPSILON_DECAYING = 1
    END_EPSILON_DECAYING = EPISODES // 2
    epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

    q_table = np.random.uniform(low=-2, high=0, size=(DISCRETE_OS_SIZE + [env.action_space.n]))

    def get_discrete_state(state):
        """Map a continuous observation to a tuple of bucket indices."""
        scaled = (state - env.observation_space.low) / discrete_os_win_size
        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` selects the
        # same dtype. Clipping keeps an observation that sits exactly on the
        # upper bound in the last bucket instead of indexing past the table.
        buckets = np.clip(scaled.astype(int), 0, max_bucket_index)
        # The tuple is used to look up the Q values of all actions in q_table.
        return tuple(buckets)

    for episode in range(EPISODES):
        episode_reward = 0
        discrete_state = get_discrete_state(env.reset())
        done = False

        # Render (and announce) only every SHOW_EVERY-th episode.
        render = episode % SHOW_EVERY == 0
        if render:
            print(episode)

        while not done:
            if np.random.random() > epsilon:
                # Exploit: best known action for the current state.
                action = np.argmax(q_table[discrete_state])
            else:
                # Explore: uniformly random action.
                action = np.random.randint(0, env.action_space.n)

            new_state, reward, done, _ = env.step(action)
            episode_reward += reward
            new_discrete_state = get_discrete_state(new_state)

            if render:
                env.render()

            if not done:
                # Episode still running: standard Q-learning update, using the
                # best Q value of the next state as the bootstrap target.
                max_future_q = np.max(q_table[new_discrete_state])
                current_q = q_table[discrete_state + (action,)]
                new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
                q_table[discrete_state + (action,)] = new_q
            elif new_state[0] >= env.goal_position:
                # Episode ended at the goal: nothing left to bootstrap from, so
                # pin the value to 0, the upper bound used to initialise the
                # table. Episodes that end for any other reason skip the update.
                q_table[discrete_state + (action,)] = 0

            discrete_state = new_discrete_state

        # Decay once per episode while inside the decay range; clamp at zero so
        # floating-point drift cannot push epsilon negative.
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            epsilon = max(0.0, epsilon - epsilon_decay_value)

        ep_rewards.append(episode_reward)
        if not episode % STATS_EVERY:
            recent_rewards = ep_rewards[-STATS_EVERY:]
            # Divide by the number of episodes actually collected; at episode 0
            # only a single reward exists.
            average_reward = sum(recent_rewards) / len(recent_rewards)
            aggr_ep_rewards['ep'].append(episode)
            aggr_ep_rewards['avg'].append(average_reward)
            aggr_ep_rewards['max'].append(max(recent_rewards))
            aggr_ep_rewards['min'].append(min(recent_rewards))
            print(f'Episode: {episode:>5d}, average reward: {average_reward:>4.1f}, current epsilon: {epsilon:>1.2f}')
    env.close()
    return aggr_ep_rewards, q_table

def plot_q_table(q_table):
    """Visualize the max Q-value and the greedy action for each state.

    Draws a heat map of the best Q-value per state and writes the index of the
    best action into every cell.

    Args:
        q_table: Array indexed as ``[position, velocity, action]``; the maximum
            and argmax are taken over axis 2.
    """
    q_image = np.max(q_table, axis=2)       # max Q-value for each state
    q_actions = np.argmax(q_table, axis=2)  # best action for each state
    fig, ax = plt.subplots(figsize=(10, 10))
    # imshow puts the first array axis on the vertical axis. Transposing puts
    # position (first axis of q_table) on x and velocity on y, which is what
    # the axis labels and the text coordinates below assume.
    image = ax.imshow(q_image.T, cmap='jet')
    fig.colorbar(image)
    for x in range(q_image.shape[0]):
        for y in range(q_image.shape[1]):
            ax.text(x, y, q_actions[x, y], color='white',
                    horizontalalignment='center', verticalalignment='center')
    ax.grid(False)
    ax.set_title("Q-table, size: {}".format(q_table.shape))
    ax.set_xlabel('position')
    ax.set_ylabel('velocity')

# Train once with the default hyperparameters so that `q_table` exists before
# it is plotted; no module-level `q_table` is assigned above this point.
_, q_table = Q_learning(LEARNING_RATE, DISCOUNT)
plot_q_table(q_table)

# In [None]:
import pandas as pd

# Hyperparameter grid: every learning rate is combined with every discount.
learning_rates = [0.05, 0.10]
discounts = [0.90, 0.95]

# One training run per (alpha, gamma) pair. `settings[k]` holds the pair that
# produced `results[k]`, so later code never has to re-derive it from an index.
results = []
settings = []
for discount in discounts:
    for learning_rate in learning_rates:
        # Q_learning returns (stats, q_table); only the stats are needed here.
        aggr_ep_rewards, _ = Q_learning(learning_rate, discount)
        results.append(aggr_ep_rewards)
        settings.append((learning_rate, discount))

# One panel per run; squeeze=False keeps `axs` two-dimensional for any grid size.
fig, axs = plt.subplots(len(discounts), len(learning_rates), squeeze=False)
fig.set_size_inches(18.5, 10.5)
axs = axs.ravel()

means = []
mins = []
maxs = []

for i, aggr_ep_rewards in enumerate(results):
    learning_rate, discount = settings[i]

    # The first stats entry (episode 0) is left out of the curves.
    axs[i].plot(aggr_ep_rewards['ep'][1:], aggr_ep_rewards['avg'][1:], label="average rewards")
    axs[i].plot(aggr_ep_rewards['ep'][1:], aggr_ep_rewards['max'][1:], label="max rewards")
    axs[i].plot(aggr_ep_rewards['ep'][1:], aggr_ep_rewards['min'][1:], label="min rewards")
    axs[i].set_title(f'Using a Learning Rate of {learning_rate} and a Discount of {discount}')
    axs[i].legend(loc=2)

    # Summarise each run by the mean of its aggregated avg/max/min series.
    mean_mean_reward = np.array(aggr_ep_rewards['avg']).mean()
    max_mean_reward = np.array(aggr_ep_rewards['max']).mean()
    min_mean_reward = np.array(aggr_ep_rewards['min']).mean()
    print(mean_mean_reward, learning_rate, discount)

    means.append(mean_mean_reward)
    mins.append(min_mean_reward)
    maxs.append(max_mean_reward)

# One row per run, so every column has the same length.
results_df = pd.DataFrame({
    'Alpha': [alpha for alpha, _ in settings],
    'Gamma': [gamma for _, gamma in settings],
    'Min': mins,
    'Max': maxs,
    'Means': means,
})

# Best run first; its hyperparameters drive the final training run.
ranked_df = results_df.sort_values('Means', ascending=False).reset_index(drop=True)

optimal_learning_rate = ranked_df.loc[0, 'Alpha']
optimal_discount = ranked_df.loc[0, 'Gamma']
_, q_table = Q_learning(optimal_learning_rate, optimal_discount)
plot_q_table(q_table)