In [None]:
"""
    algorithm：Q-Learning
               Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
               
    environment：FrozenLake-v0
    
    author: Xinchen Han
    date: 2020/7/25
"""

In [None]:
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# --- Experiment identity and environment ---------------------------------
alg_name = 'Q_Learning'
env_id = 'FrozenLake-v0'
env = gym.make(env_id)

# --- Hyperparameters -----------------------------------------------------
# NOTE: choose_action() takes a random action when rand() > epsilon, so this
# value is the probability of *not* forcing a random move.
epsilon = 0.8
alpha = 0.8    # learning rate: step size of each Q-value update
gamma = 0.9    # discount factor applied to the best next-state Q-value
max_episodes = 20000
t0 = time.time()  # wall-clock reference for the "Running Time" log column

# --- Learner state -------------------------------------------------------
# One row per discrete state, one column per discrete action, all zeros.
Q_table = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
    dtype=np.float64,
)
# Starts with a single 0 so that reward_buffer[-1] is valid on the first episode.
reward_buffer = [0]

In [None]:
def choose_action(state):
    """
        Select an action for `state` using the global Q_table.

        A uniformly random action index is returned when a fresh uniform
        sample exceeds the global `epsilon`, or when every Q-value stored for
        `state` is still exactly zero. Otherwise the arg-max action of that
        row is returned.

        Note: as written, `epsilon` is the probability of taking the greedy
        branch (for a row with non-zero entries), not the exploration rate.
    """
    # The uniform sample is drawn first, exactly once per call.
    if not (np.random.rand() > epsilon):
        q_row = Q_table[state, :]
        # Only trust the greedy choice once this state has learned something.
        if q_row.any():
            return np.argmax(q_row)
    return np.random.choice(env.action_space.n)

def Q_Learning():
    """
        Run tabular Q-learning on the global `env` for `max_episodes` episodes.

        Each step picks the arg-max of the current state's Q-values plus
        Gaussian noise scaled by 1 / (episode + 1), then applies
            Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)).

        Side effects: updates the global Q_table in place, appends one
        exponentially smoothed (0.9 / 0.1) episode return to the global
        reward_buffer per episode, and prints one progress line per episode.
        Returns nothing.
    """
    for episode in range(max_episodes):
        state = env.reset()
        episode_return = 0
        # Exploration noise shrinks as training progresses.
        noise_scale = 1. / (episode + 1)

        while True:
            # Noisy-greedy selection is used here instead of choose_action().
            noise = np.random.randn(1, env.action_space.n) * noise_scale
            action = np.argmax(Q_table[state, :] + noise)
            next_state, reward, done, _ = env.step(action)

            # Temporal-difference update towards the bootstrapped target.
            td_target = reward + gamma * np.max(Q_table[next_state, :])
            td_error = td_target - Q_table[state, action]
            Q_table[state, action] += alpha * td_error

            episode_return += reward
            state = next_state
            if done:
                break

        # Exponential moving average of the episode return, for plotting.
        smoothed_return = reward_buffer[-1] * 0.9 + episode_return * 0.1
        reward_buffer.append(smoothed_return)

        progress_line = 'Training  | Episode: {}/{}  | Reward:{: .4f} |Running Time: {:.4f}'.format(
            episode + 1, max_episodes, episode_return, time.time() - t0)
        print(progress_line)

In [None]:
# Train the agent; this fills reward_buffer with one smoothed value per episode.
Q_Learning()

# Plot on an explicit Axes with a title and axis labels so the figure can be
# read on its own. reward_buffer holds an exponential moving average
# (0.9 * previous + 0.1 * episode reward), with a leading 0 at index 0.
fig, ax = plt.subplots()
ax.plot(reward_buffer)
ax.set(
    title='{} on {}'.format(alg_name, env_id),
    xlabel='Episode',
    ylabel='Smoothed episode reward',
)
# plt.show() renders the figure without echoing the Line2D repr as cell output.
plt.show()