In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt 
from gym.envs.registration import register
import random as pr

In [None]:
# Hyperparameters for tabular Q-learning.
num_episodes = 2000     # number of training episodes
learning_rate = 0.85    # weight given to the new estimate in each Q update
dis = 0.99              # discount factor applied to the next state's best Q value

# Environment the agent is trained on.
env = gym.make('FrozenLake-v0')

In [None]:
# Action-value table: one row per state, one column per action, all zeros to start.
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))

# Total reward collected in each episode, appended in episode order.
rList = list()

## Exploration Strategy
### $\epsilon$-greedy
$a_t = \underset{a}{\operatorname{argmax}}\, Q(s_t, a)$ (with probability $1 - \epsilon$)  
$a_t = \mathcal{N}_t$ (with probability $\epsilon$)  
- $\epsilon$의 확률로 random action을 취해 exploration을 진행
- action space가 continuous한 경우 잘 쓰이지 않으며, discrete action에 대하여 자주 쓰임


In [None]:
# Tabular Q-learning: play num_episodes episodes, updating Q after every step.
for episode in range(num_episodes):
    # Start a fresh episode.
    state = env.reset()
    episode_reward = 0
    done = False

    # Decaying epsilon-greedy: epsilon is 1, 1/2, 1/3, ... and drops every 100 episodes.
    epsilon = 1. / (episode // 100 + 1)

    while not done:
        # Explore with probability epsilon, otherwise take the greedy action for this state.
        explore = np.random.rand() < epsilon
        action = env.action_space.sample() if explore else np.argmax(Q[state, :])

        # Apply the action and observe the resulting transition.
        new_state, reward, done, _ = env.step(action)

        # Blend the old estimate with the one-step bootstrapped target.
        td_target = reward + dis * np.max(Q[new_state, :])
        Q[state, action] = (1 - learning_rate) * Q[state, action] + learning_rate * td_target

        episode_reward += reward
        state = new_state

    # Record this episode's total reward.
    rList.append(episode_reward)


In [None]:
# Average total reward per recorded episode; the printed label reads "success rate".
# Divide by len(rList) rather than num_episodes: rList is created in a separate cell,
# so re-running the training cell keeps appending to it and the episode count can
# exceed num_episodes. The guard avoids a ZeroDivisionError when nothing was recorded.
success_rate = sum(rList) / len(rList) if rList else 0.0
print('성공율: ', str(success_rate))
print('Q-table')
print(Q)

# One bar per episode; bar height is that episode's total reward.
plt.bar(range(len(rList)), rList, color = 'blue')
plt.xlabel('episode')
plt.ylabel('total reward')
plt.show()