In [1]:
import gym
import numpy as np
env = gym.make('FrozenLake-v1')

In [2]:
#Q Table 초기화(2차원)
# Q: 주어진 state에서 어떤 action을 취할 것인지에 대한 길잡이
# env.observation_space.n: 환경의 경우의 수
# env.action_space.n: 행동의 경우의 수
Q = np.zeros([env.observation_space.n, env.action_space.n])
print(Q.shape)

(16, 4)


In [3]:
#하이퍼 파라미터 초기화
# 할인률
dis = 0.99 

# 시도 횟수(에피소드)
num_episodes = 2000

# 에피소드마다 총 보상의 합을 저장하는 리스트
rList = []

In [4]:
#Q업데이트 - 랜덤방식
frames=[]
for i in range(num_episodes):
    state = env.reset()
    rAll = 0
    done = False

    # Q learning 알고리즘
    while not done:
        # Action 중 가장 R이 큰 Action을 랜덤으로 고르는 방식
        action = np.argmax(Q[state, :] + np.random.randn(1, env.action_space.n)/(i+1))
        
        new_state, reward, done, _ = env.step(action)

        # Q = R + 할인율*max(Q)
        Q[state, action] = reward + dis*np.max(Q[new_state, :])
        
        rAll += reward
        state = new_state

        # 애니메이션을 위하여 정보 기록
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': new_state,
            'action': action,
            'reward': reward
            }
        )  
    rList.append(rAll)

In [5]:
print("Success rate: " + str(sum(rList) / num_episodes))
print("Final Q-Table Values")
print(Q)

Success rate: 0.019
Final Q-Table Values
[[0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.970299   0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         1.         0.        ]
 [0.         0.         0.         0.        ]]


In [7]:
#Q업데이트 - E-greedy 방식
frames=[]
for i in range(num_episodes):
    state = env.reset()
    rAll = 0
    done = False

    # exploration의 확률
    e = 1. / ((i/100) + 1)

    # Q learning 알고리즘
    while not done:
        # E-Greedy 알고리즘으로 Action을 고르는 방식
        if np.random.rand(1) < e:
            action = env.action_space.sample()
        else:
            action = np.argmax(Q[state, :])
            
        new_state, reward, done, _ = env.step(action)

        # Q = R + 할인율*max(Q)
        Q[state, action] = reward + dis*np.max(Q[new_state, :])

        rAll += reward
        state = new_state

        # 애니메이션을 위하여 정보 기록
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': new_state,
            'action': action,
            'reward': reward
            }
        )
    rList.append(rAll)

In [8]:
print("Success rate: " + str(sum(rList) / num_episodes))
print("Final Q-Table Values")
print(Q)

Success rate: 0.051
Final Q-Table Values
[[0.0643292  0.0643292  0.0643292  0.62352539]
 [0.         0.         0.         0.06241855]
 [0.         0.         0.         0.06497899]
 [0.         0.         0.         0.0643292 ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.9801     0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.12741334 1.         0.        ]
 [0.         0.         0.         0.        ]]


In [9]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.1)
        
print_frames(frames)

  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG

Timestep: 250
State: 0
Action: 1
Reward: 0.0
