<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/RL/6-4-Frozen-Lake-Q-Table.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
import numpy as np

# Create the Frozen Lake environment with slippery movement enabled.
env = gym.make('FrozenLake-v1', is_slippery=True)

# Build a zero-filled Q-table: one row per state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Training schedule: number of episodes and the per-episode step cap.
num_episodes = 10_000
max_steps_per_episode = 100

# Q-learning update coefficients.
learning_rate = 0.1  # weight given to the new estimate in each Q update
discount_rate = 0.99  # multiplier applied to the next state's best Q value

# Epsilon-greedy exploration schedule (decayed once per episode by the
# training loop, from the maximum towards the minimum).
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate  # start fully exploratory

# Total reward collected in each episode, in episode order.
rewards_all_episodes = []

# Tabular Q-learning with an epsilon-greedy policy.
#
# For every episode: reset the environment, act for at most
# `max_steps_per_episode` steps while updating `q_table` after each
# transition, then decay `exploration_rate` and record the episode's total
# reward in `rewards_all_episodes`.
for episode in range(num_episodes):
    # gym >= 0.26 / gymnasium return (observation, info) from reset();
    # older gym returns the bare observation. Accept both so that `state`
    # is always a plain index into the Q-table.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Epsilon-greedy choice: exploit the Q-table when the random draw
        # exceeds the current exploration rate, otherwise explore.
        exploration_rate_threshold = np.random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])  # greedy action (exploit)
        else:
            action = env.action_space.sample()  # random action (explore)

        # Apply the action. The newer step API returns 5 values
        # (obs, reward, terminated, truncated, info); the older one
        # returns 4 (obs, reward, done, info).
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        # Q update: blend the old value with the bootstrapped target
        # reward + discount_rate * max_a Q(new_state, a).
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * td_target
        )

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exponentially decay the exploration rate towards its minimum.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Report the mean episode reward for each consecutive chunk of episodes.
#
# The chunks are built by slicing rather than np.split, so an episode count
# that is not a multiple of the chunk size yields a shorter final chunk
# instead of raising ValueError. Each chunk is averaged with ndarray.mean()
# over its actual length, which avoids the float accumulation error of a
# Python-level sum over pre-divided values.
episodes_per_chunk = 1000
rewards_array = np.asarray(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0
print("******** 평균 리워드 ********\n")
for r in rewards_per_thousand_episodes:
    count += len(r)  # label each row with the number of episodes seen so far
    print(count, ": ", str(r.mean()))

# Print the final Q-table.
print("\n\n******** 최종 Q 테이블 ********\n")
print(q_table)



  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


******** 평균 리워드 ********

1000 :  0.03300000000000002
2000 :  0.21100000000000016
3000 :  0.4140000000000003
4000 :  0.5610000000000004
5000 :  0.6360000000000005
6000 :  0.6390000000000005
7000 :  0.6760000000000005
8000 :  0.6690000000000005
9000 :  0.6620000000000005
10000 :  0.6890000000000005


******** 최종 Q 테이블 ********

[[0.57966561 0.51741236 0.50806741 0.52250704]
 [0.35216096 0.25829468 0.32001542 0.5127292 ]
 [0.44540407 0.28653831 0.26320102 0.30290423]
 [0.03820051 0.16383636 0.02965983 0.07225523]
 [0.60115621 0.39286599 0.49665928 0.37971481]
 [0.         0.         0.         0.        ]
 [0.13091692 0.08015433 0.4022208  0.1041386 ]
 [0.         0.         0.         0.        ]
 [0.40882497 0.33501742 0.41347084 0.61991955]
 [0.39492826 0.63640498 0.43869335 0.39154075]
 [0.58180128 0.44700355 0.43838244 0.30192312]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.33804413 0.64599841 0.75972848 0.47577761]
 [0.68956045 