Library 준비

In [None]:
!pip install numpy==1.23.5 # 버전 호환 문제
!pip install gym



In [None]:
import numpy as np
import gym
import random

In [None]:
pip show gym

Name: gym
Version: 0.25.2
Summary: Gym: A universal API for reinforcement learning environments
Home-page: https://www.gymlibrary.ml/
Author: Gym Community
Author-email: jkterry@umd.edu
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: cloudpickle, gym_notices, numpy
Required-by: dopamine_rl


1. Taxi 환경 생성하기

In [None]:
# Create the Taxi environment (this notebook targets gym 0.25.2).
env = gym.make("Taxi-v3")
# The environment has to be reset before it is rendered or stepped.
env.reset()
# Print the text rendering, matching the evaluation cell. A bare `env.render()`
# relies on the deprecated default render mode and only produced deprecation
# warnings in this notebook's recorded output.
print(env.render(mode="ansi"))

  deprecation(
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-name

2. Q table 만들고 초기화

In [None]:
# Number of discrete states and actions; these two sizes define the Q-table shape.
state_space, action_space = env.observation_space.n, env.action_space.n
print("There are ", state_space, " possible states")
print("There are ", action_space, " possible actions")

There are  500  possible states
There are  6  possible actions


In [None]:
# Tabular action-value function: one row per state, one column per action,
# with every entry starting at zero.
Q = np.zeros(shape=(state_space, action_space))
# Show the table followed by its shape, one per line.
print(Q, Q.shape, sep="\n")

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
(500, 6)


3. hyperparameters 설정

In [None]:
# --- Run lengths ---
total_episodes = 5_000       # number of training episodes
total_test_episodes = 10     # number of evaluation episodes
max_steps = 200              # step cap per episode

# --- Q-learning update ---
# NOTE(review): the evaluation output recorded later in this notebook shows the
# taxi repeating the same move without progressing; this step size may be too
# small for the number of training episodes -- confirm before relying on results.
learning_rate = 1e-2         # step size of each Q-value update
gamma = 0.99                 # discount factor

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0            # exploration probability at the start
min_epsilon = 0.001          # lower bound of the exploration probability
decay_rate = 0.01            # exponential decay rate of the exploration probability
epsilon = max_epsilon        # current exploration rate

4. epsilon-greedy policy 정의

In [None]:
def epsilon_greedy_policy(Q, state, eps=None):
  """Pick an action for `state` using an epsilon-greedy rule.

  Args:
    Q: Action-value table; `Q[state]` holds the per-action values.
    state: Index of the current state (row of `Q`).
    eps: Exploration rate to compare the random draw against. When None
      (the default) the notebook-level `epsilon` variable is read at call
      time, which is the original behaviour.

  Returns:
    `np.argmax(Q[state])` when a uniform draw from [0, 1] exceeds the
    exploration rate, otherwise an action sampled from `env.action_space`.
  """
  exploration_rate = epsilon if eps is None else eps
  # Exploit: the random draw is above the exploration rate.
  if random.uniform(0, 1) > exploration_rate:
    return np.argmax(Q[state])
  # Explore: sample an action from the environment's action space.
  return env.action_space.sample()

5. Q-learning 알고리즘 정의 및 Train

In [None]:
# Troubleshooting note (written in Korean) kept as a bare string literal, so
# Jupyter echoes its repr as this cell's output. In English, the note says:
# if "AttributeError: module 'numpy' has no attribute 'bool8'" appears, it is a
# version incompatibility between numpy and gym's internals; either downgrade
# numpy (pip install numpy==1.23.5) or unpack five values from env.step() and
# set `done = terminated or truncated`.
"""
Q. AttributeError: module 'numpy' has no attribute 'bool8' 에러 메세지가 나왔다면?
A. 최근 버전의 gym 또는 numpy 와 gym 내부 구현 간의 버전 호환성 문제임. gym 내부에서 np.bool8을 사용하는 부분에서 문제 발생
numpy >= 1.24 버전에서는 np.bool8이 삭제되고 np.bool_로 통일됨.
sol1) numpy downgrade => pip install numpy==1.23.5
sol2)  env.step() 반환값 호환 처리 => 5개의 값을 반환하도록 수정
new_state, reward, terminated, truncated, info = env.step(action)
done = terminated or truncated


"""

"\nQ. AttributeError: module 'numpy' has no attribute 'bool8' 에러 메세지가 나왔다면?\nA. 최근 버전의 gym 또는 numpy 와 gym 내부 구현 간의 버전 호환성 문제임. gym 내부에서 np.bool8을 사용하는 부분에서 문제 발생\nnumpy >= 1.24 버전에서는 np.bool8이 삭제되고 np.bool_로 통일됨.\nsol1) numpy downgrade => pip install numpy==1.23.5\nsol2)  env.step() 반환값 호환 처리 => 5개의 값을 반환하도록 수정\nnew_state, reward, terminated, truncated, info = env.step(action)\ndone = terminated or truncated\n\n\n"

In [None]:
 for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    #step = 0
    done = False

    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)

    for step in range(max_steps):
        #
        action = epsilon_greedy_policy(Q, state)

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)
        # done = terminated or truncated


        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        Q[state][action] = Q[state][action] + learning_rate * (reward + gamma *
                                    np.max(Q[new_state]) - Q[state][action])
        # If done : finish episode
        if done == True:
            break

        # Our new state is state
        state = new_state
    # epsilon 감소
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)


print("🚀 Q-learning 학습 완료!")
env.close()


🚀 Q-learning 학습 완료!


6. agent 실행

In [None]:
# `time` and `frames` are not used in this cell; they are kept in case other
# cells of the notebook rely on them.
import time
rewards = []

frames = []
for episode in range(total_test_episodes):
    state = env.reset()
    done = False
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        # Show the board as text before each move.
        print(env.render(mode="ansi"))
        # Act greedily: take the action with the highest Q-value for this state.
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state
    # Record the return of every episode, including ones that reach max_steps
    # without `done`; the average below divides by total_test_episodes, so
    # skipping unfinished episodes would distort the reported score.
    rewards.append(total_rewards)
env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

****************************************************
EPISODE  0
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |B: |
+---------+


+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[43mB[0m: |
+---------+
  (South)

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | :

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [None]:
# Inspect the learned Q-table; NumPy abbreviates the array in the displayed repr.
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-3.36130519, -3.36090352, -3.36006781, -3.35967887, -3.35389914,
        -3.41730653],
       [-1.65377724, -1.63598618, -1.64192615, -1.63631443,  3.79806203,
        -1.69461042],
       ...,
       [-0.75713785, -0.72132849, -0.755952  , -0.75482736, -0.79745106,
        -0.79060446],
       [-2.15613454, -2.15976023, -2.15639158, -2.15481668, -2.19367362,
        -2.19122704],
       [-0.019999  , -0.029997  , -0.019999  ,  0.31885947, -0.199     ,
        -0.29726698]])