In [1]:
# Q-learning

import gym
import random
import numpy as np
import time
from gym.envs.registration import register   # we use this to get rid of the slippery stuff in frozen lake
from IPython.display import clear_output

In [2]:
# try:
#     register(
#             id='FrozenLakeNoSlip-v0',
#             entry_point='gym.envs.toy_text:FrozenLakeEnv',
#             kwargs={'map_name' : '4x4', 'is_slippery':False},
#             max_episode_steps=100,
#             reward_threshold=0.78, # optimum = .8196
#             )
# except:
#     pass

# try:
#     register(
#             id='KellyCoinfliptest-v0',
#             entry_point='gym.envs.toy_text:KellyCoinflipEnv',
#             kwargs={'initial_wealth':25.0, 'edge':0.6, 'max_wealth':250.0, 'max_rounds' : 10.0},
#             reward_threshold=246.61,
# )
# except:
#     pass

In [3]:
#env_name = 'CartPole-v1'
#env_name = 'MountainCarContinuous-v0'
#env_name = 'Acrobot-v1'
#env_name = 'FrozenLake-v0'
#env_name = 'FrozenLakeNoSlip-v0'
#env_name = 'Taxi-v3'
#env_name = 'Gamble-v0'
#env_name = 'KellyCoinflip-v0'
env_name = 'Maze_edited-v0'

env = gym.make(env_name)
#goal_steps = 200

In [4]:
class Agent():
    def __init__(self, env):
        self.is_discrete = (type(env.action_space) == gym.spaces.discrete.Discrete)
        
        if self.is_discrete:
            self.action_size = env.action_space.n # how many actions are available - only works for discrete
            print("Action size: ", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
    
    def get_action(self, state): # choosing an action from the available actions
        if self.is_discrete:
            action = random.choice(range(self.action_size))  # discrete number of actions이기 때문에 이런 random 선정이 가능하다.
        else:
            action = np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return action
    
agent = Agent(env)

Action size:  4


In [5]:
class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 0.2                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
        elif state == next_state:
             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

Action size:  4
State size:  121


In [6]:
print(env.observation_space)
print(agent.state_size, agent.action_size)
print(agent.eps)
env.render()
'''
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3
벽으로 갈 때 강한 penalty 부여
'''

Discrete(121)
121 4
0.2

WWWWWWWWWWW
WPWPPPPPPGW
WPWPWPWWWWW
WPPPWPPPPPW
WPWWWWWWWPW
WPPPPPPWPPW
WPWWWWPWPWW
WPPPPWPPPPW
WWWWPWPWWWW
W[41mS[0mPPPWPPPPW
WWWWWWWWWWW


'\nLEFT = 0\nDOWN = 1\nRIGHT = 2\nUP = 3\n벽으로 갈 때 강한 penalty 부여\n'

In [7]:
q_table_softmax = np.zeros([agent.state_size, agent.action_size])

# q-table softmax화
def softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    return np.around(q_table_softmax, decimals = 2)
        
def print_softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    q_table_softmax_final = np.around(q_table_softmax, decimals = 3)
    print(q_table_softmax_final)
        
print_softmax_array(agent.q_table)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25

In [8]:
total_reward = 0
for i in range(500):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    #agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()

state:  19 action:  2
Training Session:  499     Total reward:  49900     randomness:  0.2
  (Right)
WWWWWWWWWWW
WPWPPPPPP[41mG[0mW
WPWPWPWWWWW
WPPPWPPPPPW
WPWWWWWWWPW
WPPPPPPWPPW
WPWWWWPWPWW
WPPPPWPPPPW
WWWWPWPWWWW
WSPPPWPPPPW
WWWWWWWWWWW
[[ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [-3.8888276  -3.89488401 -3.8888276  -3.8888276 ]
 [ 0.          0.          0.          0.        ]
 [-3.31028241 -0.65625043 29.01710086 -3.573

In [9]:
agent.eps = 0
#agent.eps = 0.4

In [10]:
'''
12.9
reward에 대한 욕심이 없다? 그냥 계속 벽에 박는다. 앞으로 나아가는 것에 대해서 안좋게 인식.

Possible solutions:
중간에 reward
벽에 부딪치면 stronger punishment (개선 가능성이 적어보인다)
==>

'''

'''
벽에 부딪치는 것에 대한 punishment 증가 시킴.
randomness 0.2로 설정해서 하니까 잘찾는다.
100번으로는 학습이 잘 안된다.
==> 400번 하면 학습이 어느 정도 잘 된다.

더 먼 경로를 사전정보로 주면 어떤 결과가 있는지

아래에 길 하나 더 뚫어보기 (best, medium, worst path)
'''

'\n벽에 부딪치는 것에 대한 punishment 증가 시킴.\nrandomness 0.2로 설정해서 하니까 잘찾는다.\n100번으로는 학습이 잘 안된다.\n==> 400번 하면 학습이 어느 정도 잘 된다.\n\n더 먼 경로를 사전정보로 주면 어떤 결과가 있는지\n'

In [11]:
# 현재 연구에 써볼만한 환경 연구해보기 (3개)
# cliff-walking
# kelly coinflip

# pacman

# maze