In [1]:
# Q-learning

import gym
import random
import numpy as np
import time
from gym.envs.registration import register   # we use this to get rid of the slippery stuff in frozen lake
from IPython.display import clear_output

In [2]:
# try:
#     register(
#             id='FrozenLakeNoSlip-v0',
#             entry_point='gym.envs.toy_text:FrozenLakeEnv',
#             kwargs={'map_name' : '4x4', 'is_slippery':False},
#             max_episode_steps=100,
#             reward_threshold=0.78, # optimum = .8196
#             )
# except:
#     pass

# try:
#     register(
#             id='KellyCoinfliptest-v0',
#             entry_point='gym.envs.toy_text:KellyCoinflipEnv',
#             kwargs={'initial_wealth':25.0, 'edge':0.6, 'max_wealth':250.0, 'max_rounds' : 10.0},
#             reward_threshold=246.61,
# )
# except:
#     pass

In [3]:
#env_name = 'CartPole-v1'
#env_name = 'MountainCarContinuous-v0'
#env_name = 'Acrobot-v1'
#env_name = 'FrozenLake-v0'
#env_name = 'FrozenLakeNoSlip-v0'
#env_name = 'Taxi-v3'
#env_name = 'Gamble-v0'
#env_name = 'KellyCoinflip-v0'
env_name = 'Cliff_edited-v0'

env = gym.make(env_name)
#goal_steps = 200

In [4]:
class Agent():
    def __init__(self, env):
        self.is_discrete = (type(env.action_space) == gym.spaces.discrete.Discrete)
        
        if self.is_discrete:
            self.action_size = env.action_space.n # how many actions are available - only works for discrete
            print("Action size: ", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
    
    def get_action(self, state): # choosing an action from the available actions
        if self.is_discrete:
            action = random.choice(range(self.action_size))  # discrete number of actions이기 때문에 이런 random 선정이 가능하다.
        else:
            action = np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return action
    
agent = Agent(env)

Action size:  4


In [5]:
class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
        for i in range(39,47):
            self.q_table[i][2] = 5
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
        elif state == next_state:
             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

Action size:  4
State size:  72


In [6]:
print(env.observation_space)
print(agent.state_size, agent.action_size)
print(agent.eps)
env.render()
'''
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
'''

Discrete(72)
72 4
1.0

WWWWWWWWWWWW
WPPPPPPPPPPW
WPPPPPPPPPPW
WPPPPPPPPPPW
W[41mS[0mCCCCCCCCGW
WWWWWWWWWWWW


'\nUP = 0\nRIGHT = 1\nDOWN = 2\nLEFT = 3\n'

In [7]:
q_table_softmax = np.zeros([agent.state_size, agent.action_size])

# q-table softmax화
def softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    return np.around(q_table_softmax, decimals = 2)
        
def print_softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    q_table_softmax_final = np.around(q_table_softmax, decimals = 3)
    print(q_table_softmax_final)
        
print_softmax_array(agent.q_table)

[[0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 

In [12]:
total_reward = 0
for i in range(5):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        print_softmax_array(agent.q_table)
        print(agent.q_table)               # show q-table after every action
        time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward

env.close()

state:  46 action:  2
Training Session:  4     Total reward:  400     randomness:  0.0
  (Down)
WWWWWWWWWWWW
WPPPPPPPPPPW
WPPPPPPPPPPW
WPPPPPPPPPPW
WSCCCCCCCC[41mG[0mW
WWWWWWWWWWWW
[[0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.271 0.272 0.271 0.186]
 [0.205 0.265 0.264 0.266]
 [0.239 0.255 0.253 0.254]
 [0.222 0.26  0.261 0.258]
 [0.232 0.257 0.256 0.255]
 [0.2   0.267 0.267 0.266]
 [0.225 0.258 0.259 0.258]
 [0.219 0.261 0.262 0.258]
 [0.249 0.249 0.252 0.25 ]
 [0.211 0.233 0.284 0.273]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.271 0.269 0.27  0.19 ]
 [0.251 0.25  0.249 0.249]
 [0.25  0.25  0.25  0.251]
 [0.251 0.251 0.25  0.248]
 [0.238 0.24  0.284 0.239]
 [0.212

In [9]:
agent.eps = 0
#agent.eps = 0.1
np.savetxt('q_table.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table.csv', softmax_array(agent.q_table), delimiter=',')

In [10]:
'''
12.3
2개정도 줄이기. (faster simulation)
보상과 step penalty 차이가 너무 크다.

12.9
cliff penalty를 -10으로 줄였더니 계속 첫 자리에 남아있으려는 경향이 있다 (벽쪽으로 이동하려는 경향)
==> 사각에 Cliff 추가 후 학습 잘 됨 (optimal route). Goal reward 50, Cliff penalty -10, step penalty -1

'''

'''
12.23
W(벽) 만들어서 실험 해보니 성공적으로 제자리 걸음 안하게 했다.
현재 1000번 학습하면 최단거리로 길 잘 찾는다.

안전한 경로에 대한 사전정보 & 잘못된 사전정보에 대한 결과 확인해보기
'''

'''
1.6
random 1, decays by 0.99하면 400번에서 어느 정도 학습은 된다 (최적 경로 X, 답을 찾긴하지만 오래 걸린다)
                             500번에서 거의 최적 경로로 학습한다.
                             
State 39-46이 아래로 이동하면 cliff으로 떨어지는 구간. Q-table score은 대략 -31 ~ -8 까지
해당 구간에 1이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾음
해당 구간에 5이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾

1.7
랜덤 initial을 더 낮게, decay를 0.999로
최적해를 처음 찾는 training session의 number(?)
training test 번갈아가면서 (with & without random)
'''

'\n1.6\nrandom 1, decays by 0.99하면 400번에서 어느 정도 학습은 된다 (최적 경로 X, 답을 찾긴하지만 오래 걸린다)\n                             500번에서 거의 최적 경로로 학습한다.\n                             \nState 39-46이 아래로 이동하면 cliff으로 떨어지는 구간. Q-table score은 대략 -31 ~ -8 까지\n해당 구간에 1이라는 잘못된 prior를 줬을때는 학습속도에 영향 X\n'

In [11]:
# 현재 연구에 써볼만한 환경 연구해보기 (3개)
# cliff-walking
# kelly coinflip

# pacman

# maze