In [1]:
# Q-learning

import gym
import random
import numpy as np
import time
from gym.envs.registration import register   # we use this to get rid of the slippery stuff in frozen lake
from IPython.display import clear_output

In [2]:
# try:
#     register(
#             id='FrozenLakeNoSlip-v0',
#             entry_point='gym.envs.toy_text:FrozenLakeEnv',
#             kwargs={'map_name' : '4x4', 'is_slippery':False},
#             max_episode_steps=100,
#             reward_threshold=0.78, # optimum = .8196
#             )
# except:
#     pass

# try:
#     register(
#             id='KellyCoinfliptest-v0',
#             entry_point='gym.envs.toy_text:KellyCoinflipEnv',
#             kwargs={'initial_wealth':25.0, 'edge':0.6, 'max_wealth':250.0, 'max_rounds' : 10.0},
#             reward_threshold=246.61,
# )
# except:
#     pass

In [3]:
#env_name = 'CartPole-v1'
#env_name = 'MountainCarContinuous-v0'
#env_name = 'Acrobot-v1'
#env_name = 'FrozenLake-v0'
#env_name = 'FrozenLakeNoSlip-v0'
#env_name = 'Taxi-v3'
#env_name = 'Gamble-v0'
#env_name = 'KellyCoinflip-v0'
env_name = 'Cliff_edited-v0'

env = gym.make(env_name)
#goal_steps = 200

In [4]:
class Agent():
    def __init__(self, env):
        self.is_discrete = (type(env.action_space) == gym.spaces.discrete.Discrete)
        
        if self.is_discrete:
            self.action_size = env.action_space.n # how many actions are available - only works for discrete
            print("Action size: ", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
    
    def get_action(self, state): # choosing an action from the available actions
        if self.is_discrete:
            action = random.choice(range(self.action_size))  # discrete number of actions이기 때문에 이런 random 선정이 가능하다.
        else:
            action = np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return action
    
agent = Agent(env)

Action size:  4


In [5]:
class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 0.6                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
#         for i in range(38,46):
#             self.q_table[i][0] = 0
#             self.q_table[i][1] = 0.3
#             self.q_table[i][2] = -0.3
#             self.q_table[i][3] = 0
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
        
    def get_action_greedy(self, state):
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
        elif state == next_state:
             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.

agent = QAgent(env)

Action size:  4
State size:  72


In [6]:
print(env.observation_space)
print(agent.state_size, agent.action_size)
print(agent.eps)
env.render()
'''
UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3
'''

Discrete(72)
72 4
0.6

WWWWWWWWWWWW
WPPPPPPPPPPW
WPPPPPPPPPPW
WPPPPPPPPPPW
W[41mS[0mCCCCCCCCGW
WWWWWWWWWWWW


'\nUP = 0\nRIGHT = 1\nDOWN = 2\nLEFT = 3\n'

In [7]:
q_table_softmax = np.zeros([agent.state_size, agent.action_size])

# q-table softmax화
def softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    return np.around(q_table_softmax, decimals = 2)
        
def print_softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    q_table_softmax_final = np.around(q_table_softmax, decimals = 3)
    print(q_table_softmax_final)
        
print_softmax_array(agent.q_table)

[[0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25 0.25 0.25]
 [0.25 0.25

In [8]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#        print("state: ", state, "action: ", action)
#        print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

251


In [9]:
#agent.eps = 0
#agent.eps = 0.1
np.savetxt('q_table.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table.csv', softmax_array(agent.q_table), delimiter=',')

In [10]:
'''
12.3
2개정도 줄이기. (faster simulation)
보상과 step penalty 차이가 너무 크다.

12.9
cliff penalty를 -10으로 줄였더니 계속 첫 자리에 남아있으려는 경향이 있다 (벽쪽으로 이동하려는 경향)
==> 사각에 Cliff 추가 후 학습 잘 됨 (optimal route). Goal reward 50, Cliff penalty -10, step penalty -1

'''

'''
12.23
W(벽) 만들어서 실험 해보니 성공적으로 제자리 걸음 안하게 했다.
현재 1000번 학습하면 최단거리로 길 잘 찾는다.

안전한 경로에 대한 사전정보 & 잘못된 사전정보에 대한 결과 확인해보기
'''

'''
1.6
random 1, decays by 0.99하면 400번에서 어느 정도 학습은 된다 (최적 경로 X, 답을 찾긴하지만 오래 걸린다)
                             500번에서 거의 최적 경로로 학습한다.
                             
State 38-45이 아래로 이동하면 cliff으로 떨어지는 구간. Q-table score은 대략 -31 ~ -8 까지
해당 구간에 1이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾음
해당 구간에 5이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾

1.7
랜덤 initial을 더 낮게, decay를 0.999로
최적해를 처음 찾는 training session의 number(?)
training test 번갈아가면서 (with & without random)

1.13
0.996 decay with initial random 0.6 ==> 800번 반복했을 때 random이 0.0243
더 빠르게 학습 가능할 수 있도록 lower random initial, lower decay

1.19
0.99 decay with initial random 0.6
0.5 쯤부터는 학습을 아예 실패하는 경우가 생긴다
'''

'\n1.6\nrandom 1, decays by 0.99하면 400번에서 어느 정도 학습은 된다 (최적 경로 X, 답을 찾긴하지만 오래 걸린다)\n                             500번에서 거의 최적 경로로 학습한다.\n                             \nState 38-45이 아래로 이동하면 cliff으로 떨어지는 구간. Q-table score은 대략 -31 ~ -8 까지\n해당 구간에 1이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾음\n해당 구간에 5이라는 잘못된 prior를 줬을때 500번 학습하면 성공적으로 길 찾\n\n1.7\n랜덤 initial을 더 낮게, decay를 0.999로\n최적해를 처음 찾는 training session의 number(?)\ntraining test 번갈아가면서 (with & without random)\n\n1.13\n0.996 decay with initial random 0.6 ==> 800번 반복했을 때 random이 0.0243\n더 빠르게 학습 가능할 수 있도록 lower random initial, lower decay\n'

In [11]:
# for i in range(10):
#     state = env.reset()
#     done = False
#     count = 0
#     while not done:
#         action = agent.get_action_greedy(state)                    # decide on an action
#         next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
#         if state==next_state or count>15:
#             break
#         else:
#             state = next_state
#             env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#             time.sleep(0.4)                   # 약간의 딜레이 시간 추가
#             clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
# # states 38-45

In [12]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#        print("state: ", state, "action: ", action)
#        print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal2.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table2.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table2.csv', softmax_array(agent.q_table), delimiter=',')

428


In [13]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#        print("state: ", state, "action: ", action)
#        print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal3.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table3.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table3.csv', softmax_array(agent.q_table), delimiter=',')

276


In [14]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal4.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table4.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table4.csv', softmax_array(agent.q_table), delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [15]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal5.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table5.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table5.csv', softmax_array(agent.q_table), delimiter=',')

329


In [16]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal6.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table6.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table6.csv', softmax_array(agent.q_table), delimiter=',')

310


In [17]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal7.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table7.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table7.csv', softmax_array(agent.q_table), delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [18]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal8.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table8.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table8.csv', softmax_array(agent.q_table), delimiter=',')

In [19]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal9.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table9.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table9.csv', softmax_array(agent.q_table), delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal10.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table10.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table10.csv', softmax_array(agent.q_table), delimiter=',')

69


In [21]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#         print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal11.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table11.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table11.csv', softmax_array(agent.q_table), delimiter=',')

211


In [22]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal12.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table12.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table12.csv', softmax_array(agent.q_table), delimiter=',')

In [23]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal13.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table13.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table13.csv', softmax_array(agent.q_table), delimiter=',')

In [24]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#    print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#        print("state: ", state, "action: ", action)
#        print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal14.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table14.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table14.csv', softmax_array(agent.q_table), delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [25]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal15.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table15.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table15.csv', softmax_array(agent.q_table), delimiter=',')

251


In [26]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal16.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table16.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table16.csv', softmax_array(agent.q_table), delimiter=',')

288


In [27]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal17.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table17.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table17.csv', softmax_array(agent.q_table), delimiter=',')

357


In [28]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record > 0:
        with open('first_optimal18.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table18.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table18.csv', softmax_array(agent.q_table), delimiter=',')

343


In [29]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal19.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table19.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table19.csv', softmax_array(agent.q_table), delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [30]:
agent = QAgent(env)
total_reward = 0
for i in range(3000):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         print("state: ", state, "action: ", action)
#         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
#        env.render()                       # show the screen of the game
#         print_softmax_array(agent.q_table)
#        print(agent.q_table)               # show q-table after every action
        #time.sleep(0.2)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.99
    total_reward += reward
    
    state = env.reset()
    reward_record = 0
    done = False
    count = 0
#     print(state)
    while not done:
        action = agent.get_action_greedy(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        count = count+1
#         print("state: ", state, "action: ", action)
#         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        reward_record = reward
        if state==next_state or count>11:
            break
        else:
            state = next_state
#            env.render()                       # show the screen of the game
#             print(agent.q_table)               # show q-table after every action
#            time.sleep(0.4)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
    if reward_record == 30:
        with open('first_optimal20.txt', 'w') as f:
            f.write('%d' % i)
        print(i)
        break

env.close()

np.savetxt('q_table20.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table20.csv', softmax_array(agent.q_table), delimiter=',')

315


In [31]:
# # agent = QAgent(env)
# for i in range(10):
#     state = env.reset()
#     reward_record = 0
#     done = False
#     count = 0
#     print(state)
#     while not done:
#         action = agent.get_action_greedy(state)                    # decide on an action
#         next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#         count = count+1
# #         print("state: ", state, "action: ", action)
# #         print("Testing Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
#         reward_record = reward
#         if state==next_state or count>11:
#             break
#         else:
#             state = next_state
#             env.render()                       # show the screen of the game
# #             print(agent.q_table)               # show q-table after every action
#             time.sleep(0.4)                   # 약간의 딜레이 시간 추가
#             clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    
#     if reward_record == 30:
#         with open('first_optimal20.txt', 'w') as f:
#             f.write('%d' % i)
#         print(i)
#         break

# env.close()

print(agent.q_table[37][2])

-2.4269458773438117
