In [16]:
# Q-learning

import gym
import random
import numpy as np
import time
from gym.envs.registration import register   # we use this to get rid of the slippery stuff in frozen lake
from IPython.display import clear_output

In [17]:
env_name = 'Gamble-v0'
env = gym.make(env_name)

In [18]:
class Agent():
    def __init__(self, env):
        self.is_discrete = (type(env.action_space) == gym.spaces.discrete.Discrete)
        
        if self.is_discrete:
            self.action_size = env.action_space.n # how many actions are available - only works for discrete
            print("Action size: ", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
    
    def get_action(self, state): # choosing an action from the available actions
        if self.is_discrete:
            action = random.choice(range(self.action_size))  # discrete number of actions이기 때문에 이런 random 선정이 가능하다.
        else:
            action = np.random.uniform(self.action_low, self.action_high, self.action_shape)
        return action
    
agent = Agent(env)

Action size:  4


In [19]:
class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])

            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # how much to update the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

Action size:  4
State size:  31


In [20]:
agent.q_table = np.genfromtxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\no prior.csv', delimiter=',')
print(env.observation_space)
print(agent.state_size, agent.action_size)
print(agent.eps)
print(agent.q_table)
env.render()

# q-table 더 잘 보기 위한 확률 기반 q-table

Discrete(31)
31 4
1.0
[[0.12657989 0.13478144 0.21337874 0.29090808]
 [0.54817722 0.77803416 0.79439719 0.85600043]
 [0.94489635 0.99238679 1.02770998 1.10070199]
 [1.21941676 1.10495167 1.28898284 1.48511871]
 [1.31568412 1.3624673  1.50075357 1.37553902]
 [1.49589546 1.42914645 1.62756057 1.52041209]
 [1.22906602 1.43003036 1.2433691  1.2287811 ]
 [1.35249361 1.33641781 1.41266268 1.5805303 ]
 [1.36241645 1.03797825 1.56131066 1.28167796]
 [1.03875234 1.01435687 1.11172365 1.0327792 ]
 [1.19915407 1.27999196 0.83413312 1.07300731]
 [1.15783573 0.89811577 0.78006141 0.94476969]
 [1.16741435 0.68727139 1.05514946 1.44676088]
 [1.3489256  0.82944368 0.69065583 0.95038071]
 [0.92097354 0.85301083 1.37922451 1.20919247]
 [0.87330564 0.96045611 0.83639617 0.47028804]
 [0.69803015 0.61165529 0.97770922 0.87132625]
 [0.62726285 0.75964514 0.92030874 0.46943472]
 [0.80863647 0.5114523  0.20285433 0.79543065]
 [0.6085705  0.8934974  0.35776391 4.18705218]
 [0.34996775 0.48060523 0.54728494 2.3

In [21]:
q_table_softmax = np.zeros([agent.state_size, agent.action_size])

# q-table softmax화
def softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    return np.around(q_table_softmax, decimals = 3)
        
def print_softmax_array(array):
    for i in range(agent.state_size):
        q_table_softmax[i,:]  = np.exp(array[i,:])
        q_table_softmax[i,:] /= np.sum(q_table_softmax[i,:])
    q_table_softmax_final = np.around(q_table_softmax, decimals = 3)
    print(q_table_softmax_final)
        
print_softmax_array(agent.q_table)

[[0.234 0.236 0.255 0.276]
 [0.204 0.257 0.261 0.278]
 [0.232 0.244 0.252 0.272]
 [0.234 0.209 0.251 0.306]
 [0.232 0.243 0.279 0.246]
 [0.244 0.228 0.278 0.25 ]
 [0.236 0.289 0.239 0.236]
 [0.232 0.229 0.247 0.292]
 [0.259 0.187 0.316 0.239]
 [0.247 0.241 0.266 0.246]
 [0.273 0.296 0.19  0.241]
 [0.306 0.236 0.21  0.248]
 [0.261 0.161 0.233 0.345]
 [0.359 0.214 0.186 0.241]
 [0.206 0.193 0.326 0.275]
 [0.269 0.293 0.259 0.179]
 [0.226 0.207 0.299 0.268]
 [0.231 0.263 0.309 0.197]
 [0.305 0.227 0.167 0.301]
 [0.026 0.034 0.02  0.92 ]
 [0.095 0.108 0.116 0.681]
 [0.094 0.079 0.193 0.634]
 [0.086 0.092 0.226 0.595]
 [0.11  0.106 0.503 0.281]
 [0.112 0.092 0.517 0.279]
 [0.18  0.27  0.279 0.27 ]
 [0.121 0.298 0.322 0.26 ]
 [0.269 0.248 0.234 0.249]
 [0.458 0.196 0.135 0.211]
 [0.482 0.148 0.131 0.238]
 [0.25  0.25  0.25  0.25 ]]


In [24]:
for testnum in range(20):
    total_reward = 0
    for i in range(300):
        state = env.reset()
        done = False
        while not done:
        #    action = env.action_space.sample() # choosing a random action
            action = agent.get_action(state)                    # decide on an action
            next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
    #         print("state: ", state, "action: ", action)
    #         print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
            agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
            state = next_state
    #         env.render()                       # show the screen of the game
            #print(agent.q_table)               # show q-table after every action
    #         print_softmax_array(agent.q_table)
            #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
        agent.eps = agent.eps * 0.999
        total_reward += reward

    env.close()
    np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\q_table%d.csv' % testnum, agent.q_table, delimiter=',')
    np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\softmax_q_table%d.csv' % testnum, softmax_array(agent.q_table), delimiter=',')
    
    agent.eps = 0
    total_reward = 0
    action_array = np.zeros([10, 10])
    money_array =  np.zeros([10, 11])
    
    for i in range(10):
        state = env.reset()
        done = False
        j = 0
        while not done:
        #    action = env.action_space.sample() # choosing a random action
            money_array[i][j] = env.wealth
            action = agent.get_action(state)                    # decide on an action
            action_array[i][j] = action
            j = j+1
            next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
#             print("state: ", state, "action: ", action)
#             print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
            agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
            state = next_state
#             env.render()                       # show the screen of the game
            #print(agent.q_table)               # show q-table after every action
#             print_softmax_array(agent.q_table)
            #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
            clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
        money_array[i][10] = env.wealth
        total_reward += env.wealth

    env.close()

    print(total_reward)
    np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\action_list%d.csv' % testnum, action_array, delimiter=',')
    np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\money_list%d.csv' % testnum, money_array, delimiter=',')

    class QAgent(Agent):
        def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
            super().__init__(env)
            self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
            print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)

            self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
            self.discount_rate = discount_rate
            self.learning_rate = learning_rate
            self.build_model()

        def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
            #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
            self.q_table = np.zeros([self.state_size, self.action_size])


        def get_action(self, state):
            '''
            Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
            Use randomness to search through the whole area, and as time passes we will reduce randomness
            '''
            q_state = self.q_table[state]                 # current state
            action_greedy = np.argmax(q_state)            # max q-value among the possible choices
            action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
            if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
                return action_random
            else:
                return action_greedy

        def train(self, experience):
            state, action, next_state, reward, done = experience

            q_next = self.q_table[next_state]
            # current state is terminal
            if done:
                q_next = np.zeros([self.action_size])
    #         elif state == next_state:
    #             q_next = np.zeros([self.action_size])
            else:
                q_next

            # what the next action is based on the q-table
            q_target = reward + self.discount_rate*np.max(q_next)

            # update table
            q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
            self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate

            # reduce randomness after each epoch

            # penalty 함수도 있어야한다.
            # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
    agent = QAgent(env)
    agent.q_table = np.genfromtxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\no prior.csv', delimiter=',')

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
#agent.eps = 1
agent.eps = 0
#print(total_reward)
#print(agent.q_table)

In [13]:
'''
12.1
계속 0이 가장 안전하니까 성공하기에도 가장 좋다고 판단하는듯.

12.9
0에 대한 penalty를 만들었더니 도박 하려는 경향 증가. 그러나 적은 액수에서는 도박 해도 어차피 얻는게 없어서 돈 없을 때는 도박을 안한다.
initial wealth, success rate 상향조정


초기 자금 20, success rate 0.65로 하고 action을 [0, 20%, 40%, 60%] 도박으로 하면 40 혹은 60을 자주 선택한다.
평균 초기 자금의 90% 내외를 벌고 있다. ==> 최대치에 도달 했을 때 done인 것을 취소하거나 최대치를 높여야할 것 같다.
==> 최대치 200으로 상향조정 ==> 돈을 오히려 더 많이 잘 번다. ==> success rate 다시 0.6으로 조정
==> 다시 0, 20% 위주로 한다. ==> 0.62로 조정 ==> 일정 수량 이상으로 가면 돈 아끼려는 경향이 보임 ==> hold penalty를 0.9에서 0.85로 조정
==> hold을 없애교 그냥 0.1 도박으로 조정 ==> 최종 action 목록:[10%, 20%, 40%, 60%]

이제는 골고루 이것저것 시도해본다.
'''

'''
12.23
사전 정보 없이 1000번 학습 후 10회 결과 (randomness = 1.0부터 *0.999로 decay):
1회차: 204
2회차: 583
3회차: 104
4회차: 336
5회차: 484
==> 평균: 342.2   표준편차: 175.48150899739

시도해볼 사전 정보는 0.01. 최종적으로는 0.1 내외의 값이 q-table에 저장되므로 10%정도의 사전정보라고 판단했다.

사전 정보 가지고 1000번 학습:
- 사전 정보: action 0(0.1 베팅) = 0.01
1회차: 456
2회차: 383
3회차: 321
4회차: 557
5회차: 621
==> 평균: 467.6    표준편차: 109.82458741102

사전 정보 가지고 1000번 학습:
- 사전 정보: action 1(0.2 베팅) = 0.01
1회차: 437
2회차: 325
3회차: 261
4회차: 346
5회차: 538
==> 평균: 381.4    표준편차: 96.483366442097

- 사전 정보: action 3(0.6 베팅) = 0.01
1회차: 220
2회차: 426
3회차: 265
4회차: 318
5회차: 154
==> 평균: 276.6    표준편차: 92.09039037815

결과와 상관없이 안정적으로 선택하게 된다.

q-table을 block처럼
'''

'\n12.23\n사전 정보 없이 1000번 학습 후 10회 결과 (randomness = 1.0부터 *0.999로 decay):\n1회차: 204\n2회차: 583\n3회차: 104\n4회차: 336\n5회차: 484\n==> 평균: 342.2   표준편차: 175.48150899739\n\n시도해볼 사전 정보는 0.01. 최종적으로는 0.1 내외의 값이 q-table에 저장되므로 10%정도의 사전정보라고 판단했다.\n\n사전 정보 가지고 1000번 학습:\n- 사전 정보: action 0(0.1 베팅) = 0.01\n1회차: 456\n2회차: 383\n3회차: 321\n4회차: 557\n5회차: 621\n==> 평균: 467.6    표준편차: 109.82458741102\n\n사전 정보 가지고 1000번 학습:\n- 사전 정보: action 1(0.2 베팅) = 0.01\n1회차: 437\n2회차: 325\n3회차: 261\n4회차: 346\n5회차: 538\n==> 평균: 381.4    표준편차: 96.483366442097\n\n- 사전 정보: action 3(0.6 베팅) = 0.01\n1회차: 220\n2회차: 426\n3회차: 265\n4회차: 318\n5회차: 154\n==> 평균: 276.6    표준편차: 92.09039037815\n\n결과와 상관없이 안정적으로 선택하게 된다.\n\nq-table을 block처럼\n'

In [14]:
np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\q_table.csv', agent.q_table, delimiter=',')
np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\No Prior as Prior\softmax_q_table.csv', softmax_array(agent.q_table), delimiter=',')

In [15]:
total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = env.wealth
    total_reward += env.wealth

env.close()

print(total_reward)
np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\action_list.csv', action_array, delimiter=',')
np.savetxt(r'D:\52Material\Lab\RL Experiment\Gamble Environment\money_list.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

359
Action size:  4
State size:  31


In [12]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table2.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table2.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list2.csv', action_array, delimiter=',')
np.savetxt('money_list2.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

491
Action size:  4
State size:  31


In [13]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table3.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table3.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list3.csv', action_array, delimiter=',')
np.savetxt('money_list3.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])

        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

370
Action size:  4
State size:  31


In [14]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table4.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table4.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list4.csv', action_array, delimiter=',')
np.savetxt('money_list4.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

311
Action size:  4
State size:  31


In [15]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table5.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table5.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list5.csv', action_array, delimiter=',')
np.savetxt('money_list5.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

552
Action size:  4
State size:  31


In [16]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table6.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table6.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list6.csv', action_array, delimiter=',')
np.savetxt('money_list6.csv', money_array, delimiter=',')
print(total_reward)

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

435
435
Action size:  4
State size:  31


In [17]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table7.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table7.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list7.csv', action_array, delimiter=',')
np.savetxt('money_list7.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

362
Action size:  4
State size:  31


In [18]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table8.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table8.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list8.csv', action_array, delimiter=',')
np.savetxt('money_list8.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

297
Action size:  4
State size:  31


In [19]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table9.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table9.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list9.csv', action_array, delimiter=',')
np.savetxt('money_list9.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

110
Action size:  4
State size:  31


In [20]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table10.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table10.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list10.csv', action_array, delimiter=',')
np.savetxt('money_list10.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

346
Action size:  4
State size:  31


In [21]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table11.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table11.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list11.csv', action_array, delimiter=',')
np.savetxt('money_list11.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

602
Action size:  4
State size:  31


In [22]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table12.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table12.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list12.csv', action_array, delimiter=',')
np.savetxt('money_list12.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

460
Action size:  4
State size:  31


In [23]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table13.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table13.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list13.csv', action_array, delimiter=',')
np.savetxt('money_list13.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

341
Action size:  4
State size:  31


In [24]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table14.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table14.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list14.csv', action_array, delimiter=',')
np.savetxt('money_list14.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

476
Action size:  4
State size:  31


In [25]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table15.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table15.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list15.csv', action_array, delimiter=',')
np.savetxt('money_list15.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

233
Action size:  4
State size:  31


In [26]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table16.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table16.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list16.csv', action_array, delimiter=',')
np.savetxt('money_list16.csv', money_array, delimiter=',')

print(total_reward)

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

581
581
Action size:  4
State size:  31


In [27]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table17.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table17.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list17.csv', action_array, delimiter=',')
np.savetxt('money_list17.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

466
Action size:  4
State size:  31


In [28]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table18.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table18.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list18.csv', action_array, delimiter=',')
np.savetxt('money_list18.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

266
Action size:  4
State size:  31


In [29]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table19.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table19.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list19.csv', action_array, delimiter=',')
np.savetxt('money_list19.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

343
Action size:  4
State size:  31


In [30]:
agent.eps = 1
total_reward = 0
for i in range(300):
    state = env.reset()
    done = False
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        action = agent.get_action(state)                    # decide on an action
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    agent.eps = agent.eps * 0.999
    total_reward += reward

env.close()
agent.eps = 0
np.savetxt('q_table20.csv', agent.q_table, delimiter=',')
np.savetxt('softmax_q_table20.csv', softmax_array(agent.q_table), delimiter=',')

total_reward = 0
action_array = np.zeros([10, 10])
money_array =  np.zeros([10, 11])
for i in range(10):
    state = env.reset()
    done = False
    j = 0
    while not done:
    #    action = env.action_space.sample() # choosing a random action
        money_array[i][j] = env.wealth
        action = agent.get_action(state)                    # decide on an action
        action_array[i][j] = action
        j = j+1
        next_state, reward, done, info = env.step(action)   # doing the random action in the environment. step은 이 4가지를 return 해준다.
        print("state: ", state, "action: ", action)
        print("Training Session: ", i, "    Total reward: ", total_reward, "    randomness: ", agent.eps)
        agent.train((state, action, next_state, reward, done))   # 해당 action에 대해 train한다
        state = next_state
        env.render()                       # show the screen of the game
        #print(agent.q_table)               # show q-table after every action
        print_softmax_array(agent.q_table)
        #time.sleep(0.3)                   # 약간의 딜레이 시간 추가
        clear_output(wait = True)         # 한 번에 1개의 action에 대한 q-table만 보이도록
    money_array[i][10] = reward
    total_reward += reward

env.close()

print(total_reward)
np.savetxt('action_list20.csv', action_array, delimiter=',')
np.savetxt('money_list20.csv', money_array, delimiter=',')

class QAgent(Agent):
    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01):
        super().__init__(env)
        self.state_size = env.observation_space.n # 현 environment에서 가능한 discrete number of states를 값으로 받는다
        print("State size: ", self.state_size)    # in this example, 16 states exist (4*4 grid)
        
        self.eps = 1.0                            # exploration vs. exploitation. 1.0 means 100% random
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self): # state의 갯수만큼 rows, action의 갯수만큼 column을 만들고, 랜덤하게 initializae한다.
        #self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        self.q_table = np.zeros([self.state_size, self.action_size])
            
        
    def get_action(self, state):
        '''
        Just going for the "best policy" was bad, because it completely depends on the initialization that we have made!
        Use randomness to search through the whole area, and as time passes we will reduce randomness
        '''
        q_state = self.q_table[state]                 # current state
        action_greedy = np.argmax(q_state)            # max q-value among the possible choices
        action_random = super().get_action(state)     # 상위 class인 agent의 get_action. 이건 random한거로 정의되어 있다
        if random.random() < self.eps:                # random하게 0~1 값과 eps 비교
            return action_random
        else:
            return action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        # current state is terminal
        if done:
            q_next = np.zeros([self.action_size])
#         elif state == next_state:
#             q_next = np.zeros([self.action_size])
        else:
            q_next
        
        # what the next action is based on the q-table
        q_target = reward + self.discount_rate*np.max(q_next)
        
        # update table
        q_update = q_target - self.q_table[state, action]             # 해당 행동이 table과 얼마나 차이 나는지 계산 (좋은 행동이면 양수)
        self.q_table[state, action] += self.learning_rate * q_update  # update q-table after applying learning rate
        
        # reduce randomness after each epoch
        
        # penalty 함수도 있어야한다.
        # reward heuristic 재구성! 가까워질수록 보상이 더 있는 것이 좋다.
agent = QAgent(env)

372
Action size:  4
State size:  31
