In [1]:
import time
import numpy as np
import pandas as pd

In [2]:
np.random.seed(2)

In [3]:
N_STATES = 6 # 1维世界的宽度
ACTIONS = ['left', 'right']    # agent可用的动作
EPSILON = 0.9   # 贪婪度  greedy
ALPHA = 0.1   # 学习率
GAMMA = 0.9   # 奖励递减值
MAX_EPISODES = 13  # 最大回合数
FRESH_TIME = 0.01   # 移动间隔时间, 想让他走慢点就把时间设置大点

In [4]:
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),    # q_table全0初始
        columns = actions,  # columns 对应的是行为的名称
    )
    return table

In [72]:
print(np.zeros((N_STATES, len(ACTIONS))))
table2 = pd.DataFrame(np.array([[1,2],[3,4]]), columns=["a", 'b'], index=['aa','bb'])
print(table2)
print(table2.values)
print(type(table2))
print(table2.shape)
print("----------")
print(table2.index)
print(table2.columns)
print(table2.loc[table2.index[0], table2.columns[1]])
print("-----------")
print(table2.iloc[0,0])
print(table2.head(1))   # 前N行
print(table2.tail(1))   # 后
print("-----convert rows and colums------")
print(table2.T)
print("没一个0，就返回True")
print(np.array([1,2,3]).all())
print(np.array([1,2,3]).all()==0)
print(np.array([1,0,3]).all())
print(np.array([1,0,3]).all()==0)
print(np.array([0,0]).all())
print(np.array([0,0]).all()==0)
print(np.array([]).all())
print(np.array([]).all()==0)
print("---------")
print(np.random.uniform())   # deault: 0-1, 可以自己定义low/high
print(np.array([1,2,3]).argmax())  # #取出list中元素最大值所对应的索引
print(np.random.choice([1,2,3]))

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]
    a  b
aa  1  2
bb  3  4
[[1 2]
 [3 4]]
<class 'pandas.core.frame.DataFrame'>
(2, 2)
----------
Index(['aa', 'bb'], dtype='object')
Index(['a', 'b'], dtype='object')
2
-----------
1
    a  b
aa  1  2
    a  b
bb  3  4
-----convert rows and colums------
   aa  bb
a   1   3
b   2   4
没一个0，就返回True
True
False
False
True
False
True
True
False
---------
0.8835216841681864
2
3


In [73]:
# 在某个state节点选择行为
def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]   # 选出这个 state 的所有action值
    # 非贪婪 or 这个state还没有探索过
    if (np.random.uniform() > EPSILON) or (state_actions.all() == 0):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.argmax()   # 贪婪模型
    return action_name

In [35]:
def get_env_feedback(S, A):
    # this is how agent will interact with the environment
    if A == 'right':  # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:
        R = 0
        if S == 0:
            S_ = S   # reach the wall
        else:
            S_ = S - 1
    return S_, R

In [43]:
def update_env(S, episode, step_counter):
    # this is how environment be updated
    env_list = ['-']*(N_STATES-1) + ['T']   # ’-----------T‘ our environment
    print(env_list)
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s ' % (episode+1, step_counter)
        print("\r{}".format(interaction), end='')
        time.sleep(2)
        print("\r                                  ", end='')
        
    else:
        env_list[5] = 'o'
        interaction = ''.join(env_list)
        print("inter:", interaction)
        print("2:", env_list)
        print("\r{}".format(interaction), end='')
        time.sleep(FRESH_TIME)

In [74]:
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)   # 初始化q table
    print("q_table: \n", q_table)
    
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # 回合初始位置
        is_terminated = False  
        update_env(S, episode, step_counter)
        while not is_terminated:
            
            A = choose_action(S, q_table)
            # print("this time we choose:", A)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                # 实际的(状态-行为)值(回合没有结束)
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
            else:
                q_target = R   # 实际的(状态-行为)值（回合结束）
                is_terminated = True
            
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # q_table更新
            S = S_  # 探索者移动到下一个state
            
            update_env(S, episode, step_counter+1)   # 环境更新
            
            step_counter += 1
        return q_table

In [75]:
if __name__ == '__main__':
    q_table = rl()
    print("\r\nQ-table:\n")
    print(q_table)

q_table: 
    left  right
0   0.0    0.0
1   0.0    0.0
2   0.0    0.0
3   0.0    0.0
4   0.0    0.0
5   0.0    0.0
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choo

inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: left
['-', '-', '-', '-', '-', 'T']
inter: -----o
2: ['-', '-', '-', '-', '-', 'o']
-----othis time we choose: right
['-', '-', '-', '-', '-', 'T']