In [100]:
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from IPython.display import clear_output as clear

In [101]:
# 超参数建立

# 0: Normal, 1: Treasure, 0: End
envMap = [0, 0, 0, 0, 0, 1, 0, 1, 2]
visMap = ['.', '*', '#']
ACTION = ['left', 'right']
EPSILOM = 0.5                                               # 贪心系数
GAMMA = 0.9
EPOCHS = 10                                                 # 训练次数
LEARNING_RATE = 0.2
qTable = np.zeros( ( len(envMap), len(ACTION) ) )               # 初始化 Q Table Q[s, a] -> 在 s 状态下 采取 a 行为的 Q 值
STEPCOUNT = []

In [102]:
# Environment 建立 -> 一维迷宫问题

def VisualMap():
    '''
    return: string of the map
    '''
    return [visMap[point] for point in envMap]

def GetReward(state, action):
    '''
    param:
        state: the coordinate of the agent
        action: the action agent will take
    return:
        nextState: as the agent take the action, the nextState it will be
        reward: this action's reward
    '''
    action = ACTION[action]
    if action == 'left':
        # 走不了
        if state == 0:
            nextState = state
            reward = -1
        # 正常走
        elif envMap[state - 1] == 0:
            nextState = state - 1
            reward = 0
        elif envMap[state - 1] == 1:
            nextState = state - 1
            reward = 1
        else:
            print("Error")
    elif action == 'right':
        # 走到终点
        if envMap[state + 1] == 2:
            nextState = 'end'
            reward = 3
        # 正常走
        elif envMap[state + 1] == 0:
            nextState = state + 1
            reward = 1
        elif envMap[state + 1] == 1:
            nextState = state + 1
            reward = 2
        else:
            print("Error")
    else:
        nextState = state
        # 惩罚不走的
        reward = -1
    
    return nextState, reward

def UpdateEnv(state, action, stepCounter):
    '''
    function:
        Visualize the gameplaying map
    '''
    originMap = VisualMap()
    
    # 标记 agent 所在位置

    if state == 'end':
        print("End!")
        print("Use %d steps" % stepCounter)
        return 

    originMap[state] = '+'
    sleep(1)
    clear()

    print(originMap)
    print("action:%s" % ACTION[action])
    print("Use %d steps" % stepCounter)


In [103]:
# Q-Learning 模型建立

def ChooseAction(table, state):
    '''
    param:
        table: the q table
        state: the state agent is in now
    return:
        action: the action based on the q table
    '''

    statesQ = table[state]

    # 搜索
    if np.random.uniform() > EPSILOM: 
        return np.random.randint(len(ACTION))
    # 贪心
    else:
        return np.argmax(statesQ)

def Train():
    for _ in range(EPOCHS):
        stepCounter = 0
        endLoop = False
        state = 0                               # search from the place 0
        while not endLoop:
            action = ChooseAction(qTable, state)
            nextState, reward = GetReward(state, action)
            # print(nextState, reward)
            # print(qTable)
            predictQ = qTable[state, action]    # 采取行动后需要迭代更新 Q 值

            # 结束
            if nextState == 'end':
                endLoop = True
                realQ = reward
            else:
                realQ = reward + GAMMA * np.max(qTable[state, :])

            qTable[state, action] = LEARNING_RATE * (realQ - predictQ)
            state = nextState
            stepCounter += 1
            # UpdateEnv(state, action, stepCounter)
        STEPCOUNT.append(stepCounter)
            

In [104]:
Train()
print(qTable)
print(STEPCOUNT)

[[-0.13707294  0.19607843]
 [ 0.02936132  0.19607843]
 [ 0.02936471  0.19607843]
 [ 0.02942231  0.19607843]
 [ 0.05882364  0.39215686]
 [ 0.02941178  0.19607843]
 [ 0.22571992  0.39215686]
 [ 0.07344     0.49999995]
 [ 0.          0.        ]]
[48, 14, 24, 15, 14, 10, 16, 8, 18, 8]
