In [1]:
# @title SARSA

import numpy as np

# 定义迷宫环境
maze = np.array([
    [0, 0, 0, 0],
    [0, -1, 0, -1],
    [0, 0, 0, -1],
    [-1, 0, 0, 1]
])

# 定义起始状态和终止状态
start_state = (3, 0)
goal_state = (3, 3)

# 定义动作空间
actions = [(0, 1), (0, -1), (-1, 0), (1, 0)]

# 初始化状态-动作值函数
Q = np.zeros((4, 4, 4))

# 定义参数
alpha = 0.1
gamma = 0.9
epsilon = 0.1
max_episodes = 100

# SARSA算法
for episode in range(max_episodes):
    state = start_state
    action = np.random.choice(range(4)) if np.random.rand() < epsilon else np.argmax(Q[state])

    while state != goal_state:
        # next_state = (state[0] + actions[action][0], state[1] + actions[action][1])
        a = state[0] + actions[action][0]
        b = state[1] + actions[action][1]
        if a > 3:
            a-=1
        elif b > 3:
            b-=1
        elif a < -4:
            a+= 1
        elif b < -4:
            b+= 1
        next_state = (a,b)
        reward = maze[next_state]
        next_action = np.random.choice(range(4)) if np.random.rand() < epsilon else np.argmax(Q[next_state])
        Q[state][action] += alpha * (reward + gamma * Q[next_state][next_action] - Q[state][action])

        state = next_state
        action = next_action

# 输出结果
for i in range(4):
    for j in range(4):
        print("State:", (i, j))
        print("Up:", Q[i][j][0])
        print("Down:", Q[i][j][1])
        print("Left:", Q[i][j][2])
        print("Right:", Q[i][j][3])
        print()

State: (0, 0)
Up: 0.0
Down: 0.0
Left: 0.0
Right: 0.0

State: (0, 1)
Up: 0.0
Down: 0.0
Left: 0.0008100000000000002
Right: 0.0

State: (0, 2)
Up: 0.14332007988258194
Down: 0.0
Left: 5.2830058699680125
Right: 0.0

State: (0, 3)
Up: 1.5674630322623528
Down: 0.8811492877210296
Left: 8.198004522953187
Right: -0.33941869602293595

State: (1, 0)
Up: 0.0
Down: 0.0
Left: 0.0
Right: 0.0

State: (1, 1)
Up: 0.5624395308717249
Down: 0.0
Left: 0.0
Right: 0.0

State: (1, 2)
Up: -0.16956804177412949
Down: -0.15877701030762584
Left: -7.290000000000002e-05
Right: 2.533795309785333

State: (1, 3)
Up: -0.19
Down: 0.19531958225870524
Left: 0.0
Right: 0.0

State: (2, 0)
Up: 1.2072143326596108
Down: 0.0
Left: 0.0
Right: 0.0

State: (2, 1)
Up: 4.184946618635917
Down: 0.0
Left: -0.1768246855497998
Right: 0.0

State: (2, 2)
Up: 0.7745932541698167
Down: 0.6310549480159503
Left: 0.0787228053124741
Right: 6.295730172517579

State: (2, 3)
Up: 0.5792823336656345
Down: 5.81638259865239
Left: -0.2705940067289908
Right:

In [6]:
# @title Q-обучение

import numpy as np

class QLearningAgent:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.Q = np.zeros((num_states, num_actions))  # 初始化状态-动作值函数为0

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.num_actions)  # 以epsilon的概率随机选择动作
        else:
            return np.argmax(self.Q[state])  # 否则选择具有最大Q值的动作

    def learn(self, state, action, reward, next_state):
        # 使用Q-learning更新Q值函数
        best_next_action = np.argmax(self.Q[next_state])  # 选择下一个状态下具有最大Q值的动作
        td_target = reward + self.discount_factor * self.Q[next_state, best_next_action]  # 计算TD目标
        td_error = td_target - self.Q[state, action]  # 计算TD误差
        self.Q[state, action] += self.learning_rate * td_error  # 更新Q值函数

# 创建一个简单的网格世界环境
# 在这个环境中，智能体需要找到一个特定位置，并获得+1的奖励，其他位置奖励为0
num_states = 5
num_actions = 4  # 上、下、左、右四个动作

# 初始化Q-learning智能体
agent = QLearningAgent(num_states, num_actions)

# 运行Q-learning算法进行训练
num_episodes = 1000
for episode in range(num_episodes):
    state = np.random.randint(num_states)  # 随机选择一个起始状态
    done = False
    while not done:
        action = agent.choose_action(state)  # 根据当前策略选择动作
        if action == 0:  # 上
            next_state = max(0, state - 1)
        elif action == 1:  # 下
            next_state = min(num_states - 1, state + 1)
        elif action == 2:  # 左
            next_state = max(0, state - 1)
        else:  # 右
            next_state = min(num_states - 1, state + 1)

        reward = 0
        if next_state == num_states - 1:  # 到达目标位置，获得+1的奖励
            reward = 1
            done = True

        # 更新Q值函数
        agent.learn(state, action, reward, next_state)

        state = next_state  # 更新状态

# 打印学习到的Q值函数
print("Learned Q-values:")
print(agent.Q)


Learned Q-values:
[[1.43659092 0.36577292 1.59253891 3.64937476]
 [1.54432707 4.08513067 1.51843521 2.40630573]
 [2.78177559 3.66249678 2.27370049 4.55156673]
 [3.284947   3.98496246 3.16216149 5.06385588]
 [4.51892753 1.17672809 1.12473567 1.73250984]]


In [7]:
# @title Двойное Q-обучение

import numpy as np

class DoubleQLearningAgent:
    def __init__(self, num_states, num_actions, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        self.num_states = num_states
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.Q1 = np.zeros((num_states, num_actions))  # Q1表
        self.Q2 = np.zeros((num_states, num_actions))  # Q2表

    def choose_action(self, state):
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.num_actions)  # 以epsilon的概率随机选择动作
        else:
            q_values = self.Q1[state] + self.Q2[state]  # 使用两个Q值函数的平均值来选择动作
            return np.argmax(q_values)  # 选择具有最大Q值的动作

    def learn(self, state, action, reward, next_state):
        if np.random.uniform(0, 1) < 0.5:  # 以0.5的概率选择更新哪个Q值函数
            best_next_action = np.argmax(self.Q1[next_state])
            td_target = reward + self.discount_factor * self.Q2[next_state, best_next_action]
            td_error = td_target - self.Q1[state, action]
            self.Q1[state, action] += self.learning_rate * td_error
        else:
            best_next_action = np.argmax(self.Q2[next_state])
            td_target = reward + self.discount_factor * self.Q1[next_state, best_next_action]
            td_error = td_target - self.Q2[state, action]
            self.Q2[state, action] += self.learning_rate * td_error

# 创建一个简单的网格世界环境
# 在这个环境中，智能体需要找到特定位置，并获得+1的奖励，其他位置奖励为0
num_states = 5
num_actions = 4  # 上、下、左、右四个动作

# 初始化Double Q-learning智能体
agent = DoubleQLearningAgent(num_states, num_actions)

# 运行Double Q-learning算法进行训练
num_episodes = 1000
for episode in range(num_episodes):
    state = np.random.randint(num_states)  # 随机选择一个起始状态
    done = False
    while not done:
        action = agent.choose_action(state)  # 根据当前策略选择动作
        if action == 0:  # 上
            next_state = max(0, state - 1)
        elif action == 1:  # 下
            next_state = min(num_states - 1, state + 1)
        elif action == 2:  # 左
            next_state = max(0, state - 1)
        else:  # 右
            next_state = min(num_states - 1, state + 1)

        reward = 0
        if next_state == num_states - 1:  # 到达目标位置，获得+1的奖励
            reward = 1
            done = True

        # 更新Q值函数
        agent.learn(state, action, reward, next_state)

        state = next_state  # 更新状态

# 打印学习到的Q值函数
print("Learned Q-values:")
print("Q1:")
print(agent.Q1)
print("Q2:")
print(agent.Q2)


Learned Q-values:
Q1:
[[0.43499351 0.         0.32911716 3.0301276 ]
 [1.48585265 1.55812233 0.82279627 3.46324513]
 [1.07178318 3.91453899 1.43966066 1.96652781]
 [1.22365307 3.24724382 1.86128169 4.37919984]
 [3.79371819 0.57276659 0.61031959 0.59604726]]
Q2:
[[0.11177675 0.19437398 0.         2.95097101]
 [0.51674766 0.68537353 0.74017839 3.46560802]
 [0.9801829  3.89999628 1.67943174 0.86921794]
 [1.75241339 2.73605752 2.14560506 4.40124908]
 [3.81455617 0.9422444  0.2509109  0.        ]]
