<a href="https://colab.research.google.com/github/Nanda74775/MMO_LAB/blob/main/MMO_LAB7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym numpy matplotlib




In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Create the CartPole-v1 environment that every agent below is trained and evaluated on
env = gym.make('CartPole-v1')

In [None]:
# State-space discretization.
# Dimensions 0 and 2 take their range from the environment's observation space;
# dimensions 1 and 3 use hand-picked finite ranges (+/-0.5 and +/-50 degrees in radians)
# -- presumably because the environment reports those dimensions as unbounded; confirm.
n_bins = (6, 12, 6, 12)  # number of bins for each dimension
lower_bounds = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -np.radians(50)]
upper_bounds = [env.observation_space.high[0], 0.5, env.observation_space.high[2], np.radians(50)]


In [None]:
def discretize_state(state, n_bins, lower_bounds, upper_bounds):
    """Map a continuous observation onto a tuple of integer bin indices.

    Each component is rescaled to a fraction of its ``[lower, upper]`` range,
    multiplied by ``n_bins - 1``, rounded to the nearest integer and finally
    clamped to ``[0, n_bins - 1]`` so that out-of-range values land in the
    first or last bin.

    Args:
        state: Indexable sequence of numeric observation components.
        n_bins: Number of bins for each component.
        lower_bounds: Lower limit of the range for each component.
        upper_bounds: Upper limit of the range for each component.

    Returns:
        Tuple of ``len(state)`` integer indices, usable as a Q-table index.
    """
    bin_indices = []
    for dim in range(len(state)):
        span = upper_bounds[dim] - lower_bounds[dim]
        fraction = (state[dim] - lower_bounds[dim]) / span
        last_bin = n_bins[dim] - 1
        nearest = int(round(last_bin * fraction))
        # Clamp so values outside the configured range still index a valid bin.
        bin_indices.append(min(last_bin, max(0, nearest)))
    return tuple(bin_indices)

In [None]:
def sarsa(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds):
    """Train a tabular SARSA (on-policy TD control) agent on a discretized state space.

    Args:
        env: Environment whose ``reset()`` returns an observation, whose
            ``step(action)`` returns ``(observation, reward, done, info)`` and
            whose ``action_space`` exposes ``n`` and ``sample()``.
        episodes: Number of training episodes.
        alpha: Step size applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a random action instead of the greedy one.
        n_bins: Tuple with the number of bins per observation dimension.
        lower_bounds: Per-dimension lower limits forwarded to ``discretize_state``.
        upper_bounds: Per-dimension upper limits forwarded to ``discretize_state``.

    Returns:
        ``numpy.ndarray`` of shape ``n_bins + (env.action_space.n,)`` with the
        learned action values.
    """
    q_table = np.zeros(n_bins + (env.action_space.n,))

    def choose_action(state, epsilon):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if np.random.random() < epsilon:
            return env.action_space.sample()
        return np.argmax(q_table[state])

    for _ in range(episodes):
        state = discretize_state(env.reset(), n_bins, lower_bounds, upper_bounds)
        action = choose_action(state, epsilon)

        done = False
        while not done:
            next_state_raw, reward, done, _ = env.step(action)
            next_state = discretize_state(next_state_raw, n_bins, lower_bounds, upper_bounds)
            next_action = choose_action(next_state, epsilon)

            # A terminal state has no future return, so the target must not
            # bootstrap from it. Because of the coarse binning the terminal
            # observation can share a cell with ordinary states, so its table
            # entry is generally non-zero and has to be masked explicitly.
            # If the environment also sets `done` on a step limit, those steps
            # are treated as terminal as well.
            next_value = 0.0 if done else q_table[next_state + (next_action,)]
            td_error = reward + gamma * next_value - q_table[state + (action,)]
            q_table[state + (action,)] += alpha * td_error

            state = next_state
            action = next_action

    return q_table

In [None]:
def q_learning(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds):
    """Train a tabular Q-learning (off-policy TD control) agent on a discretized state space.

    Args:
        env: Environment whose ``reset()`` returns an observation, whose
            ``step(action)`` returns ``(observation, reward, done, info)`` and
            whose ``action_space`` exposes ``n`` and ``sample()``.
        episodes: Number of training episodes.
        alpha: Step size applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a random action instead of the greedy one.
        n_bins: Tuple with the number of bins per observation dimension.
        lower_bounds: Per-dimension lower limits forwarded to ``discretize_state``.
        upper_bounds: Per-dimension upper limits forwarded to ``discretize_state``.

    Returns:
        ``numpy.ndarray`` of shape ``n_bins + (env.action_space.n,)`` with the
        learned action values.
    """
    q_table = np.zeros(n_bins + (env.action_space.n,))

    def choose_action(state, epsilon):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if np.random.random() < epsilon:
            return env.action_space.sample()
        return np.argmax(q_table[state])

    for _ in range(episodes):
        state = discretize_state(env.reset(), n_bins, lower_bounds, upper_bounds)

        done = False
        while not done:
            action = choose_action(state, epsilon)
            next_state_raw, reward, done, _ = env.step(action)
            next_state = discretize_state(next_state_raw, n_bins, lower_bounds, upper_bounds)

            # A terminal state has no future return, so the target must not
            # bootstrap from it. Because of the coarse binning the terminal
            # observation can share a cell with ordinary states, so its table
            # entries are generally non-zero and have to be masked explicitly.
            # If the environment also sets `done` on a step limit, those steps
            # are treated as terminal as well.
            next_value = 0.0 if done else np.max(q_table[next_state])
            td_error = reward + gamma * next_value - q_table[state + (action,)]
            q_table[state + (action,)] += alpha * td_error

            state = next_state

    return q_table

In [None]:
def double_q_learning(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds):
    """Train a tabular Double Q-learning agent on a discretized state space.

    Two tables are maintained. On every step a fair coin decides which table is
    updated; the greedy next action is selected with the table being updated
    and evaluated with the other one. Actions are chosen epsilon-greedily with
    respect to the sum of both tables.

    Args:
        env: Environment whose ``reset()`` returns an observation, whose
            ``step(action)`` returns ``(observation, reward, done, info)`` and
            whose ``action_space`` exposes ``n`` and ``sample()``.
        episodes: Number of training episodes.
        alpha: Step size applied to the TD error.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a random action instead of the greedy one.
        n_bins: Tuple with the number of bins per observation dimension.
        lower_bounds: Per-dimension lower limits forwarded to ``discretize_state``.
        upper_bounds: Per-dimension upper limits forwarded to ``discretize_state``.

    Returns:
        ``numpy.ndarray`` of shape ``n_bins + (env.action_space.n,)`` holding the
        element-wise sum of the two learned tables.
    """
    q_table1 = np.zeros(n_bins + (env.action_space.n,))
    q_table2 = np.zeros(n_bins + (env.action_space.n,))

    def choose_action(state, epsilon):
        # Epsilon-greedy with respect to the combined estimate of both tables.
        if np.random.random() < epsilon:
            return env.action_space.sample()
        return np.argmax(q_table1[state] + q_table2[state])

    for _ in range(episodes):
        state = discretize_state(env.reset(), n_bins, lower_bounds, upper_bounds)

        done = False
        while not done:
            action = choose_action(state, epsilon)
            next_state_raw, reward, done, _ = env.step(action)
            next_state = discretize_state(next_state_raw, n_bins, lower_bounds, upper_bounds)

            # Pick which table learns on this step; the other one evaluates.
            if np.random.random() < 0.5:
                learner, evaluator = q_table1, q_table2
            else:
                learner, evaluator = q_table2, q_table1

            # A terminal state has no future return, so the target must not
            # bootstrap from it. Because of the coarse binning the terminal
            # observation can share a cell with ordinary states, so its table
            # entries are generally non-zero and have to be masked explicitly.
            # If the environment also sets `done` on a step limit, those steps
            # are treated as terminal as well.
            if done:
                next_value = 0.0
            else:
                best_next_action = np.argmax(learner[next_state])
                next_value = evaluator[next_state + (best_next_action,)]

            td_error = reward + gamma * next_value - learner[state + (action,)]
            learner[state + (action,)] += alpha * td_error

            state = next_state

    return q_table1 + q_table2

In [None]:
# Training hyperparameters (shared by all three algorithms)
# Number of training episodes per algorithm
episodes = 1000
# Step size applied to the TD error in each Q-table update
alpha = 0.1
# Discount factor applied to the bootstrapped next-state value
gamma = 0.99
# Probability of taking a random action during training (epsilon-greedy)
epsilon = 0.1

In [None]:
# Training: fit one Q-table per algorithm using the same environment,
# hyperparameters and discretization settings
q_table_sarsa = sarsa(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds)
q_table_q_learning = q_learning(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds)
q_table_double_q = double_q_learning(env, episodes, alpha, gamma, epsilon, n_bins, lower_bounds, upper_bounds)


In [None]:
# Policy evaluation
def evaluate_policy(env, q_table, episodes=100):
    """Run the greedy policy of ``q_table`` and return its mean episode reward.

    The discretization settings are read from the module-level names
    ``n_bins``, ``lower_bounds`` and ``upper_bounds``.

    Args:
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(observation, reward, done, info)``.
        q_table: Action-value table indexed by a discretized state tuple.
        episodes: Number of evaluation episodes to average over.

    Returns:
        Mean of the summed (undiscounted) rewards over all evaluation episodes.
    """
    episode_returns = []
    for _ in range(episodes):
        current = discretize_state(env.reset(), n_bins, lower_bounds, upper_bounds)
        score = 0
        finished = False
        while not finished:
            # No exploration here: always take the highest-valued action.
            greedy_action = np.argmax(q_table[current])
            observation, reward, finished, _ = env.step(greedy_action)
            score += reward
            current = discretize_state(observation, n_bins, lower_bounds, upper_bounds)
        episode_returns.append(score)
    return np.mean(episode_returns)


In [None]:
# Evaluate each learned Q-table with the greedy policy (100 episodes each, the default)
mean_reward_sarsa = evaluate_policy(env, q_table_sarsa)
mean_reward_q_learning = evaluate_policy(env, q_table_q_learning)
mean_reward_double_q = evaluate_policy(env, q_table_double_q)


In [None]:
# Report the mean evaluation reward of the SARSA agent
print(f"SARSA mean reward: {mean_reward_sarsa}")

SARSA mean reward: 9.63


In [None]:
# Report the mean evaluation reward of the Q-learning agent
print(f"Q-Learning mean reward: {mean_reward_q_learning}")

Q-Learning mean reward: 10.85


In [None]:
# Report the mean evaluation reward of the Double Q-learning agent
print(f"Double Q-Learning mean reward: {mean_reward_double_q}")

Double Q-Learning mean reward: 10.92
