In [1]:
# Install the packages used for the Box2D-based environment in this notebook.
# swig is installed first, in its own command — presumably the box2d-py source
# build (see the "Building wheel for box2d-py" output) needs it; TODO confirm.
!pip install swig
!pip install gymnasium[box2d]
!pip install box2d


Collecting swig
  Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.4.0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.4.0
Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp312-cp312-linux_x86_64.whl size=2398999 sha256=c79

In [6]:
import gymnasium as gym
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import deque

In [7]:
# Pick the compute device once: GPU when CUDA is usable, CPU otherwise.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
device

device(type='cuda')

In [8]:
# ---- DQN network ----
class DQN(nn.Module):
  """Fully connected Q-network producing one value per action.

  The network is two hidden layers of 128 units with ReLU activations,
  followed by a linear output layer of size ``act_dim``.
  """

  def __init__(self, obs_dim, act_dim):
    """Build the layer stack for inputs of size ``obs_dim``."""
    super().__init__()
    hidden = 128
    layers = [
      nn.Linear(obs_dim, hidden),
      nn.ReLU(),
      nn.Linear(hidden, hidden),
      nn.ReLU(),
      nn.Linear(hidden, act_dim),
    ]
    # Keep the stack under ``self.model`` so parameter names stay
    # ``model.<index>.*`` in the state_dict.
    self.model = nn.Sequential(*layers)

  def forward(self, x):
    """Run ``x`` through the layer stack and return the result."""
    q_values = self.model(x)
    return q_values


# ---- Replay Buffer ----
class ReplayBuffer:
  """Bounded FIFO store of transitions with uniform random sampling."""

  def __init__(self, capacity=100000):
    """Create an empty buffer holding at most ``capacity`` transitions."""
    # A deque with maxlen drops the oldest entry once the buffer is full.
    self.buffer = deque(maxlen=capacity)

  def push(self, s, a, r, sn, done):
    """Append one ``(s, a, r, sn, done)`` transition."""
    transition = (s, a, r, sn, done)
    self.buffer.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` distinct transitions, returned column-wise.

    Returns:
      ``(states, actions, rewards, next_states, dones)`` where the states and
      next states are numpy arrays and the other three are tuples.
    """
    picked = random.sample(self.buffer, batch_size)
    # Transpose the list of transitions into one tuple per field.
    columns = list(zip(*picked))
    states, actions, rewards, next_states, dones = columns
    return np.array(states), actions, rewards, np.array(next_states), dones

  def __len__(self):
    """Number of transitions currently stored."""
    return len(self.buffer)


# ---- DQN training ----
def train_dqn(episodes=600):
  """Train a DQN agent on LunarLander-v3.

  Uses epsilon-greedy exploration (epsilon is multiplied by a decay factor
  once per episode, down to a floor), a uniform replay buffer, and a target
  network that is hard-synchronised with the online network every 10
  episodes. Relies on the module-level ``DQN``, ``ReplayBuffer`` and
  ``device``.

  Args:
    episodes: Number of training episodes to run.

  Returns:
    A tuple ``(qnet, rewards_history)``: the trained online network and a
    list with the total (undiscounted) reward of each episode.
  """
  env = gym.make("LunarLander-v3")
  try:
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    qnet = DQN(obs_dim, act_dim).to(device)
    qtarget = DQN(obs_dim, act_dim).to(device)
    qtarget.load_state_dict(qnet.state_dict())

    optimizer = optim.Adam(qnet.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()
    buffer = ReplayBuffer()

    gamma = 0.99
    batch_size = 64
    epsilon = 1.0
    eps_min = 0.05
    eps_decay = 0.995

    rewards_history = []

    for ep in range(episodes):
      state, _ = env.reset()
      total_reward = 0

      for _ in range(1000):
        # epsilon-greedy action selection
        if random.random() < epsilon:
          action = env.action_space.sample()
        else:
          st = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
          # No gradients are needed to pick an action.
          with torch.no_grad():
            action = torch.argmax(qnet(st)).item()

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        total_reward += reward

        # Store `terminated`, not `done`: a truncated episode did not reach a
        # terminal state, so its TD target must still bootstrap from
        # next_state. `done` is only used to end the episode loop.
        buffer.push(state, action, reward, next_state, terminated)
        state = next_state

        if len(buffer) > batch_size:
          s, a, r, sn, d = buffer.sample(batch_size)

          s = torch.tensor(s, dtype=torch.float32).to(device)
          sn = torch.tensor(sn, dtype=torch.float32).to(device)
          a = torch.tensor(a).long().to(device)
          r = torch.tensor(r, dtype=torch.float32).to(device)
          d = torch.tensor(d, dtype=torch.float32).to(device)

          # Q(s, a) for the actions that were actually taken
          qvals = qnet(s).gather(1, a.unsqueeze(1)).squeeze(1)

          # TD target from the target network; it is a constant for the
          # loss, so it is computed without building an autograd graph.
          with torch.no_grad():
            next_q = qtarget(sn).max(1)[0]
            target = r + gamma * next_q * (1 - d)

          loss = loss_fn(qvals, target)

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

        if done:
          break

      rewards_history.append(total_reward)
      epsilon = max(eps_min, epsilon * eps_decay)

      # Hard update: copy the online weights into the target network
      # every 10 episodes.
      if ep % 10 == 0:
        qtarget.load_state_dict(qnet.state_dict())

      print(f"Episode {ep} | Reward: {total_reward:.1f} | Epsilon: {epsilon:.3f}")

    return qnet, rewards_history
  finally:
    # Release the environment even if training is interrupted.
    env.close()


In [None]:
# ---- Run training ----
# Trains for 500 episodes (overriding train_dqn's default of 600) and keeps
# the trained network and the per-episode reward list for later cells.
qnet, rewards = train_dqn(episodes=500)

Episode 0 | Reward: -348.2 | Epsilon: 0.995
Episode 1 | Reward: -288.9 | Epsilon: 0.990
Episode 2 | Reward: -43.9 | Epsilon: 0.985
Episode 3 | Reward: -273.8 | Epsilon: 0.980
Episode 4 | Reward: -468.0 | Epsilon: 0.975
Episode 5 | Reward: -132.5 | Epsilon: 0.970
Episode 6 | Reward: -198.8 | Epsilon: 0.966
Episode 7 | Reward: -508.8 | Epsilon: 0.961
Episode 8 | Reward: -329.2 | Epsilon: 0.956
Episode 9 | Reward: -80.1 | Epsilon: 0.951
Episode 10 | Reward: -221.9 | Epsilon: 0.946
Episode 11 | Reward: -101.0 | Epsilon: 0.942
Episode 12 | Reward: -92.6 | Epsilon: 0.937
Episode 13 | Reward: -376.4 | Epsilon: 0.932
Episode 14 | Reward: -88.7 | Epsilon: 0.928
Episode 15 | Reward: -54.8 | Epsilon: 0.923
Episode 16 | Reward: -27.5 | Epsilon: 0.918
Episode 17 | Reward: -121.5 | Epsilon: 0.914
Episode 18 | Reward: -182.0 | Epsilon: 0.909
Episode 19 | Reward: -83.9 | Epsilon: 0.905
Episode 20 | Reward: -28.6 | Epsilon: 0.900
Episode 21 | Reward: -99.4 | Epsilon: 0.896
Episode 22 | Reward: -75.9 | 

In [None]:
# ---- Visualization ----
# Draw the per-episode reward curve on the current axes.
ax = plt.gca()
ax.plot(rewards)
ax.set_title("DQN Training Reward")
ax.set_xlabel("Episode")
ax.set_ylabel("Reward")
ax.grid()
plt.show()


In [None]:
import imageio
from IPython.display import HTML
from base64 import b64encode


In [None]:
# Save the trained network's weights (its state_dict only) to a checkpoint file.
torch.save(qnet.state_dict(), "dqn_enhansed_lunarlander.pt")


In [None]:
def play_colab(model, episodes=1):
  """Roll out a greedy policy, record it to ``lander.mp4`` and embed the video.

  Runs ``episodes`` episodes of LunarLander-v3 in ``rgb_array`` render mode,
  always taking the argmax action of ``model``, and collects one rendered
  frame per step. Relies on the module-level ``device``, ``imageio``,
  ``HTML`` and ``b64encode``.

  Args:
    model: Network called on a ``(1, obs_dim)`` float32 tensor; the index of
      its largest output is used as the action.
    episodes: Number of episodes to record back to back into one video.

  Returns:
    An ``IPython.display.HTML`` object with the video inlined as a base64
    data URL.
  """
  env = gym.make("LunarLander-v3", render_mode="rgb_array")
  frames = []

  try:
    for _ in range(episodes):
      state, _ = env.reset()
      done = False

      while not done:
        st = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
          action = torch.argmax(model(st)).item()

        state, _, term, trunc, _ = env.step(action)
        done = term or trunc

        frames.append(env.render())
  finally:
    # Close the environment even if the rollout fails part-way.
    env.close()

  # save video
  imageio.mimsave("lander.mp4", frames, fps=60)

  # Read the file back through a context manager so the handle is closed.
  with open("lander.mp4", "rb") as video_file:
    mp4 = video_file.read()
  data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
  return HTML(f"<video width=480 controls><source src='{data_url}' type='video/mp4'></video>")


In [None]:
obs_dim = 8         # LunarLander-v3 observations always have 8 components
act_dim = 4         # 4 actions

model = DQN(obs_dim, act_dim).to(device)
# Load the checkpoint this notebook actually writes in its torch.save cell;
# the previous name "dqn_lunarlander.pt" is never created here, so a fresh
# run failed with FileNotFoundError.
model.load_state_dict(torch.load("dqn_enhansed_lunarlander.pt", map_location=device))
model.eval()

play_colab(model)
