Реализуйте алгоритм GAIL на среде Mountain Car. Перед этим сгенерируйте экспертные данные (из детерминированной стратегии с первой практики). Хорошей идеей будет добавить в state (observation) синус и косинус от временной метки t для лучшего обучения.

In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
import torch.optim as optim
import torch.nn.functional as F

from collections import deque
import random

In [19]:
!pip install gymnasium -q

^C


In [2]:
env = gym.make("MountainCar-v0")

In [3]:
def generate_expert_data(env, num_episodes=100):
    states = []
    actions = []

    for episode in range(num_episodes):
        obs, _ = env.reset()
        done = False
        t = 0  # Временная метка

        while not done:
            # Выбираем действие с помощью экспертной стратегии
            action = expert_policy(obs)

            # Совершаем шаг в среде
            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Дополняем состояние синусом и косинусом временной метки t
            extended_state = np.append(obs, [np.sin(t), np.cos(t)])
            states.append(extended_state)
            actions.append(action)

            # Обновляем состояние и временную метку
            obs = next_obs
            t += 1

    return np.array(states, dtype=np.float32), np.array(actions, dtype=np.int64)

In [4]:
def expert_policy(obs):
    _, velocity = obs
    if velocity < 0:
        return 0  # Толкать влево
    else:
        return 2  # Толкать вправо

In [6]:
states, actions = generate_expert_data(env)

In [7]:
obs_dim = 4
act_dim = 3
expert_obs, expert_acts = generate_expert_data(env)

In [8]:
class Policy(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 64), nn.ReLU(),
            nn.Linear(64, act_dim)
        )

    def forward(self, obs):
        logits = self.net(obs)
        return Categorical(logits=logits)

    def get_action(self, obs):
        dist = self.forward(obs)
        return dist.sample().item()

In [9]:
class Discriminator(nn.Module):
    def __init__(self, obs_dim, act_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim + act_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    
    def forward(self, obs, act):
        act_onehot = F.one_hot(act, num_classes=3).float()
        x = torch.cat([obs, act_onehot], dim=1)
        return self.net(x)

In [10]:
class TrajectoryBuffer:
    def __init__(self):
        self.obs, self.acts, self.rews = [], [], []

    def store(self, o, a, r):
        self.obs.append(o)
        self.acts.append(a)
        self.rews.append(r)

    def get(self):
        return (
            torch.tensor(np.array(self.obs), dtype=torch.float32),
            torch.tensor(np.array(self.acts), dtype=torch.long),
            torch.tensor(np.array(self.rews), dtype=torch.float32)
        )

In [11]:
policy = Policy(obs_dim, act_dim)
discrim = Discriminator(obs_dim, act_dim)

policy_opt = optim.Adam(policy.parameters(), lr=3e-4)
discrim_opt = optim.Adam(discrim.parameters(), lr=1e-4)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
#!pip install scikit-learn

In [13]:

expert_obs_tensor = torch.tensor(expert_obs, dtype=torch.float32)
expert_acts_tensor = torch.tensor(expert_acts, dtype=torch.long)

In [None]:
for epoch in range(1000):
    buf = TrajectoryBuffer()
    obs, _ = env.reset()
    done = False
    t = 0

    while not done: 
        augmented_obs = np.append(obs, [np.sin(t), np.cos(t)])
        obs_tensor = torch.tensor(augmented_obs, dtype=torch.float32).unsqueeze(0)
        action = policy.get_action(obs_tensor)

        next_obs, _, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        buf.store(augmented_obs, action, 0)
        obs = next_obs
        t += 1

    agent_obs, agent_acts, _ = buf.get()

    for _ in range(5):
        discrim_opt.zero_grad()
        
        # Выбираем случайные экспертные данные
        expert_idx = np.random.choice(len(expert_obs_tensor), len(agent_obs), replace=False)
        expert_batch_obs = expert_obs_tensor[expert_idx]
        expert_batch_acts = expert_acts_tensor[expert_idx]
        
        expert_probs = discrim(expert_batch_obs, expert_batch_acts)
        agent_probs = discrim(agent_obs, agent_acts)
        
        disc_loss = (
            F.binary_cross_entropy(expert_probs, torch.ones_like(expert_probs)) +
            F.binary_cross_entropy(agent_probs, torch.zeros_like(agent_probs))
        ) / 2
        
        disc_loss.backward()
        discrim_opt.step()
    
    with torch.no_grad():
        agent_probs = discrim(agent_obs, agent_acts)
        rewards = -torch.log(agent_probs + 1e-8)

    # Обучение политики
    policy_opt.zero_grad()
    dists = policy(agent_obs)
    log_probs = dists.log_prob(agent_acts)
    loss = -(log_probs * rewards.squeeze()).mean()
    loss.backward()
    policy_opt.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Policy Loss {loss.item():.2f}, Disc Loss {disc_loss.item():.2f}")


Epoch 0: Policy Loss 0.76, Disc Loss 0.70
Epoch 10: Policy Loss 0.76, Disc Loss 0.68
Epoch 20: Policy Loss 0.79, Disc Loss 0.67
Epoch 30: Policy Loss 0.80, Disc Loss 0.65
Epoch 40: Policy Loss 0.83, Disc Loss 0.63
Epoch 50: Policy Loss 0.88, Disc Loss 0.60
Epoch 60: Policy Loss 0.95, Disc Loss 0.59
Epoch 70: Policy Loss 1.04, Disc Loss 0.55
Epoch 80: Policy Loss 1.11, Disc Loss 0.53
Epoch 90: Policy Loss 1.23, Disc Loss 0.48
Epoch 100: Policy Loss 1.28, Disc Loss 0.46
Epoch 110: Policy Loss 1.36, Disc Loss 0.43
Epoch 120: Policy Loss 1.34, Disc Loss 0.39
Epoch 130: Policy Loss 1.24, Disc Loss 0.40
Epoch 140: Policy Loss 1.28, Disc Loss 0.33
Epoch 150: Policy Loss 1.24, Disc Loss 0.30
Epoch 160: Policy Loss 1.16, Disc Loss 0.26
Epoch 170: Policy Loss 1.03, Disc Loss 0.29
Epoch 180: Policy Loss 0.98, Disc Loss 0.21
Epoch 190: Policy Loss 0.87, Disc Loss 0.21
Epoch 200: Policy Loss 0.78, Disc Loss 0.20
Epoch 210: Policy Loss 0.69, Disc Loss 0.18
Epoch 220: Policy Loss 0.62, Disc Loss 0.15

Протестируйте ваш алгоритм

In [19]:
for episode in range(20):
    obs, _ = env.reset()
    done = False
    total_reward = 0

    t = 0
    obses = []
    while not done:
        with torch.no_grad():
            augmented_obs = np.append(obs, [np.sin(t), np.cos(t)])
            obs_tensor = torch.tensor(augmented_obs, dtype=torch.float32).unsqueeze(0)
            action = policy.get_action(obs_tensor)

            next_obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            obses.append(obs[0])
            obs = next_obs
            total_reward += reward
            t += 1
        
    print(f"Test Episode {episode+1}: Total Reward = {total_reward}")
print(f"highest point {max(obses)}")
env.close()

Test Episode 1: Total Reward = -200.0
Test Episode 2: Total Reward = -200.0
Test Episode 3: Total Reward = -200.0
Test Episode 4: Total Reward = -200.0
Test Episode 5: Total Reward = -200.0
Test Episode 6: Total Reward = -200.0
Test Episode 7: Total Reward = -200.0
Test Episode 8: Total Reward = -200.0
Test Episode 9: Total Reward = -200.0
Test Episode 10: Total Reward = -200.0
Test Episode 11: Total Reward = -200.0
Test Episode 12: Total Reward = -200.0
Test Episode 13: Total Reward = -200.0
Test Episode 14: Total Reward = -200.0
Test Episode 15: Total Reward = -200.0
Test Episode 16: Total Reward = -200.0
Test Episode 17: Total Reward = -200.0
Test Episode 18: Total Reward = -200.0
Test Episode 19: Total Reward = -200.0
Test Episode 20: Total Reward = -200.0
highest point -0.4923360049724579
