In [3]:
pip install swig

Collecting swig
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 16.7 MB/s eta 0:00:00
Installing collected packages: swig
Successfully installed swig-4.3.0


In [4]:
!pip install wheel
!pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... done
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2379450 sha256=c444e593ebb1380215bc40b33a39e397ecc61fc6e4b565f5319f25ba0794790d
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daadb5eb9753c78fa5308
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5


In [5]:
import gymnasium as gym
import numpy as np
import random
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [11]:


# Fix the seed of every random number generator used below (Python, NumPy, PyTorch)
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Define the DQN model architecture
class DQN(nn.Module):
    """Fully connected Q-network: two 128-unit ReLU hidden layers and a linear output head.

    The output has one value per action (`action_size` columns).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        # Layer names are part of the saved state dict keys, so they stay fc1/fc2/fc3
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for input `x`."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# Define Replay Memory for experience replay
class ReplayMemory:
    """Bounded FIFO store of transitions; the oldest entries drop out once `capacity` is reached."""

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` stored transitions at random and regroup them by field.

        Returns:
            (states, actions, rewards, next_states, dones) where states and
            next_states are stacked row-wise with np.vstack and the other
            three are tuples in sample order.
        """
        batch = random.sample(self.memory, batch_size)
        state_col, action_col, reward_col, next_state_col, done_col = (
            tuple(transition[field] for transition in batch) for field in range(5)
        )
        return np.vstack(state_col), action_col, reward_col, np.vstack(next_state_col), done_col

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Define the DQN agent
class DQNAgent:
    """DQN agent: epsilon-greedy policy, replay memory and a periodically synced target network.

    The agent owns two DQN instances. `policy_net` is trained by `learn()`;
    `target_net` supplies the bootstrap values and is overwritten with the
    policy weights every `update_target_every` learning steps.
    """

    def __init__(self, state_size, action_size):
        """Build both networks, the optimizer and an empty replay memory.

        Args:
            state_size: length of one observation vector (input width of the networks).
            action_size: number of discrete actions (output width of the networks).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Define hyperparameters
        self.gamma = 0.99  # discount factor
        self.epsilon = 1.0  # exploration rate (a new agent acts fully at random)
        self.epsilon_min = 0.01  # floor that decay never goes below
        self.epsilon_decay = 0.995  # multiplicative decay applied once per learn() update
        self.learning_rate = 0.001
        self.batch_size = 64
        self.memory_size = 100000
        self.update_target_every = 100  # learn() updates between target-network syncs

        # Create Q networks (policy and target)
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # target network is only used for inference

        # Define optimizer
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)

        # Create replay memory
        self.memory = ReplayMemory(self.memory_size)

        # Initialize step counter (for target network updates)
        self.step_counter = 0

    def select_action(self, state):
        """Return an action index chosen epsilon-greedily for a single observation.

        With probability `epsilon` a uniformly random action is returned,
        otherwise the argmax of the policy network's output.
        """
        # Strict "<": np.random.rand() lies in [0, 1), so epsilon == 0 never
        # explores and epsilon == 1 always does. The former "<=" could still
        # pick a random action at epsilon == 0 when the draw was exactly 0.0.
        if np.random.rand() < self.epsilon:
            return random.randrange(self.action_size)

        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action_values = self.policy_net(state)
        return torch.argmax(action_values).item()

    def learn(self):
        """Run one optimisation step on a random minibatch from replay memory.

        Does nothing (no update, no epsilon decay, no counter increment) until
        the memory holds at least `batch_size` transitions.
        """
        # Check if memory has enough samples
        if len(self.memory) < self.batch_size:
            return

        # Sample from memory
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)

        # Convert to PyTorch tensors; the per-sample scalars become column vectors
        states = torch.FloatTensor(states).to(device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
        next_states = torch.FloatTensor(next_states).to(device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

        # Q-value the policy network assigns to the action that was actually taken
        current_q_values = self.policy_net(states).gather(1, actions)

        # Bootstrapped target: reward plus discounted best next-state value from
        # the target network; the (1 - dones) factor removes the bootstrap term
        # for transitions flagged as done.
        with torch.no_grad():
            next_q_values = self.target_net(next_states).max(1, keepdim=True)[0]
        target_q_values = rewards + (self.gamma * next_q_values * (1 - dones))

        # Compute loss
        loss = F.smooth_l1_loss(current_q_values, target_q_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1] to stabilize training
        nn.utils.clip_grad_value_(self.policy_net.parameters(), clip_value=1.0)
        self.optimizer.step()

        # Decay the exploration rate, clamped so it never drops below epsilon_min.
        # NOTE(review): this runs once per learning step, not once per episode —
        # confirm that schedule is intended.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # Update target network
        self.step_counter += 1
        if self.step_counter % self.update_target_every == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

    def save_model(self, filename="lunar_lander_dqn.pth"):
        """Write the policy network's state dict to `filename`."""
        torch.save(self.policy_net.state_dict(), filename)

    def load_model(self, filename="lunar_lander_dqn.pth"):
        """Load a state dict from `filename` into the policy network and copy it to the target network."""
        # weights_only=True restricts unpickling to tensors and plain containers,
        # which is all a state dict needs, so an untrusted checkpoint cannot
        # execute arbitrary code while being loaded.
        state_dict = torch.load(filename, map_location=torch.device('cpu'), weights_only=True)
        self.policy_net.load_state_dict(state_dict)
        self.target_net.load_state_dict(self.policy_net.state_dict())

# Function to plot training progress
def plot_scores(scores, filename=None):
    """Redraw the training curve in place of the cell's previous output.

    Plots one point per episode, a reference line at score 200 and, once at
    least 100 scores exist, their 100-episode moving average.

    Args:
        scores: sequence of per-episode total rewards.
        filename: when truthy, the figure is also written to this path.
    """
    clear_output(True)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(scores)
    ax.set_title('DQN Training Progress')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Score')

    # Reference line at score 200 (considered "solved")
    ax.axhline(y=200, color='r', linestyle='-', alpha=0.3)
    ax.text(0, 210, 'Task considered solved when score >= 200', color='r')

    # Moving average, drawn only once a full window of scores is available
    window_size = 100
    n_scores = len(scores)
    if n_scores >= window_size:
        kernel = np.ones(window_size) / window_size
        moving_avg = np.convolve(scores, kernel, mode='valid')
        # mode='valid' yields n_scores - window_size + 1 points; align them with
        # the episode that closes each window
        ax.plot(range(window_size - 1, n_scores), moving_avg, color='orange')
        ax.text(n_scores - 1, moving_avg[-1], f'Moving Avg: {moving_avg[-1]:.2f}')

    if filename:
        fig.savefig(filename)
    plt.show()

# Main training function
def train_lunar_lander(n_episodes=1500, max_steps=1000, target_score=250,
                       solve_threshold=100, render=False):
    """Train a new DQNAgent on LunarLander-v3.

    Args:
        n_episodes: number of training episodes to run.
        max_steps: upper bound on environment steps per episode.
        target_score: average score that counts as "solved".
        solve_threshold: number of most recent episodes the average is taken
            over; the solved check only starts once that many episodes exist.
        render: when True the environment is created with render_mode='human'.

    Returns:
        (agent, scores): the trained agent and the list of per-episode total rewards.

    Side effects:
        Prints progress every 10 episodes, redraws the score plot every 100
        episodes, saves the model the first time the average reaches
        `target_score`, and writes the final plot to
        "dqn_lunar_lander_training.png".
    """
    # Create environment
    env = gym.make('LunarLander-v3', render_mode='human' if render else None)
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Create agent
        agent = DQNAgent(state_size, action_size)

        # Training variables
        scores = []
        solved = False

        # Training loop
        for episode in range(n_episodes):
            state, _ = env.reset()
            score = 0

            for t in range(max_steps):
                # Select and perform an action
                action = agent.select_action(state)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # Store `terminated`, not `terminated or truncated`: a truncated
                # episode was merely cut short, so its last transition must
                # still bootstrap from next_state. learn() drops the bootstrap
                # term whenever the stored flag is true.
                agent.memory.push(state, action, reward, next_state, terminated)

                # Move to the next state
                state = next_state

                # Perform one step of the optimization
                agent.learn()

                # Update score
                score += reward

                # The episode itself ends on either signal
                if terminated or truncated:
                    break

            # Record score for this episode
            scores.append(score)

            # Average over the last `solve_threshold` episodes (or all of them
            # while fewer exist; the slice handles the short case)
            avg_score = np.mean(scores[-solve_threshold:])

            # Print progress once every 10 episodes
            if episode % 10 == 0:
                print(f"Episode {episode}/{n_episodes}, Score: {score:.2f}, Avg Score: {avg_score:.2f}, Epsilon: {agent.epsilon:.4f}")
                # Plot progress
                if episode % 100 == 0:
                    plot_scores(scores)

            # Check if environment is solved (reported and saved only the first time)
            if len(scores) >= solve_threshold and avg_score >= target_score and not solved:
                # `episode` is a 0-based index, so the episode count is episode + 1
                print(f"\nEnvironment solved in {episode + 1} episodes! Average Score: {avg_score:.2f}")
                # Save the model
                agent.save_model()
                solved = True
    finally:
        # Release the environment even if training is interrupted or raises
        env.close()

    # Plot final scores
    plot_scores(scores, "dqn_lunar_lander_training.png")

    return agent, scores

# Function to evaluate and visualize a trained agent
def evaluate_agent(agent, n_episodes=5, render=True):
    """Run the agent greedily on LunarLander-v3 and report how often it scores >= 200.

    Args:
        agent: object exposing `epsilon` and `select_action(state)`.
        n_episodes: number of evaluation episodes.
        render: when True the environment is created with render_mode='human'.

    Returns:
        Percentage (0-100) of episodes whose total reward was at least 200;
        0.0 when `n_episodes` is not positive.

    The agent's exploration rate is set to 0 for the duration of the call and
    restored afterwards, so evaluating does not disturb an agent that is still
    being trained.
    """
    if n_episodes <= 0:
        # Nothing to evaluate; avoids dividing by zero below
        return 0.0

    victories = 0
    # Create environment
    env = gym.make('LunarLander-v3', render_mode='human' if render else None)

    # Turn off exploration once, remembering the previous value
    previous_epsilon = agent.epsilon
    agent.epsilon = 0
    try:
        for episode in range(n_episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                # Select action with trained policy (no exploration)
                action = agent.select_action(state)

                # Take action. No explicit env.render() call is made here: the
                # Gymnasium docs describe 'human' mode as rendering continuously
                # without render() being called.
                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                total_reward += reward

            if total_reward >= 200:
                victories += 1

            print(f"Episode {episode+1}/{n_episodes}, Total Reward: {total_reward:.2f}")
    finally:
        # Always restore the agent and release the environment
        agent.epsilon = previous_epsilon
        env.close()

    return victories / n_episodes * 100

In [7]:


if __name__ == "__main__":
    # Train agent
    print("Starting training...")
    agent, scores = train_lunar_lander(n_episodes=1500)

    # Later cells rebuild a DQNAgent and need the network dimensions, so read
    # them from a throwaway environment and close it again straight away.
    env = gym.make('LunarLander-v3')
    try:
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
    finally:
        env.close()


  from pkg_resources import resource_stream, resource_exists
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)


In [8]:
import os

# Restore the saved weights into the existing agent, or say so when no
# checkpoint has been written yet
if not os.path.exists("lunar_lander_dqn.pth"):
    print("Model file not found! Train the model first.")
else:
    agent.load_model("lunar_lander_dqn.pth")


Model file not found! Train the model first.


In [10]:
# Build a fresh agent and restore trained weights from a checkpoint file.
# state_size / action_size are the globals set by the training cell, so that
# cell must have run first. A newly constructed DQNAgent starts with
# epsilon = 1.0; load_model only restores the network weights.
agent = DQNAgent(state_size, action_size)
agent.load_model('/content/lunar_lander_dqn.pth')  # Adjust path if needed


In [None]:
# Watch the restored agent for 5 episodes; evaluate_agent returns the
# percentage of episodes whose total reward reached 200.
evaluate_agent(agent, n_episodes=5, render=True)


Episode 1/5, Total Reward: 50.62
Episode 2/5, Total Reward: 276.17
Episode 3/5, Total Reward: 268.30
Episode 4/5, Total Reward: 286.34
Episode 5/5, Total Reward: 250.47


In [None]:
# 10-episode evaluation. evaluate_agent calls env.reset() without a seed, so
# every run of this cell sees a different set of episodes.
evaluate_agent(agent, n_episodes=10, render=True)

Episode 1/10, Total Reward: 286.81
Episode 2/10, Total Reward: 275.60
Episode 3/10, Total Reward: 254.57
Episode 4/10, Total Reward: 49.03
Episode 5/10, Total Reward: 269.58
Episode 6/10, Total Reward: 265.25
Episode 7/10, Total Reward: 254.48
Episode 8/10, Total Reward: 308.15
Episode 9/10, Total Reward: 29.93
Episode 10/10, Total Reward: 311.56


In [None]:
# Second 10-episode evaluation of the same agent (unseeded resets, so the
# episodes differ from the previous run).
evaluate_agent(agent, n_episodes=10, render=True)

Episode 1/10, Total Reward: 223.13
Episode 2/10, Total Reward: 264.46
Episode 3/10, Total Reward: 316.01
Episode 4/10, Total Reward: 299.24
Episode 5/10, Total Reward: 233.68
Episode 6/10, Total Reward: 249.46
Episode 7/10, Total Reward: 269.83
Episode 8/10, Total Reward: 287.11
Episode 9/10, Total Reward: 232.85
Episode 10/10, Total Reward: 323.04


In [None]:
# Third 10-episode evaluation of the same agent (unseeded resets, so the
# episodes differ from the previous runs).
evaluate_agent(agent, n_episodes=10, render=True)

Episode 1/10, Total Reward: 284.64
Episode 2/10, Total Reward: 56.87
Episode 3/10, Total Reward: 14.26
Episode 4/10, Total Reward: 51.45
Episode 5/10, Total Reward: 287.74
Episode 6/10, Total Reward: 265.98
Episode 7/10, Total Reward: 150.20
Episode 8/10, Total Reward: 267.95
Episode 9/10, Total Reward: 298.94
Episode 10/10, Total Reward: 227.20


In [13]:
# Keep the value evaluate_agent returns: the percentage of the 10 episodes
# whose total reward was at least 200.
accuracy = evaluate_agent(agent, n_episodes=10, render=True)
print(f"Accuracy: {accuracy:.2f}%")

Episode 1/10, Total Reward: 280.26
Episode 2/10, Total Reward: 305.16
Episode 3/10, Total Reward: 247.60
Episode 4/10, Total Reward: 252.97
Episode 5/10, Total Reward: 286.11
Episode 6/10, Total Reward: 15.45
Episode 7/10, Total Reward: 286.37
Episode 8/10, Total Reward: 311.36
Episode 9/10, Total Reward: 268.53
Episode 10/10, Total Reward: 277.06
Accuracy: 90.00%


In [12]:
# Same measurement over 100 episodes. render=True makes evaluate_agent create
# the environment with render_mode='human' for the whole run.
accuracy = evaluate_agent(agent, n_episodes=100, render=True)
print(f"Accuracy: {accuracy:.2f}%")

Episode 1/100, Total Reward: 185.97
Episode 2/100, Total Reward: 206.16
Episode 3/100, Total Reward: 255.55
Episode 4/100, Total Reward: 243.17
Episode 5/100, Total Reward: 289.27
Episode 6/100, Total Reward: 301.70
Episode 7/100, Total Reward: 154.98
Episode 8/100, Total Reward: 278.87
Episode 9/100, Total Reward: 45.83
Episode 10/100, Total Reward: 37.32
Episode 11/100, Total Reward: 263.13
Episode 12/100, Total Reward: 254.77
Episode 13/100, Total Reward: 176.46
Episode 14/100, Total Reward: 232.71
Episode 15/100, Total Reward: 261.63
Episode 16/100, Total Reward: 44.25
Episode 17/100, Total Reward: 286.35
Episode 18/100, Total Reward: 291.59
Episode 19/100, Total Reward: 253.65
Episode 20/100, Total Reward: 301.06
Episode 21/100, Total Reward: 274.52
Episode 22/100, Total Reward: 307.73
Episode 23/100, Total Reward: 324.60
Episode 24/100, Total Reward: 269.08
Episode 25/100, Total Reward: 251.06
Episode 26/100, Total Reward: 258.53
Episode 27/100, Total Reward: 272.72
Episode 28/10

In [None]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cublas_cu12-12.4.5.8-py

In [None]:
# Rebuild the agent from the checkpoint in the working directory.
# state_size / action_size are the globals set by the training cell. A newly
# constructed DQNAgent starts with epsilon = 1.0 (every action random) until
# something lowers it; load_model only restores the network weights.
agent=DQNAgent(state_size, action_size)
agent.load_model("lunar_lander_dqn.pth")

In [None]:
import os
import imageio
def evaluate_visualize_agent(agent, n_episodes=5, render=True, output_dir='/content/videos'):
    """Run the agent greedily and save every rendered frame as one combined MP4.

    Args:
        agent: object exposing `epsilon` and `select_action(state)`.
        n_episodes: number of episodes to record.
        render: unused; kept so existing calls keep working. Frames are always
            captured through render_mode='rgb_array'.
        output_dir: directory that receives "combined_episodes.mp4". It is
            created when missing and existing files in it are left alone. The
            default is the directory this function has always written to.

    The agent's exploration rate is set to 0 while recording and restored
    afterwards, so a freshly constructed agent (epsilon = 1.0) is recorded
    following its learned policy rather than acting at random.
    """
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Set up environment; 'rgb_array' makes env.render() return image frames
    env = gym.make("LunarLander-v3", render_mode='rgb_array')

    # Combined frames list
    frames = []

    # Record the greedy policy: no exploration while capturing
    previous_epsilon = agent.epsilon
    agent.epsilon = 0
    try:
        # Run multiple episodes
        for episode in range(n_episodes):
            state, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                action = agent.select_action(state)
                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += reward

                # Capture frame. A failed capture skips that frame only, but
                # KeyboardInterrupt is no longer swallowed and the cause is shown.
                try:
                    frame = env.render()
                except Exception as exc:
                    print(f"Error capturing frame for episode {episode+1}. Skipping frame. ({exc})")
                else:
                    if frame is not None:
                        frames.append(frame)

            print(f"Episode {episode+1} finished with reward: {total_reward}")
    finally:
        # Always restore the agent and release the environment
        agent.epsilon = previous_epsilon
        env.close()

    # Save all frames in one video
    if frames:
        video_path = os.path.join(output_dir, "combined_episodes.mp4")
        imageio.mimsave(video_path, frames, fps=30)
        print(f"Saved combined video at {video_path}")
    else:
        print("No frames to save.")


In [None]:
# Record 10 episodes; the function prints each episode's total reward and the
# path of the combined video it saves.
evaluate_visualize_agent(agent, n_episodes=10, render=True)

Episode 1 finished with reward: 287.84658165603423
Episode 2 finished with reward: 274.971799199413
Episode 3 finished with reward: 229.93445248200294
Episode 4 finished with reward: 48.71753943355736
Episode 5 finished with reward: 268.7542876764501
Episode 6 finished with reward: 281.1494532418045
Episode 7 finished with reward: 260.98906836214815
Episode 8 finished with reward: 21.043566370809003
Episode 9 finished with reward: 296.220292990281




Episode 10 finished with reward: 289.5145731969461
Saved combined video at /content/videos/combined_episodes.mp4


In [None]:
from IPython.display import HTML
from base64 import b64encode

# Read the recorded video inside a context manager so the file handle is
# closed as soon as the bytes are in memory.
with open('/content/videos/combined_episodes.mp4', 'rb') as video_file:
    mp4 = video_file.read()

# Embed the bytes as a base64 data URL so the player works without a file server
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f'<video width=600 controls><source src="{data_url}" type="video/mp4"></video>')