In [1]:
!pip install pygame

Defaulting to user installation because normal site-packages is not writeable

[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: /Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip


#### ***INVERTED PENDULUM***

In [1]:
import gym
import math
import pygame
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

In [5]:
# Seed every random number generator this notebook draws from, so that a
# Restart & Run All starts from the same weights, exploration choices and
# replay samples each time.
SEED = 42
random.seed(SEED)        # epsilon-greedy coin flips and replay-buffer sampling
np.random.seed(SEED)
torch.manual_seed(SEED)  # network weight initialisation

# Initialize Pygame
pygame.init()

# Screen dimensions (pixels)
width, height = 800, 600
# x_cart_scale converts the cart position from the observation into pixels;
# pendulum_len is the length of the drawn pole in pixels.
x_cart_scale, pendulum_len = 150, 200
cart_width, cart_height = 100, 5
y_cart = height - 100  # vertical pixel position of the cart

# Set up the display
scrn = pygame.display.set_mode((width, height))

# Create the CartPole-v1 environment
env = gym.make("CartPole-v1", render_mode="rgb_array")
env.action_space.seed(SEED)  # random exploratory actions from action_space.sample()
env.reset(seed=SEED)         # seeds the env RNG; later reset() calls without a seed keep using it

# Hyperparameters
gamma = 0.99          # discount factor applied to the bootstrapped next-state value
lr = 0.001            # learning rate handed to the optimizer
epsilon = 1.0         # initial exploration probability
epsilon_decay = 0.99  # multiplicative decay applied to epsilon after each episode
epsilon_min = 0.01    # epsilon stops decaying once it is no longer above this value
batch_size = 64       # number of transitions sampled per optimisation step
target_update = 5     # copy policy weights into the target network every N episodes

# Replay buffer (oldest transitions are dropped once 10000 are stored)
memory = deque(maxlen=10000)

# Neural network for DQN
class DQN(nn.Module):
    """Three-layer fully connected Q-network.

    Maps a batch of observations to one Q-value per action through two
    ReLU-activated hidden layers.

    Args:
        obs_dim: Length of one observation vector. When omitted it is read
            from ``env.observation_space.shape[0]`` of the module-level ``env``.
        n_actions: Number of discrete actions (size of the output layer).
            When omitted it is read from ``env.action_space.n``.
        hidden_dim: Width of both hidden layers (256 by default).
    """

    def __init__(self, obs_dim=None, n_actions=None, hidden_dim=256):
        super().__init__()
        # Fall back to the global environment only when sizes are not given,
        # so the network can also be built without an environment in scope.
        if obs_dim is None:
            obs_dim = env.observation_space.shape[0]
        if n_actions is None:
            n_actions = env.action_space.n
        self.fc1 = nn.Linear(int(obs_dim), hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, int(n_actions))

    def forward(self, x):
        """Return the Q-values for ``x``; the output layer has no activation."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Build the online (policy) network and the target network, in that order.
# The target network starts as an exact copy of the policy weights and is
# switched to eval mode.
policy_net, target_net = DQN(), DQN()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# The optimizer only receives the policy network's parameters.
optimizer = optim.Adam(params=policy_net.parameters(), lr=lr)

# Function to select action based on epsilon-greedy policy
def select_action(state, epsilon):
    """Choose an action with an epsilon-greedy rule.

    A uniform random number is drawn first. If it is below ``epsilon`` the
    action comes from ``env.action_space.sample()``. Otherwise ``state`` is
    turned into a single-row float32 batch, passed through ``policy_net``,
    and the index of the largest Q-value is returned as a Python number.
    """
    explore = random.random() < epsilon
    if explore:
        return env.action_space.sample()

    # Greedy branch: no gradients are needed just to read off the best action.
    with torch.no_grad():
        single_batch = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        best_index = policy_net(single_batch).argmax()
        return best_index.item()

# Function to optimize the model
def optimize_model():
    """Perform one DQN gradient step on a minibatch sampled from ``memory``.

    Returns immediately (doing nothing) while the replay buffer holds fewer
    than ``batch_size`` transitions. Otherwise it samples ``batch_size``
    transitions of the form ``(state, action, reward, next_state, done)``,
    forms the one-step target

        reward + gamma * max_a' Q_target(next_state, a') * (1 - done)

    and minimises the mean squared error between that target and the policy
    network's Q-value for the action actually taken.

    Reads the module-level ``memory``, ``batch_size``, ``gamma``,
    ``policy_net``, ``target_net`` and ``optimizer``; updates ``policy_net``
    in place through ``optimizer``. Returns ``None``.
    """
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Stack each field into one ndarray before handing it to torch: building
    # a tensor straight from a tuple of ndarrays converts element by element.
    states = torch.from_numpy(np.asarray(states, dtype=np.float32))
    actions = torch.from_numpy(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
    rewards = torch.from_numpy(np.asarray(rewards, dtype=np.float32))
    next_states = torch.from_numpy(np.asarray(next_states, dtype=np.float32))
    dones = torch.from_numpy(np.asarray(dones, dtype=np.float32))

    # Q(s, a) for the actions that were taken. squeeze(1) removes only the
    # gathered column, so the batch axis survives even when batch_size == 1
    # (a bare squeeze() would collapse it and make mse_loss broadcast).
    current_q_values = policy_net(states).gather(1, actions).squeeze(1)

    # The target is treated as a constant: no gradient flows into target_net.
    with torch.no_grad():
        next_q_values = target_net(next_states).max(dim=1).values
        expected_q_values = rewards + gamma * next_q_values * (1 - dones)

    loss = nn.functional.mse_loss(current_q_values, expected_q_values)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Game loop: train the agent while drawing every step in the pygame window.
# Each episode starts with its own env.reset(), so no reset is needed up front.
running = True
try:
    for episode in range(200):  # Train over 200 episodes
        observation, _ = env.reset()
        episode_reward = 0
        # 999 is only an upper bound on steps; the loop exits as soon as the
        # environment reports the episode as terminated or truncated.
        for t in range(999):
            scrn.fill((0, 0, 0))

            # Event handling
            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    running = False

            # The window was closed: stop before stepping the environment again
            if not running:
                break

            # Select action using epsilon-greedy strategy
            action = select_action(observation, epsilon)

            # Step the environment forward using the selected action
            next_observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            # Store `terminated` rather than `done`: optimize_model() drops the
            # bootstrapped next-state value when this flag is set, which is
            # only right for a real terminal state, not for a time-limit
            # truncation where the pole is still up.
            memory.append((observation, action, reward, next_observation, terminated))

            observation = next_observation
            episode_reward += reward

            # Unpack observation values for visualization
            x_cart = int(observation[0] * x_cart_scale) + width // 2  # Scale and centre the cart
            ang = observation[2]

            # Calculate pendulum position
            x_p = x_cart + pendulum_len * math.sin(ang)
            y_p = y_cart - pendulum_len * math.cos(ang)

            # Draw the pendulum and cart
            pygame.draw.line(scrn, (0, 0, 255), (x_p, y_p), (x_cart, y_cart), 2)
            pygame.draw.circle(scrn, (0, 255, 0), (int(x_p), int(y_p)), 15)
            pygame.draw.rect(scrn, (255, 0, 0), pygame.Rect(x_cart - cart_width // 2, y_cart, cart_width, cart_height))

            # Update the display
            pygame.display.update()

            # Optimize the model
            optimize_model()

            # Check if the episode is over (terminated or truncated)
            if done:
                break

            # Pause for a brief moment (adjustable)
            pygame.time.wait(50)  # in milliseconds

        # Closing the window ends training altogether, not just this episode
        if not running:
            break

        # Decay epsilon
        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        # Update target network
        if episode % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

        print(f"Episode {episode}, Reward: {episode_reward}")
finally:
    # Always release the environment and the window, including when the cell
    # is interrupted or an exception is raised mid-training.
    env.close()
    pygame.quit()

Episode 0, Reward: 42.0
Episode 1, Reward: 14.0
Episode 2, Reward: 54.0
Episode 3, Reward: 11.0
Episode 4, Reward: 11.0
Episode 5, Reward: 35.0
Episode 6, Reward: 17.0
Episode 7, Reward: 11.0
Episode 8, Reward: 29.0
Episode 9, Reward: 9.0
Episode 10, Reward: 24.0
Episode 11, Reward: 59.0
Episode 12, Reward: 20.0
Episode 13, Reward: 17.0
Episode 14, Reward: 20.0
Episode 15, Reward: 12.0
Episode 16, Reward: 15.0
Episode 17, Reward: 14.0
Episode 18, Reward: 13.0
Episode 19, Reward: 18.0
Episode 20, Reward: 18.0
Episode 21, Reward: 23.0
Episode 22, Reward: 25.0
Episode 23, Reward: 26.0
Episode 24, Reward: 15.0
Episode 25, Reward: 13.0
Episode 26, Reward: 43.0
Episode 27, Reward: 34.0
Episode 28, Reward: 16.0
Episode 29, Reward: 16.0
Episode 30, Reward: 9.0
Episode 31, Reward: 12.0
Episode 32, Reward: 62.0
Episode 33, Reward: 17.0
Episode 34, Reward: 51.0
Episode 35, Reward: 49.0
Episode 36, Reward: 27.0
Episode 37, Reward: 17.0
Episode 38, Reward: 48.0
Episode 39, Reward: 9.0
Episode 40, R

In [3]:
# Ad-hoc check: advance the environment by one more step using the `action`
# variable left in the kernel by the training cell, and display the result.
# NOTE(review): the training cell ends with env.close(), so on a fresh
# Run All this steps an already-closed environment; confirm the cell is still wanted.
env.step(action)

(array([-0.03440778,  0.38110155,  0.01173588, -0.5279972 ], dtype=float32),
 1.0,
 False,
 False,
 {})

In [6]:
# Shut pygame down by hand.
# NOTE(review): the training cell already calls pygame.quit() when it finishes;
# presumably this cell is for runs that were interrupted; confirm.
pygame.quit()