# TP4, INF8225 2025, Projet


## Imports

In [1]:
# clear_output is used at the end of this cell to wipe the pip logs from the cell output.
from IPython.display import clear_output

# Install the notebook's dependencies into the running kernel's environment (%pip, not !pip).
# torch/torchvision come from the PyTorch wheel index passed via --index-url (the cu121 index).
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu121
%pip install numpy
# NOTE(review): swig is installed right before box2d, presumably because box2d needs it to build - confirm.
%pip install swig
%pip install box2d
%pip install pygame
%pip install gymnasium
%pip install "gymnasium[box2d]"
%pip install matplotlib

# Remove the installation logs printed above from the cell output.
clear_output()

In [2]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
from IPython.display import HTML
import matplotlib.animation as animation
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import torchvision.transforms as T

### Initialisation

In [3]:
# Report the CUDA setup and pick the device every model/tensor below is moved onto.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.version.cuda)
if cuda_available:
	# Asking for a device name raises when no CUDA device exists, so only query it
	# when CUDA is usable; the notebook can then still run on the CPU fallback below.
	print(torch.cuda.get_device_name(0))
device = torch.device("cuda" if cuda_available else "cpu")

True
12.1
NVIDIA GeForce GTX 1050


## Data Declaration

In [4]:
# Inspired by : https://github.com/pangyyen/carRacing-DeepRL/blob/main/ppo/ppo.ipynb

# Seed passed to env.reset() wherever a seeded reset is requested.
SEED = 42

# Two fixed 3-component actions: an all-zero action, and one whose second component is 1.0
# (presumably "full gas" - confirm against the environment's action layout).
no_action = np.array([0.0, 0.0, 0.0])
go_up = np.array([0.0, 1.0, 0.0])

# "rgb_array_list" render mode: env.render() hands back a list of frames.
# Other make() options considered but not passed: domain_randomize=False, continuous=True
env = gym.make("CarRacing-v3", render_mode="rgb_array_list")
print("Observation space: ", env.observation_space)  # (low, high, shape, dtype)
print("Action space: ", env.action_space)

observation, info = env.reset(seed=SEED)

def show_animation():
	"""Animate the frames returned by `render()` on the notebook-level `env`."""
	recorded_frames = env.render()
	show_animation_frames(recorded_frames)

def show_animation_frames(frames):
	"""Display a sequence of image frames as an inline JavaScript animation.

	Args:
		frames: Sized, indexable sequence of images that `plt.imshow` can draw.

	Raises:
		ValueError: If `frames` is empty.
	"""
	# Reject an empty sequence before any figure exists: indexing frames[0] used to raise
	# an opaque IndexError after plt.figure(), leaving that figure open.
	if len(frames) == 0:
		raise ValueError("show_animation_frames needs at least one frame")

	# Imported explicitly so the helper does not rely on IPython injecting `display` as a builtin.
	from IPython.display import display

	fig = plt.figure(figsize=(7, 5))
	plt.axis('off')
	im = plt.imshow(frames[0])

	def animate(i):
		# Reuse the single image artist and only swap its pixel data for frame i.
		im.set_data(frames[i])
		return im,

	anim = animation.FuncAnimation(fig, animate, frames=len(frames), repeat=False)
	# The figure is closed before rendering the HTML player
	# (presumably so no extra static plot appears under the cell - confirm).
	plt.close(fig)
	display(HTML(anim.to_jshtml()))

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Box([-1.  0.  0.], 1.0, (3,), float32)


## Implementation

### DQN

### PPO

In [5]:
# TODO - Remove this cell once the implementation is finished

# Smoke test: apply the fixed `go_up` action for 100 steps, then replay what was rendered.
observation, info = env.reset(seed=SEED)
for i in range(100):
	# Random-policy alternative: action = env.action_space.sample()
	observation, reward, terminated, truncated, info = env.step(go_up)
	# Debug trace:
	# print(f"Step: {i}, Reward: {reward}, Terminated: {terminated}, Truncated: {truncated}, Info: {info}")

	# Start a fresh (unseeded) episode as soon as the current one ends for either reason.
	if any((terminated, truncated)):
		observation, info = env.reset()
env.close()

show_animation()

#### Model

In [None]:
# Frame preprocessing shared by rollout, training and evaluation:
# convert to a tensor, resize to 84x84, then normalize with mean 0.5 and std 0.5.
transform = T.Compose(
	[
		T.ToTensor(),
		T.Resize(size=(84, 84)),
		T.Normalize(mean=(0.5,), std=(0.5,)),
	]
)

class Actor(nn.Module):
	"""Policy network: convolutional encoder followed by a head that outputs action means.

	The standard deviation is a learned parameter that does not depend on the input.
	"""

	def __init__(self, action_dim):
		"""Build the network.

		Args:
			action_dim: Number of action components produced for each input.
		"""
		super().__init__()

		# Three conv layers over 3-channel inputs, flattened to one feature vector per sample.
		self.cnn = nn.Sequential(
			nn.Conv2d(in_channels=3, out_channels=32, kernel_size=8, stride=4),
			nn.ReLU(),
			nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
			nn.ReLU(),
			nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
			nn.ReLU(),
			nn.Flatten(),
		)
		# 3136 = 64 channels * 7 * 7, the flattened size the conv stack yields for 84x84 inputs.
		self.fc = nn.Sequential(
			nn.Linear(in_features=3136, out_features=512),
			nn.ReLU(),
			nn.Linear(in_features=512, out_features=action_dim),
		)
		# Log of the standard deviation, one entry per action component; zeros means std starts at 1.
		self.log_std = nn.Parameter(torch.zeros(action_dim))

	def forward(self, x):
		"""Return `(mean, std)`: the per-sample action means and the shared standard deviation."""
		features = self.cnn(x)
		return self.fc(features), torch.exp(self.log_std)
	
class Critic(nn.Module):
	"""Value network: convolutional encoder followed by a head that outputs one scalar per input."""

	def __init__(self):
		super().__init__()

		# Same convolutional layout as the actor, with its own independent weights.
		self.cnn = nn.Sequential(
			nn.Conv2d(in_channels=3, out_channels=32, kernel_size=8, stride=4),
			nn.ReLU(),
			nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
			nn.ReLU(),
			nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
			nn.ReLU(),
			nn.Flatten(),
		)
		# 3136 = 64 channels * 7 * 7, the flattened size the conv stack yields for 84x84 inputs.
		self.fc = nn.Sequential(
			nn.Linear(in_features=3136, out_features=512),
			nn.ReLU(),
			nn.Linear(in_features=512, out_features=1),
		)

	def forward(self, x):
		"""Return the value estimate for each input, with a trailing dimension of size 1."""
		features = self.cnn(x)
		value = self.fc(features)
		return value

# class PPO:
# 	def __init__(self, env, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.6):
# 		self.env = env
# 		self.obs_dim = env.observation_space.shape[0]
# 		self.act_dim = env.action_space.shape[0]

class PPO:
	"""Clipped-surrogate PPO agent with separate actor and critic networks.

	Depends on the notebook-level `Actor`, `Critic`, `transform` and `device`.
	"""

	def __init__(self, env, lr=3e-4, gamma=0.99, eps_clip=0.2, K_epochs=4, vf_coef=0.5, ent_coef=0.01):
		"""Create the networks and a single Adam optimizer over both of them.

		Args:
			env: Environment; only `env.action_space.shape[0]` is read here.
			lr: Learning rate of the shared optimizer.
			gamma: Discount factor used by `compute_returns`.
			eps_clip: Clipping range of the probability ratio in `update`.
			K_epochs: Number of optimisation passes over one batch in `update`.
			vf_coef: Weight of the critic (value) loss in the total loss.
			ent_coef: Weight of the entropy bonus in the total loss.
		"""
		self.env = env
		self.action_dim = env.action_space.shape[0]
		self.actor = Actor(self.action_dim).to(device)
		self.critic = Critic().to(device)
		self.optimizer = optim.Adam(list(self.actor.parameters()) + list(self.critic.parameters()), lr=lr)
		self.gamma = gamma
		self.eps_clip = eps_clip
		self.K_epochs = K_epochs
		self.vf_coef = vf_coef
		self.ent_coef = ent_coef

	def select_action(self, state):
		"""Sample an action for one raw observation.

		Args:
			state: Raw observation accepted by the notebook-level `transform`.

		Returns:
			Tuple `(action, log_prob)`: the sampled action as a NumPy array clamped to
			[-1, 1], and the log-probability (float) of that clamped action.
		"""
		state = transform(state).unsqueeze(0).to(device)
		# Rollout only: no gradients are needed, so skip building the autograd graph.
		with torch.no_grad():
			mean, std = self.actor(state)
			dist = Normal(mean, std)
			# Clamp BEFORE taking the log-prob. The clamped action is what the caller stores
			# and what `evaluate` later re-scores, so the stored log-prob must describe the
			# same action; otherwise the PPO ratio is already off before the first update.
			# NOTE(review): the bounds stay at [-1, 1] as in the original code, although the
			# action space printed earlier has lower bound 0 for two components - confirm.
			action = dist.sample().clamp(-1, 1)
			log_prob = dist.log_prob(action).sum(-1)
		return action.cpu().numpy()[0], log_prob.item()

	def evaluate(self, states, actions):
		"""Score stored actions under the current policy and value the states.

		Args:
			states: Batch of preprocessed states (assumed batch-first - TODO confirm).
			actions: Batch of actions, one per state.

		Returns:
			Tuple `(log_probs, values, entropy)`, each reduced to one entry per sample.
		"""
		mean, std = self.actor(states)
		dist = Normal(mean, std)
		log_probs = dist.log_prob(actions).sum(-1)
		entropy = dist.entropy().sum(-1)
		values = self.critic(states).squeeze(-1)
		return log_probs, values, entropy

	def compute_returns(self, rewards, dones, next_value):
		"""Compute discounted returns backwards through one trajectory.

		Args:
			rewards: Per-step rewards, in chronological order.
			dones: Per-step flags; a truthy flag stops bootstrapping past that step.
			next_value: Value estimate used to bootstrap after the last step.

		Returns:
			List of returns in chronological order, one per reward.
		"""
		returns = []
		running_return = next_value
		for reward, done in zip(reversed(rewards), reversed(dones)):
			running_return = reward + self.gamma * running_return * (1 - done)
			# Append and reverse once at the end: linear time instead of the quadratic
			# cost of inserting at the front of the list on every step.
			returns.append(running_return)
		returns.reverse()
		return returns

	def update(self, memory):
		"""Run `K_epochs` optimisation passes on one collected trajectory.

		Args:
			memory: Dict with keys 'states' and 'actions' (lists of tensors), and
				'log_probs' and 'returns' (lists of numbers), all of equal length.
		"""
		states = torch.stack(memory['states']).to(device)
		actions = torch.stack(memory['actions']).to(device)
		# Force float32 so NumPy float64 scalars in the lists cannot cause a
		# Double/Float dtype mismatch against the float32 network outputs.
		old_log_probs = torch.tensor(memory['log_probs'], dtype=torch.float32, device=device)
		returns = torch.tensor(memory['returns'], dtype=torch.float32, device=device)
		value_loss_fn = nn.MSELoss()

		for _ in range(self.K_epochs):
			log_probs, values, entropy = self.evaluate(states, actions)
			# Detach so the advantage acts as a constant weight in the actor loss.
			advantages = returns - values.detach()

			ratio = (log_probs - old_log_probs).exp()
			surr1 = ratio * advantages
			surr2 = torch.clamp(ratio, 1 - self.eps_clip, 1 + self.eps_clip) * advantages
			actor_loss = -torch.min(surr1, surr2).mean()
			critic_loss = value_loss_fn(values, returns)
			loss = actor_loss + self.vf_coef * critic_loss - self.ent_coef * entropy.mean()

			self.optimizer.zero_grad()
			loss.backward()
			self.optimizer.step()

In [None]:
# Seed torch so weight initialisation and action sampling are repeatable across runs.
torch.manual_seed(SEED)
observation, info = env.reset(seed=SEED)

max_episodes = 30
max_timesteps = 200

ppo = PPO(env)
all_rewards = []  # total reward of every episode, in order
frames = []       # every observation seen during training, replayed at the end

for ep in range(max_episodes):
	state, _ = env.reset()
	memory = {'states': [], 'actions': [], 'log_probs': [], 'rewards': [], 'dones': []}
	total_reward = 0

	for _ in range(max_timesteps):
		state_t = transform(state).to(device)
		memory['states'].append(state_t)

		action, log_prob = ppo.select_action(state)
		next_state, reward, done, trunc, _ = env.step(action)

		# torch.tensor copies the action, so the stored tensor never aliases the NumPy array.
		memory['actions'].append(torch.tensor(action, dtype=torch.float32).to(device))
		memory['log_probs'].append(log_prob)
		memory['rewards'].append(reward)
		# Only `done` (termination) is stored: a truncated episode still bootstraps
		# from the critic's estimate of the last state in compute_returns.
		memory['dones'].append(done)
		frames.append(next_state)

		state = next_state
		total_reward += reward

		if done or trunc:
			break

	# Value of the state the rollout stopped in, used to bootstrap the returns.
	with torch.no_grad():
		next_state_t = transform(state).unsqueeze(0).to(device)
		next_value = ppo.critic(next_state_t).item()
	memory['returns'] = ppo.compute_returns(memory['rewards'], memory['dones'], next_value)

	ppo.update(memory)
	# Record the episode score; the list was previously created but never filled.
	all_rewards.append(total_reward)
	print(f"Episode {ep+1} - Reward: {total_reward:.1f}")

env.close()

show_animation_frames(frames)

3
Episode 1 - Reward: -6.9


In [13]:
state, _ = env.reset(seed=SEED)
total_reward = 0

# Deterministic rollout: act with the policy mean instead of sampling from the distribution.
ppo.actor.eval()
#while True:
for i in range(100):
	with torch.no_grad():
		state_tensor = transform(state).unsqueeze(0).to(device)
		mean, _ = ppo.actor(state_tensor)

	# Drop the batch dimension, move to NumPy and keep every component within [-1, 1].
	action = np.clip(mean.squeeze(0).cpu().numpy(), -1, 1)
	state, reward, done, trunc, _ = env.step(action)
	total_reward += reward

	# Stop at the first termination or truncation.
	if trunc or done:
		break

env.close()
show_animation()
print(f"Test reward: {total_reward:.2f}")

Test reward: 7.67
