In [None]:
#!apt-get update
#!apt-get install -y swig python3-dev

In [None]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# !pip install -r requirements.txt

In [None]:
import os
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor

# ───────────────────────────────────────────────────────────────────────────────
# PATH AND PARAMETER CONFIGURATION
# ───────────────────────────────────────────────────────────────────────────────
# Training budget (in environment steps) and number of evaluation episodes.
total_timesteps = 200_000
n_eval_episodes = 30

# Directory that holds the training monitor CSV; create it if it is missing.
logs_dir = "logs"
train_monitor_file = os.path.join(logs_dir, "train_monitor.csv")
os.makedirs(logs_dir, exist_ok=True)

# ───────────────────────────────────────────────────────────────────────────────
# 1. TRAINING WITH PPO
# ───────────────────────────────────────────────────────────────────────────────
# 1.1. Create the environment and wrap it in Monitor, which logs episode
#      statistics to train_monitor_file.
env_train = gym.make("CarRacing-v3", render_mode="rgb_array")
env_train = Monitor(env_train, filename=train_monitor_file)

# Base path used both to save the trained model here and to reload it later.
model_path = "ppo_carracing"

try:
    # 1.2. Instantiate PPO with a CNN policy (observations are images).
    model = PPO(
        policy="CnnPolicy",
        env=env_train,
        verbose=1,
        tensorboard_log=None,
        n_steps=2048,
        learning_rate=3e-4,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        ent_coef=0.0,
        clip_range=0.2,
        gae_lambda=0.95,
    )

    # 1.3. Train
    model.learn(total_timesteps=total_timesteps, progress_bar=True)

    # 1.4. Save
    model.save(model_path)
finally:
    # Always release the simulator and close the Monitor log file, even if
    # training is interrupted; otherwise the file handle and the environment's
    # native resources stay open for the rest of the session.
    env_train.close()

# Drop the in-memory model; it is reloaded from disk for evaluation.
del model

# ───────────────────────────────────────────────────────────────────────────────
# 2. TRAINING REWARD PLOT
# ───────────────────────────────────────────────────────────────────────────────
# 2.1. Read the CSV written during training. The first line of the file is
#      skipped so that the second line is used as the column header.
df_train = pd.read_csv(train_monitor_file, skiprows=1)
rewards_train = df_train["r"].to_numpy()
episodes_train = np.arange(1, len(rewards_train) + 1)

# 2.2. (Optional) Moving average; only computed once at least `window`
#      episodes are available, otherwise it is left as None.
window = 50
mov_avg = None
if len(rewards_train) >= window:
    kernel = np.ones(window) / window
    mov_avg = np.convolve(rewards_train, kernel, mode="valid")

# 2.3. Plot per-episode reward and, when available, the moving average.
fig_train, ax_train = plt.subplots(figsize=(10, 5))
ax_train.plot(
    episodes_train,
    rewards_train,
    label="Recompensa por episodio",
    color="tab:blue",
)
if mov_avg is not None:
    # mode="valid" yields len - window + 1 points, so the first averaged
    # value lines up with episode number `window`.
    ax_train.plot(
        episodes_train[window - 1:],
        mov_avg,
        label=f"Promedio móvil ({window} ep)",
        color="tab:orange",
    )
ax_train.set_xlabel("Episodio")
ax_train.set_ylabel("Recompensa total")
ax_train.set_title("CarRacing-v3: Recompensa en Entrenamiento")
ax_train.legend()
ax_train.grid(alpha=0.3)
plt.show()

# ───────────────────────────────────────────────────────────────────────────────
# 3. EVALUATION AND ITS PLOT
# ───────────────────────────────────────────────────────────────────────────────
# 3.1. Reload the saved model against a fresh environment.
eval_env = gym.make("CarRacing-v3", render_mode="rgb_array")

try:
    model = PPO.load(model_path, env=eval_env)

    # 3.2. Manual evaluation loop: run n_eval_episodes episodes with the
    #      deterministic policy and record each episode's undiscounted return.
    rewards_eval = []
    for _episode in range(n_eval_episodes):
        obs, _ = eval_env.reset()
        done = False
        total_r = 0.0

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, r, terminated, truncated, info = eval_env.step(action)
            # An episode ends on either a terminal state or a truncation.
            done = terminated or truncated
            total_r += r

        rewards_eval.append(total_r)
finally:
    # Release the simulator's resources even if evaluation fails part-way;
    # previously the environment was never closed.
    eval_env.close()

# Summary statistics consumed by the plot and the final printout.
rewards_eval = np.array(rewards_eval)
episodes_eval = np.arange(1, n_eval_episodes + 1)
mean_eval = rewards_eval.mean()
std_eval = rewards_eval.std()

# 3.3. Plot the evaluation results: one marker per episode plus a dashed
#      horizontal line at the mean return.
fig_eval, ax_eval = plt.subplots(figsize=(8, 4))
ax_eval.plot(
    episodes_eval,
    rewards_eval,
    '-o',
    color="tab:green",
    label="Recompensa episodio",
)
ax_eval.axhline(
    mean_eval,
    color="tab:red",
    linestyle="--",
    label=f"Promedio: {mean_eval:.2f} ± {std_eval:.2f}",
)
ax_eval.set_xlabel("Episodio de evaluación")
ax_eval.set_ylabel("Recompensa total")
ax_eval.set_title(f"CarRacing-v3: Evaluación en {n_eval_episodes} episodios")
ax_eval.legend()
ax_eval.grid(alpha=0.3)
plt.show()

# Text summary of the same mean and standard deviation shown in the legend.
print(f"Eval → Recompensa media: {mean_eval:.2f}; Desviación: {std_eval:.2f}")
