# 1. Train the model

In [1]:
import gymnasium as gym  # Updated to gymnasium
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
import os

# Test Environment
environment_name = "CarRacing-v3"

# Create environment
env = gym.make(environment_name, render_mode="human")  # Add render_mode for compatibility

episodes = 5
for episode in range(1, episodes + 1):
    state, info = env.reset()  # Updated to include info in reset
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action)  # Updated for gymnasium's step output
        done = terminated or truncated  # Combine termination conditions
        score += reward

    print(f"Episode: {episode} Score: {score}")

env.close()

# Example actions and observations
print(env.action_space.sample())
print(env.observation_space.sample())

# Train Model
log_path = os.path.join('Training', 'Logs')

# Ensure environment compatibility with Stable-Baselines3
env = gym.make(environment_name)

# Initialize the model with PPO and CNN policy
model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_path)

# Set total timesteps to a higher value for longer training duration
total_timesteps = 1000000  # Adjust this for a 4-5 hour training duration

# Train the model for the specified number of timesteps
model.learn(total_timesteps=total_timesteps)

# Save Model after training
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_Driving_model')
model.save(ppo_path)

# Evaluate and Test
evaluate_policy(model, env, n_eval_episodes=10, render=True)

env.close()

# Test the trained model
obs, info = env.reset()  
while True:
    action, _states = model.predict(obs)
    obs, rewards, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    env.render()

    if done:
        break

env.close()

# Save the model periodically during training (Optional)
# Example: Save the model every 10,000 timesteps
for i in range(0, total_timesteps, 10000):
    model.learn(total_timesteps=100000)
    model.save(f"{ppo_path}_checkpoint_{i}")
    print(f"Checkpoint saved at timestep {i}")



2024-12-25 14:06:01.727094: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735153561.856225  325115 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735153561.887162  325115 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-25 14:06:02.202904: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Episode: 1 Score: -33.7748344370866
Episode: 2 Score: -31.03448275862104
Episode: 3 Score: -41.176470588235894
Episode: 4 Score: -29.577464788732716
Episode: 5 Score: -37.88819875776463
[-0.25640416  0.4906586   0.87362874]
[[[172 139  72]
  [ 39  39 149]
  [ 15 255  11]
  ...
  [245  86 208]
  [ 70 148  20]
  [156 181 165]]

 [[ 61 208 110]
  [229 117 184]
  [165 199 193]
  ...
  [ 68  96 146]
  [ 46 220 185]
  [ 49  47   7]]

 [[ 33 226 107]
  [188  49 191]
  [251 144  52]
  ...
  [179 125  34]
  [ 64 241 197]
  [189 185 236]]

 ...

 [[ 96   7  79]
  [122  53 238]
  [200 156 105]
  ...
  [ 68  17  62]
  [185 102   7]
  [230  92 127]]

 [[216 132  91]
  [225  85 247]
  [ 95 231 238]
  ...
  [148 138  10]
  [109  58  45]
  [238 160 238]]

 [[ 18 117 219]
  [199  31 113]
  [ 72 146  82]
  ...
  [227  56 235]
  [ 38 176 144]
  [ 93 252   5]]]
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


[W1225 14:09:45.900482698 NNPACK.cpp:61] Could not initialize NNPACK! Reason: Unsupported hardware.


Logging to Training/Logs/PPO_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -52.1    |
| time/              |          |
|    fps             | 79       |
|    iterations      | 1        |
|    time_elapsed    | 25       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | -47.6       |
| time/                   |             |
|    fps                  | 50          |
|    iterations           | 2           |
|    time_elapsed         | 81          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005525681 |
|    clip_fraction        | 0.0505      |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.00273     |

  gym.logger.warn(


Logging to Training/Logs/PPO_8
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | 662      |
| time/              |          |
|    fps             | 88       |
|    iterations      | 1        |
|    time_elapsed    | 23       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1e+03       |
|    ep_rew_mean          | 772         |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 2           |
|    time_elapsed         | 66          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.059840664 |
|    clip_fraction        | 0.436       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.44       |
|    explained_variance   | 0.982       |

KeyboardInterrupt: 

In [None]:
# 2. Test the model

In [2]:
from stable_baselines3 import PPO
import gymnasium as gym
import os

# Define the environment name
environment_name = "CarRacing-v3"

# Create the environment
env = gym.make(environment_name, render_mode="human")  # Add render_mode for visualization

# Specify the path to the saved model
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_Driving_model.zip')

# Load the saved model
model = PPO.load(ppo_path)

# Test the model
obs, info = env.reset()  # Reset the environment
while True:
    # Use the model to predict actions
    action, _states = model.predict(obs)
    obs, rewards, terminated, truncated, info = env.step(action)  # Step through the environment
    done = terminated or truncated
    env.render()  # Render the environment

    if done:
        break  # Exit the loop if the episode ends

env.close()  # Close the environment
