In [2]:
import sys
import os
import random
import math
import numpy as np
from copy import deepcopy
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

In [1]:
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.monitor import Monitor

In [5]:
# Make the project's sibling packages importable from this notebook.
# Each relative directory is resolved against the current working directory
# and appended (in this order) to the interpreter's module search path.
for _project_dir in ('../environment', '../model', '../src'):
    sys.path.append(os.path.abspath(_project_dir))

# Import the environment and agent
from environment_gym import BlackjackGameGym
from utilities import test_ppo_model

In [6]:
# Directory that receives the PPO logs and checkpoints written by this notebook
log_dir = "./logs/ppo/"
os.makedirs(log_dir, exist_ok=True)

# Candidate values for every tuned PPO hyperparameter.
# Key order is significant: each generated tuple lists its values in this order.
param_grid = dict(
    learning_rate=[3e-4, 1e-4, 5e-5],
    batch_size=[32, 64, 128],
    gamma=[0.95, 0.99],
    n_epochs=[10, 20],
    clip_range=[0.1, 0.2],
)

# Cartesian product of the grid: one tuple per hyperparameter configuration
param_combinations = [combo for combo in itertools.product(*param_grid.values())]

In [9]:
# Grid search over PPO hyperparameters.
#
# For every tuple in `param_combinations` this cell trains a fresh PPO agent,
# runs `test_ppo_model` on it, saves it, and remembers the configuration with
# the highest win rate. The winner's metrics are reported and the winning model
# is saved separately at the end.

# Raw environment handed to `test_ppo_model` for the final evaluation.
eval_env = BlackjackGameGym()
eval_env.reset()

# Separate Monitor-wrapped environment for the periodic EvalCallback runs, so
# the callback reads episode rewards/lengths through Monitor instead of
# evaluating on an unwrapped environment.
callback_eval_env = Monitor(BlackjackGameGym())

# Training budget per configuration (loop-invariant, so defined once)
total_timesteps = 200000

# Metrics copied from the test results of the best model so the final summary
# reports the winner's numbers, not those of the last model trained.
_reported_metrics = ("win_rate", "loss_rate", "tie_rate", "average_reward")

# Track the best model
best_model = None
best_model_params = None
best_model_results = None
best_win_rate = -1

# Train and evaluate for each combination
for i, params in enumerate(param_combinations):
    print(f"\n Training Model {i+1}/{len(param_combinations)} with params: {params}")

    # Tuple order follows the key order of `param_grid`
    learning_rate, batch_size, gamma, n_epochs, clip_range = params

    # Define model with selected hyperparameters
    model = PPO(
        policy="MlpPolicy",
        env=BlackjackGameGym(),
        device="cpu",
        verbose=0,
        learning_rate=learning_rate,
        gamma=gamma,
        batch_size=batch_size,
        n_epochs=n_epochs,
        clip_range=clip_range
    )

    # Periodic evaluation during training; the best checkpoint of this run is
    # written to its own `best_model{i}/` directory.
    # NOTE(review): `log_path` is shared by all runs, so each run overwrites the
    # previous run's evaluation log. Left unchanged in case other cells read
    # that location - consider a per-run sub-directory.
    eval_callback = EvalCallback(
        callback_eval_env,
        best_model_save_path=log_dir + f"best_model{i}/",
        log_path=log_dir + "results/",
        eval_freq=20000,
        deterministic=True,
        render=False
    )

    # Train model
    model.learn(total_timesteps=total_timesteps, callback=eval_callback)

    # Evaluate final performance
    test_results = test_ppo_model(model, eval_env, num_test_games=10000, true_count=2)

    # Print test results for this model (1-based, matching the training banner;
    # files on disk keep the 0-based index `i`)
    print(f"\n Results for Model {i+1}:")
    print(f"Hyperparameters: {params}")
    print(f"Win Rate: {test_results['win_rate'] * 100:.2f}%")
    print(f"Loss Rate: {test_results['loss_rate'] * 100:.2f}%")
    print(f"Tie Rate: {test_results['tie_rate'] * 100:.2f}%")
    print(f"Average Reward: {test_results['average_reward']:.2f}")

    # Save model
    model.save(log_dir + f"ppo_model{i}")

    # Track best model based on win rate, together with its own metrics
    if test_results["win_rate"] > best_win_rate:
        best_win_rate = test_results["win_rate"]
        best_model = model
        best_model_params = params
        best_model_results = {key: test_results[key] for key in _reported_metrics}

if best_model is not None:
    # Print best model parameters and results
    print("\n🏆 Best Model Hyperparameters (based on win rate):")
    print(f"Hyperparameters: {best_model_params}")
    print(f"Best Win Rate: {best_win_rate * 100:.2f}%")
    print(f"Loss Rate: {best_model_results['loss_rate'] * 100:.2f}%")
    print(f"Tie Rate: {best_model_results['tie_rate'] * 100:.2f}%")
    print(f"Average Reward: {best_model_results['average_reward']:.2f}")

    # Save the best model separately; create the target directory explicitly
    # rather than relying on the library to do it
    best_model_dir = "best_trained_models"
    os.makedirs(best_model_dir, exist_ok=True)
    best_model.save(os.path.join(best_model_dir, "ppo_best_model"))
    print("\n Best model saved as 'ppo_best_model'")
else:
    # Reached only when no configuration was trained (e.g. an empty grid)
    print("\n No model was trained; nothing to report or save.")


 Training Model 1/72 with params: (0.0003, 32, 0.95, 10, 0.1)




Eval num_timesteps=20000, episode_reward=-0.40 +/- 0.80
Episode length: 1.00 +/- 0.00
New best mean reward!
Eval num_timesteps=40000, episode_reward=-1.00 +/- 0.00
Episode length: 1.00 +/- 0.00
Eval num_timesteps=60000, episode_reward=1.00 +/- 0.00
Episode length: 1.20 +/- 0.40
New best mean reward!
Eval num_timesteps=80000, episode_reward=0.20 +/- 0.98
Episode length: 1.40 +/- 0.49
Eval num_timesteps=100000, episode_reward=0.20 +/- 0.98
Episode length: 1.40 +/- 0.49
Eval num_timesteps=120000, episode_reward=-0.60 +/- 0.80
Episode length: 1.60 +/- 0.80
Eval num_timesteps=140000, episode_reward=-0.60 +/- 0.80
Episode length: 1.80 +/- 0.75
Eval num_timesteps=160000, episode_reward=0.60 +/- 0.80
Episode length: 1.80 +/- 0.75
Eval num_timesteps=180000, episode_reward=0.00 +/- 0.89
Episode length: 1.00 +/- 0.00
Eval num_timesteps=200000, episode_reward=-0.20 +/- 0.75
Episode length: 1.80 +/- 1.17
Average Reward over 10000 episodes: -1.79
Win Rate: 41.47%
Loss Rate: 50.42%
Tie Rate: 8.11%

 

