In [None]:
# Basic needs
import gymnasium as gym
from external.pydominion.dominion.Game import Game
import sys
# For maskable PPO
from sb3_contrib.ppo_mask import MaskablePPO
from sb3_contrib.common.wrappers import ActionMasker
# For DQN
from stable_baselines3 import DQN

# For recursion error: colorama was causing issues, don't worry about this
#import sys
#import os
#os.environ['CLICOLOR'] = '0'

# Outstanding Considerations

**Environment Questions**
- How long is a good limit for number of turns? An average game lasts 15-40 turns
- For our action space, the number of valid actions seems to be a max of 23... should we change?

**Results we want from training an agent**
- A graph of Reward vs Iteration for each player
- Some sort of graphs to show agent training, such as loss
- See what the deck looks like at the end of a game

**For training timestep values:**
- Quick test: 25,000
- Short trainings (learn basic moves maybe): 100,000
- Intermediate (start learning some strategy): 500,000
- Full Training (needed for decent gameplay): 1,000,000

# Play the game with agent
User Input
- Change `model_dir` as needed to the location where the model was saved. Should be log_dir + model_name from the `run_masked_ppo.py` script
- If needed, change the cell with the comment "Load the model" to work for masked_ppo vs dqn
- If needed, change the cell with the comment "Play the game!" to work for masked_ppo vs dqn

In [None]:
# Path of the saved model to load in the cells below (passed to `MaskablePPO.load`)
model_dir = "logs/masked_ppo_5/ppo_masked_dominion"

In [None]:
# Make the "Dominion-v1" id known to gymnasium; the entry point is the
# DominionEnv class inside the DominionEnv module
gym.register(id="Dominion-v1", entry_point="DominionEnv:DominionEnv")

In [None]:
# Build a two-player Dominion environment restricted to a fixed set of ten cards
env = gym.make(
    "Dominion-v1",
    num_players=2,
    card_set=[
        "Cellar", "Market", "Militia", "Mine", "Moat",
        "Remodel", "Smithy", "Village", "Throne Room", "Workshop",
    ],
    quiet_flag=False,
    debug_flag=False,
)

In [None]:
# Load the model (switch as needed)
# Restores a trained masked-PPO agent from the path stored in `model_dir`
model = MaskablePPO.load(model_dir)
# DQN alternative: uncomment this line (and comment the one above) for a DQN model
#model = DQN.load(model_dir)

In [None]:
# Play the game!
# Runs a single episode: keep stepping until the env reports it ended
obs, _ = env.reset()
done = False
while not done:
    #action, _ = model.predict(obs) # Use for DQN
    # Masked PPO needs the current mask of legal actions on every prediction
    action_mask = env.unwrapped.get_action_mask()
    action, _ = model.predict(obs, action_masks=action_mask) # Use for masked_ppo
    #print(f"Predicted action: {action}") # Use for debugging
    obs, reward, terminated, truncated, info = env.step(action)

    # Stop on either a natural end (terminated) or a cut-off (truncated)
    done = terminated or truncated

# Train with Maskable PPO
See documentation [here](https://sb3-contrib.readthedocs.io/en/master/modules/ppo_mask.html)

- One timestep = one call to "step" function
- One episode = one game played
- Since PPO runs in chunks (2048 timesteps per iteration), 2048 (chunk size) / 583 (timesteps it takes to run one episode) ≈ 3.5 episodes played per chunk

User input:
- NONE

This is just a testing ground if you need it

In [None]:
# Register the Dominion environment only if it is not registered yet.
# gymnasium logs an "overriding environment" warning when an id that is already
# in the registry is registered again, and this notebook registers the same id
# in several cells.
if "Dominion-v1" not in gym.registry:
    gym.register(
        id="Dominion-v1",
        entry_point="DominionEnv:DominionEnv"
    )

# Make environment space
env = gym.make("Dominion-v1", 
         num_players=2, 
         card_set=["Cellar", "Market", "Militia", "Mine", "Moat", 
                   "Remodel", "Smithy", "Village", "Throne Room", "Workshop"],
         quiet_flag=True,
         debug_flag = True
        )

# Mask the environment so that it only includes valid action choices.
# The mask is read through `.unwrapped` because gym.make returns the env inside
# wrappers -- presumably those do not expose get_action_mask directly; verify.
# The lambda parameter has its own name so it does not shadow the outer `env`.
env = ActionMasker(env, lambda wrapped_env: wrapped_env.unwrapped.get_action_mask())

In [None]:
# Create a fresh masked-PPO agent with an MLP policy on the masked environment
model = MaskablePPO(
    "MlpPolicy",
    env,
    verbose=1,
)

# Short training run (10,000 timesteps) without resetting the timestep counter
model.learn(
    total_timesteps=10_000,
    reset_num_timesteps=False,
)

# Train with Regular DQN
See documentation [here](https://stable-baselines3.readthedocs.io/en/master/modules/dqn.html)

**User input**:
- NONE

This is just a testing ground if you need it

In [None]:
# Register the Dominion environment only if it is not registered yet.
# gymnasium logs an "overriding environment" warning when an id that is already
# in the registry is registered again, and this notebook registers the same id
# in several cells.
if "Dominion-v1" not in gym.registry:
    gym.register(
        id="Dominion-v1",
        entry_point="DominionEnv:DominionEnv"
    )

# Make environment space (no ActionMasker here: this env is used for regular DQN)
env = gym.make("Dominion-v1", 
         num_players=2, 
         card_set=["Cellar", "Market", "Militia", "Mine", "Moat", 
                   "Remodel", "Smithy", "Village", "Throne Room", "Workshop"],
         quiet_flag=True,
         debug_flag = True
        )

In [None]:
# Send everything printed during training (stdout and stderr) to a log file.
# The original streams are remembered and restored in `finally`, so later prints
# and later cells still write to the notebook even if training raises. Without
# the restore, printing after the file is closed fails with
# "ValueError: I/O operation on closed file" and the model is never saved.
original_stdout, original_stderr = sys.stdout, sys.stderr
with open("logs/DQN_training_output1.log", "w") as log_file:
    sys.stdout = log_file
    sys.stderr = log_file
    try:
        # Train the model
        model = DQN("MlpPolicy", env, verbose=0)
        model.learn(total_timesteps=250000, reset_num_timesteps=False)
        print("Done learning")  # written to the log file
    finally:
        sys.stdout, sys.stderr = original_stdout, original_stderr
print("Closed output file")

# Save the model
# NOTE(review): the file name says "cartpole" although this trains on the
# Dominion env -- kept unchanged in case other code loads this path; confirm.
model.save("dqn_cartpole")
print("Saved model")

# Test anything within pydominion as needed
**User input:**
- NONE

This is just a testing ground if you need it

In [None]:
# Build a pydominion Game directly (bypassing the gym environment): two players,
# the same ten cards used above, and the optional flags all switched off
game = Game(
    numplayers=2,
    initcards=[
        "Cellar", "Market", "Militia", "Mine", "Moat",
        "Remodel", "Smithy", "Village", "Throne Room", "Workshop",
    ],
    validate_only=False,
    prosperity=False,
    potions=False,
    shelters=False,
    card_path="external/pydominion/dominion/cards",
)

In [None]:
# Start game to have something
# (i.e. so there is live game state to inspect below)
game.start_game()
# Get the options, usually helpful to look into
# Calls the private `_choice_selection` helper on the current player.
# NOTE(review): the structure of the returned value is not visible here --
# inspect `options` to confirm.
options = game.current_player._choice_selection()