In [25]:
import gymnasium as gym
import sys
from pathlib import Path
from tqdm import tqdm
import numpy as np



root = Path.cwd()
if (root / "environments").exists():
    project_root = root
elif (root.parent / "environments").exists():
    project_root = root.parent
else:
    raise RuntimeError("Cannot locate project root containing 'environments' package'")

sys.path.insert(0, str(project_root))

from environments.war_card_game_env import WarCardGameEnv
from agents.card_game_war_agent import CardGameWarAgent, initialze_q_values, get_q_table_values,get_cards_played_index


In [14]:
gym.register(
    id="WarCardGame-v0",
    entry_point=WarCardGameEnv,
    max_episode_steps=13,
)

In [15]:
env = gym.make("WarCardGame-v0")

In [16]:
observation, info = env.reset()

In [5]:
print("Initial observation:", observation)
print("Info:", info)

Initial observation: {'player1_cards': [], 'player2_cards': [], 'round_number': 0}
Info: {'player1_played_cards': [], 'player2_played_cards': [], 'round_number': 0}


In [17]:
action = env.action_space.sample()
print("Sampled action:", env.action_space.sample())

Sampled action: 9


In [18]:
observation, reward, terminated, truncated, info = env.step(action)

In [19]:
print("Initial observation:", observation)
print("Reward", reward)
print("Terminated:", terminated)
print("Truncated:", truncated)
print("Info:", info)

Initial observation: {'player1_cards': [10], 'player2_cards': [3], 'round_number': 1}
Reward 1
Terminated: False
Truncated: False
Info: {'player1_played_cards': [10], 'player2_played_cards': [3], 'round_number': 1}


In [None]:

observation, info = env.reset()
episode_over = False
total_reward = 0
print("\nStarting new episode...\n")
print("Initial observation:", observation)
while not episode_over:

    action =  np.random.choice([card for card in range(13) if card not in observation["player1_cards"]])

    observation, reward, terminated, truncated, info = env.step(action)
    print("Observation:", observation)
    print("Reward:", reward)
    print("Terminated:", terminated)
    print("Truncated:", truncated)
    print("Info:", info)

    total_reward += reward
    episode_over = terminated or truncated

print(f"Episode finished! Total reward: {total_reward}")

print(f"{env.}")
env.close()



Starting new episode...

Initial observation: {'player1_cards': [], 'player2_cards': [], 'round_number': 0}
Observation: {'player1_cards': [2], 'player2_cards': [0], 'round_number': 1}
Reward: 1
Terminated: False
Truncated: False
Info: {'player1_played_cards': [2], 'player2_played_cards': [0], 'round_number': 1}
Observation: {'player1_cards': [2, 7], 'player2_cards': [0, 9], 'round_number': 2}
Reward: -1
Terminated: False
Truncated: False
Info: {'player1_played_cards': [2, 7], 'player2_played_cards': [0, 9], 'round_number': 2}
Observation: {'player1_cards': [0, 2, 7], 'player2_cards': [0, 6, 9], 'round_number': 3}
Reward: -1
Terminated: False
Truncated: False
Info: {'player1_played_cards': [0, 2, 7], 'player2_played_cards': [0, 6, 9], 'round_number': 3}
Observation: {'player1_cards': [0, 2, 7, 9], 'player2_cards': [0, 5, 6, 9], 'round_number': 4}
Reward: 1
Terminated: False
Truncated: False
Info: {'player1_played_cards': [0, 2, 7, 9], 'player2_played_cards': [0, 5, 6, 9], 'round_numbe

In [10]:
learning_rate = 0.01
n_episodes = 1000
start_epsilon = 1.0
epsilon_decay = start_epsilon / (n_episodes / 2)  # reduce the exploration over time
final_epsilon = 0.1
env = gym.make("WarCardGame-v0")
env.reset()



agent = CardGameWarAgent(
    env=env,
    learning_rate=learning_rate,
    initial_epsilon=start_epsilon,
    epsilon_decay=epsilon_decay,
    final_epsilon=final_epsilon,
)

In [11]:
agent.q_values

{0: array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]),
 1: array([[[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
        

In [12]:
for episode in tqdm(range(n_episodes)):
    obs, info = env.reset()
    done = False

    # play one episode
    while not done:
        action = agent.get_action(obs)
        next_obs, reward, terminated, truncated, info = env.step(action)

        # update the agent
        agent.update(obs, action, reward, terminated, next_obs)

        # update if the environment is done and the current obs
        done = terminated or truncated
        obs = next_obs

    agent.decay_epsilon()

  0%|          | 0/1000 [00:00<?, ?it/s]

Updating Q-values
Obs: {'player1_cards': [], 'player2_cards': [], 'round_number': 0}, Action: 3, Reward: -1, Terminated: False, Next_obs: {'player1_cards': [3], 'player2_cards': [4], 'round_number': 1}
Updating Q-values
Obs: {'player1_cards': [3], 'player2_cards': [4], 'round_number': 1}, Action: 11, Reward: 1, Terminated: False, Next_obs: {'player1_cards': [3, 11], 'player2_cards': [4, 8], 'round_number': 2}





IndexError: index 11 is out of bounds for axis 0 with size 11