## Jon's Base Code

In [13]:
from enum import Enum
from abc import ABC, abstractmethod
from typing import Tuple, Dict, Any, List
import numpy as np
class WellState(Enum):
    UNKNOWN = 0
    MISS = 1
    HIT = 2

ship_schema = {
    "carrier":    {"length": 5, "count": 1},
    "battleship": {"length": 4, "count": 1},
    "submarine":  {"length": 3, "count": 1},
    "destroyer":  {"length": 2, "count": 2},
}

class BattleshipAI(ABC):
    """
    Abstract Base Class for a Battleship AI.
    Students will inherit from this class to create their own AI implementation.
    """

    def __init__(self, player_id: str, board_shape: Tuple[int, int], ship_schema: Dict[str, Any]):
        """
        Initializes the AI.

        Parameters
        ----------
        player_id : str
            A unique identifier for the player (e.g., 'player_1').
        board_shape : Tuple[int, int]
            The dimensions of the game board (rows, columns).
        ship_schema : Dict[str, Any]
            A dictionary describing the ships to be sunk (lengths and counts).
        """
        self.player_id = player_id
        self.board_shape = board_shape
        self.ship_schema = ship_schema
        self.board_state = np.full(board_shape, WellState.UNKNOWN, dtype=WellState)

    @abstractmethod
    def select_next_move(self) -> Tuple[int, int]:
        """
        Determine the next well to target. This is the core method students must implement.

        The method should return a tuple of (row, column) for the next shot.
        The AI should not target wells that have already been fired upon.

        Returns
        -------
        Tuple[int, int]
            The (row, column) coordinates for the next missile strike.
        """
        pass

    def record_shot_result(self, move: Tuple[int, int], result: WellState) -> None:
        """
        Updates the AI's internal board state with the result of a shot.

        Parameters
        ----------
        move : Tuple[int, int]
            The (row, column) of the shot.
        result : WellState
            The result of the shot (HIT or MISS).
        """
        row, col = move
        if self.board_state[row, col] == WellState.UNKNOWN:
            self.board_state[row, col] = result
        else:
            print(f"Warning ({self.player_id}): Attempted to record a result for an already targeted well {move}.")

    def has_won(self) -> bool:
        """
        Checks if the AI has won the game.

        Returns
        -------
        bool
            True if all opponent ships are sunk, False otherwise.
        """
        total_ship_segments = sum(ship['length'] * ship['count'] for ship in self.ship_schema.values())
        current_hits = np.sum(self.board_state == WellState.HIT)
        #print(f"Player {self.player_id} has {current_hits} hits out of {total_ship_segments} total ship segments.")
        return current_hits >= total_ship_segments

class PlacementAI(ABC):
    """Base class for ship placement algorithms."""

    def __init__(self, board_shape: Tuple[int, int], ship_schema: Dict[str, Any]):
        self.board_shape = board_shape
        self.ship_schema = ship_schema

    @abstractmethod
    def generate_placement(self) -> List[Dict[str, Any]]:
        """Return a placement schema for all ships."""
        raise NotImplementedError


## Code

In [35]:
import random
import numpy as np
from collections import deque, namedtuple, Counter
from itertools import product
from typing import Tuple, Dict, Any, List
import torch
import torch.nn as nn
import torch.optim as optim
from enum import Enum
from tqdm import tqdm

# --- WellState enum ---
class WellState(Enum):
    UNKNOWN = 0
    MISS    = 1
    HIT     = 2

# --- Neural network definitions ---
class QNetwork(nn.Module):
    """
    ConvNet + FC for Q-values over board positions.
    Input: one-hot channels [UNKNOWN, HIT, MISS] of shape (3, H, W)
    Output: Q-values of shape (H*W,)
    """
    def __init__(self, height: int, width: int):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.ReLU(),
        )
        conv_out_size = 64 * height * width
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512), nn.ReLU(),
            nn.Linear(512, height * width)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = self.conv(x)
        h = h.view(h.size(0), -1)
        return self.fc(h)

class PlacementNetwork(nn.Module):
    """
    Network to output placement mask for all ships at once.
    Outputs logits for each ship-channel over H*W cells, plus orientation logits.
    """
    def __init__(self, height: int, width: int, num_ship_types: int):
        super().__init__()
        self.height = height
        self.width = width
        self.num_ship_types = num_ship_types
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1), nn.ReLU(),
        )
        sz = 32 * height * width
        self.fc_mask = nn.Linear(sz, num_ship_types * height * width)
        self.fc_orient = nn.Linear(sz, num_ship_types)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        h = self.conv(x)
        h = h.view(h.size(0), -1)
        mask_logits = self.fc_mask(h)
        orient_logits = self.fc_orient(h)
        mask_logits = mask_logits.view(-1, self.num_ship_types, self.height, self.width)
        return mask_logits, orient_logits

# --- Replay buffer ---
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state', 'done'))

class ReplayBuffer:
    def __init__(self, capacity: int):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size: int):
        batch = random.sample(self.buffer, batch_size)
        return Transition(*zip(*batch))

    def __len__(self):
        return len(self.buffer)

# --- Agent integrating both shooting and placement ---
class BattleshipDQNAgent(BattleshipAI, PlacementAI):
    def __init__(self,
                 player_id: str,
                 board_shape: Tuple[int,int],
                 ship_schema: Dict[str,Any],
                 device: torch.device = torch.device('cpu')):
        BattleshipAI.__init__(self, player_id, board_shape, ship_schema)
        PlacementAI.__init__(self, board_shape, ship_schema)
        H, W = board_shape
        self.device = device
        self.num_ship_types = len(ship_schema)
        self.q_net = QNetwork(H, W).to(device)
        self.target_q = QNetwork(H, W).to(device)
        self.placement_net = PlacementNetwork(H, W, self.num_ship_types).to(device)
        self.target_q.load_state_dict(self.q_net.state_dict())
        self.q_optimizer = optim.Adam(self.q_net.parameters(), lr=1e-4)
        self.pl_optimizer = optim.Adam(self.placement_net.parameters(), lr=1e-4)
        self.shoot_buffer = ReplayBuffer(100000)
        self.place_buffer = ReplayBuffer(100000)

    def _encode_board(self) -> np.ndarray:
        H, W = self.board_shape
        state = np.zeros((3, H, W), dtype=np.float32)
        for r, c in product(range(H), range(W)):
            val = self.board_state[r, c].value
            state[val, r, c] = 1.0
        return state

    def select_next_move(self, epsilon: float) -> Tuple[int,int]:
        flat_state = self._encode_board()
        if random.random() < epsilon:
            unk = np.argwhere(self.board_state == WellState.UNKNOWN)
            return tuple(unk[random.randrange(len(unk))])
        qs = self.q_net(torch.tensor(flat_state, dtype=torch.float32, device=self.device).unsqueeze(0))
        q_vals = qs.detach().cpu().view(-1)
        q_vals[self.board_state.flatten() != WellState.UNKNOWN] = -float('inf')
        idx = torch.argmax(q_vals).item()
        return divmod(idx, self.board_shape[1])

    def record_shot_result(self, move: Tuple[int,int], result: WellState) -> None:
        super().record_shot_result(move, result)

    def generate_placement(self) -> List[Dict[str,Any]]:
        # Random sequential placement to ensure valid initial board
        H, W = self.board_shape
        placements = []
        occupied = set()
        for ship_name, specs in self.ship_schema.items():
            length, count = specs['length'], specs['count']
            for _ in range(count):
                while True:
                    ori = random.choice(['H', 'V'])
                    if ori == 'H':
                        r = random.randrange(0, H)
                        c = random.randrange(0, W - length + 1)
                    else:
                        r = random.randrange(0, H - length + 1)
                        c = random.randrange(0, W)
                    coords = [(r + (i if ori=='V' else 0), c + (i if ori=='H' else 0)) for i in range(length)]
                    if all((rr, cc) not in occupied for rr, cc in coords):
                        for coord in coords:
                            occupied.add(coord)
                        placements.append({
                            'ship': ship_name,
                            'row': r,
                            'col': c,
                            'orientation': ori,
                            'length': length
                        })
                        break
        return placements

# --- Trainer ---
class BattleshipTrainer:
    def __init__(self,
                 agent: BattleshipDQNAgent,
                 env_factory,
                 max_turns: int = 100,
                 sink_bonus: float = 5.0,
                 win_bonus: float = 20.0,
                 loss_penalty: float = -10.0,
                 invalid_penalty: float = -5.0):
        self.agent = agent
        self.env_factory = env_factory
        self.max_turns = max_turns
        self.sink_bonus = sink_bonus
        self.win_bonus = win_bonus
        self.loss_penalty = loss_penalty
        self.invalid_penalty = invalid_penalty
        self.gamma = 0.99
        self.batch_size = 64
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 1e-5
        self.update_target_every = 1000
        self.step_count = 0

    def train(self, num_episodes: int):
        for ep in tqdm(range(num_episodes), desc="Training Episodes"):
            env = self.env_factory(self.agent)
            state = env.reset()
            placements = self.agent.generate_placement()
            valid = env.place_ships(placements)
            if not valid:
                continue
            for _ in tqdm(range(self.max_turns), desc=f"Episode {ep}", leave=False):
                action = self.agent.select_next_move(self.epsilon)
                next_state, raw_reward, done, info = env.step(action)
                reward = 3.0 if raw_reward == WellState.HIT else -0.5
                if info.get('sunk'):
                    reward += self.sink_bonus
                if done:
                    reward += self.win_bonus if info.get('winner')==self.agent.player_id else self.loss_penalty
                self.agent.shoot_buffer.push(state, action, reward, next_state, done)
                state = next_state
                self._optimize_q()
                self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
                self.step_count += 1
                if self.step_count % self.update_target_every == 0:
                    self.agent.target_q.load_state_dict(self.agent.q_net.state_dict())
                if done:
                    break

    def evaluate(self, num_episodes: int) -> Dict[str, Any]:
        results = {'wins': 0, 'losses': 0, 'turns_to_win': []}
        for _ in range(num_episodes):
            env = self.env_factory(self.agent)
            env.reset()
            for turn in range(1, self.max_turns + 1):
                action = self.agent.select_next_move(0.0)
                _, _, done, info = env.step(action)
                if done:
                    if info.get('winner') == self.agent.player_id:
                        results['wins'] += 1
                        results['turns_to_win'].append(turn)
                    else:
                        results['losses'] += 1
                    break
        total = num_episodes
        return {
            'win_rate': results['wins']/total,
            'avg_turns_to_win': np.mean(results['turns_to_win']) if results['turns_to_win'] else None,
            'turns_histogram': Counter(results['turns_to_win'])
        }

    def _optimize_q(self):
        if len(self.agent.shoot_buffer) < self.batch_size:
            return
        trans = self.agent.shoot_buffer.sample(self.batch_size)
        pass

In [37]:
class BattleshipEnvironment:
    """
    Self-play environment for two BattleshipDQNAgent instances sharing weights.

    Agents take turns firing until one sinks all opponent ships or max_turns reached.

    Methods:
    - reset(): clears boards, places ships for both agents, returns initial observation for player1.
    - place_ships(placements1, placements2=None): applies placements lists to each agent.
    - step(action): current player fires at action; returns (obs, reward, done, info).
    """
    def __init__(self, agent: BattleshipDQNAgent, max_turns: int = 100):
        # Prototype agent becomes player1
        self.agent1 = agent
        # Clone board_state and ship information for player2 but share networks
        self.agent2 = BattleshipDQNAgent(
            player_id="player_2",
            board_shape=agent.board_shape,
            ship_schema=agent.ship_schema,
            device=agent.device
        )
        # Share network weights
        self.agent2.q_net      = self.agent1.q_net
        self.agent2.target_q   = self.agent1.target_q
        self.agent2.placement_net = self.agent1.placement_net
        self.max_turns = max_turns
        # Placeholders for ship positions: dict player -> list of coords per ship
        self._positions: Dict[str, List[List[Tuple[int,int]]]] = {}
        self.current_player = None
        self.turns = 0

    def reset(self) -> np.ndarray:
        # Reset board_state for both
        shape = self.agent1.board_shape
        self.agent1.board_state = np.full(shape, WellState.UNKNOWN, dtype=WellState)
        self.agent2.board_state = np.full(shape, WellState.UNKNOWN, dtype=WellState)
        # Place ships using each agent's policy
        p1 = self.agent1.generate_placement()
        p2 = self.agent2.generate_placement()
        valid = self.place_ships(p1, p2)
        if not valid:
            raise RuntimeError("Invalid initial ship placements")
        # Start with player1
        self.current_player = self.agent1
        self.turns = 0
        return self._encode_board(self.current_player)

    def place_ships(self,
                    placements1: List[Dict[str, Any]],
                    placements2: List[Dict[str, Any]] = None) -> bool:
        # Clear old positions
        self._positions = {}
        # Place player1's ships onto player2's hidden board
        self._positions[self.agent1.player_id] = []
        ok1 = self._apply_placement(self.agent1, placements1,
                                     self._positions[self.agent1.player_id])
        # Player2 placements: if not provided, sample again
        if placements2 is None:
            placements2 = self.agent2.generate_placement()
        self._positions[self.agent2.player_id] = []
        ok2 = self._apply_placement(self.agent2, placements2,
                                     self._positions[self.agent2.player_id])
        return ok1 and ok2

    def _apply_placement(self, agent, placements, out_list) -> bool:
        """
        Decode each placement dict {ship, row, col, orientation, length} into coordinate lists.
        Append each to out_list. Return False on overlap or out-of-bounds.
        """
        occupied = set()
        H, W = agent.board_shape
        for pl in placements:
            r, c = pl['row'], pl['col']
            ori = pl['orientation']  # 'H' or 'V'
            length = pl['length']
            coords = []
            for i in range(length):
                rr = r + (i if ori=='V' else 0)
                cc = c + (i if ori=='H' else 0)
                if rr<0 or rr>=H or cc<0 or cc>=W or (rr,cc) in occupied:
                    return False
                coords.append((rr,cc))
            for coord in coords:
                occupied.add(coord)
            out_list.append(coords)
        return True

    def step(self, action: Tuple[int,int]) -> Tuple[np.ndarray, float, bool, Dict[str,Any]]:
        """
        Execute current_player firing at 'action'.
        Returns (next_observation, reward, done, info).
        info may contain 'sunk':True/False, 'winner':player_id.
        """
        attacker = self.current_player
        defender = self.agent2 if attacker is self.agent1 else self.agent1
        # Check hit/miss
        hit = False; sunk = False
        for ship_coords in self._positions[defender.player_id]:
            if action in ship_coords:
                hit = True
                ship_coords.remove(action)
                if not ship_coords:
                    sunk = True
                break
        result = WellState.HIT if hit else WellState.MISS
        attacker.record_shot_result(action, result)
        # Compute reward externally; here return raw result
        raw_reward = result
        # Check win
        done = all(len(s)==0 for s in self._positions[defender.player_id])
        info = {'sunk': sunk}
        if done:
            info['winner'] = attacker.player_id
        # Prepare next turn
        self.turns += 1
        if not done and self.turns >= self.max_turns:
            done = True
            info['winner'] = None  # tie/loss
        # Swap current player
        self.current_player = defender
        next_obs = self._encode_board(self.current_player)
        return next_obs, raw_reward, done, info

    def _encode_board(self, agent) -> np.ndarray:
        """
        Return flat vector [UNKNOWN,HIT,MISS] one-hot flattened.
        """
        H, W = agent.board_shape
        state = np.zeros((3, H, W), dtype=np.float32)
        for r in range(H):
            for c in range(W):
                val = agent.board_state[r, c].value
                state[val, r, c] = 1.0
        return state

# Factory to use with Trainer
from functools import partial
env_factory = lambda agent: BattleshipEnvironment(agent)


In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
agent = BattleshipDQNAgent("player_1", (8,11), ship_schema, device)
trainer = BattleshipTrainer(agent, env_factory)
trainer.train(20)

Training Episodes:   0%|                                                                        | 0/20 [00:00<?, ?it/s]
Episode 0:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
                                                                                                                       [A




Episode 1:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
Training Episodes:  10%|██████▍                                                         | 2/20 [00:00<00:01, 14.99it/s][A




Episode 2:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
                                                                                                                       [A




Episode 3:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
Training Episodes:  20%|████████████▊                                                   | 4/20 [00:00<00:01, 14.23it/s][A




Episode 4:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
                                                                                                                       [A




Episode 5:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
Training Episodes:  30%|███████████████████▏                                            | 6/20 [00:00<00:01, 13.29it/s][A




Episode 6:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A
                                                                                                                       



[A
Episode 7:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A




Training Episodes:  40%|█████████████████████████▌                                      | 8/20 [00:00<00:00, 12.13it/s][A




Episode 8:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A




                                                                                                                       [A




Episode 9:   0%|                                                                               | 0/100 [00:00<?, ?it/s][A




Training Episodes:  50%|███████████████████████████████▌                               | 10/20 [00:00<00:00, 10.71it/s][A




Episode 10:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
                                                                                                                       [A




Episode 11:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Training Episodes:  60%|█████████████████████████████████████▊                         | 12/20 [00:01<00:00,  9.28it/s][A




Episode 12:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A




Training Episodes:  65%|████████████████████████████████████████▉                      | 13/20 [00:01<00:00,  7.75it/s][A
Episode 13:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Training Episodes:  70%|████████████████████████████████████████████                   | 14/20 [00:01<00:00,  6.15it/s][A




Episode 14:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Episode 14:  86%|██████████████████████████████████████████████████████████▍         | 86/100 [00:00<00:00, 854.77it/s][A
Training Episodes:  75%|███████████████████████████████████████████████▎               | 15/20 [00:01<00:01,  4.86it/s][A




Episode 15:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Episode 15:  97%|█████████████████████████████████████████████████████████████████▉  | 97/100 [00:00<00:00, 967.36it/s][A
Training Episodes:  80%|██████████████████████████████████████████████████▍            | 16/20 [00:02<00:00,  4.21it/s][A




Episode 16:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Training Episodes:  85%|█████████████████████████████████████████████████████▌         | 17/20 [00:02<00:00,  3.77it/s][A




Episode 17:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Episode 17:  82%|███████████████████████████████████████████████████████▊            | 82/100 [00:00<00:00, 803.21it/s][A
Training Episodes:  90%|████████████████████████████████████████████████████████▋      | 18/20 [00:03<00:00,  3.12it/s][A




Episode 18:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Episode 18:  66%|████████████████████████████████████████████▉                       | 66/100 [00:00<00:00, 658.09it/s][A
Training Episodes:  95%|███████████████████████████████████████████████████████████▊   | 19/20 [00:03<00:00,  2.68it/s][A




Episode 19:   0%|                                                                              | 0/100 [00:00<?, ?it/s][A
Episode 19:  75%|███████████████████████████████████████████████████                 | 75/100 [00:00<00:00, 747.22it/s][A
Training Episodes: 100%|███████████████████████████████████████████████████████████████| 20/20 [00:04<00:00,  4.89it/s][A






In [43]:
metrics = trainer.evaluate(10)
print(metrics)

{'win_rate': 0.0, 'avg_turns_to_win': None, 'turns_histogram': Counter()}


In [39]:
# --- Textual Visualization Utilities ---
def print_board(agent: BattleshipDQNAgent):
    H, W = agent.board_shape
    char_map = {WellState.UNKNOWN: '~', WellState.MISS: 'O', WellState.HIT: 'X'}
    print('   ' + ' '.join(f"{c:2d}" for c in range(W)))
    for r in range(H):
        row = ' '.join(char_map[agent.board_state[r, c]] for c in range(W))
        print(f"{r:2d} {row}")
    print()

def simulate_text_episode(agent: BattleshipDQNAgent, env_factory, max_turns: int = 100):
    env = env_factory(agent)
    env.reset()
    print("Initial board:")
    print_board(agent)
    for turn in range(1, max_turns+1):
        action = agent.select_next_move(0.0)
        print(f"Turn {turn}: firing at {action}")
        _, _, done, info = env.step(action)
        print_board(agent)
        if done:
            if info.get('winner') == agent.player_id:
                print(f"Game over: {agent.player_id} wins in {turn} turns")
            else:
                print("Game over: opponent wins or tie.")
            break

# Example:
simulate_text_episode(agent, env_factory)


Initial board:
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 4 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 5 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 6 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 7 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

Turn 1: firing at (3, 4)
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ O ~ ~ ~ ~ ~ ~
 4 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 5 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 6 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 7 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

Turn 2: firing at (1, 0)
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ O ~ ~ ~ ~ ~ ~
 4 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 5 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 6 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 7 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

Game over: opponent wins or tie.


In [45]:
# --- Initial Placement Inspection ---
def print_initial_placements(agent: BattleshipDQNAgent, env_factory):
    """
    Prints the initial ship placements for both players.
    """
    env = env_factory(agent)
    env.reset()
    print(f"Placements for {agent.player_id}:")
    for pl in env._positions[agent.player_id]:
        print(pl)
    print(f"Placements for opponent ({env.agent2.player_id}):")
    for pl in env._positions[env.agent2.player_id]:
        print(pl)
    print()


In [47]:
print_initial_placements(agent, env_factory)


Placements for player_1:
[(0, 3), (0, 4), (0, 5), (0, 6), (0, 7)]
[(4, 5), (4, 6), (4, 7), (4, 8)]
[(1, 3), (1, 4), (1, 5)]
[(6, 2), (6, 3)]
[(1, 2), (2, 2)]
Placements for opponent (player_2):
[(2, 8), (3, 8), (4, 8), (5, 8), (6, 8)]
[(1, 7), (1, 8), (1, 9), (1, 10)]
[(3, 5), (4, 5), (5, 5)]
[(3, 4), (4, 4)]
[(7, 1), (7, 2)]



In [49]:
    device = torch.device('cpu')
    agent = BattleshipDQNAgent("player_test", (8,11), {
        "carrier":    {"length": 5, "count": 1},
        "battleship": {"length": 4, "count": 1},
        "submarine":  {"length": 3, "count": 1},
        "destroyer":  {"length": 2, "count": 2},
    }, device)
    env_factory = lambda a: BattleshipEnvironment(a)

    # Print initial ship placements to verify initialization
    print_initial_placements(agent, env_factory)

    # Optionally simulate one text episode
    simulate_text_episode(agent, env_factory)

Placements for player_test:
[(4, 1), (4, 2), (4, 3), (4, 4), (4, 5)]
[(4, 7), (5, 7), (6, 7), (7, 7)]
[(2, 9), (3, 9), (4, 9)]
[(5, 3), (5, 4)]
[(5, 10), (6, 10)]
Placements for opponent (player_2):
[(0, 6), (1, 6), (2, 6), (3, 6), (4, 6)]
[(2, 9), (3, 9), (4, 9), (5, 9)]
[(0, 8), (0, 9), (0, 10)]
[(4, 10), (5, 10)]
[(5, 7), (5, 8)]

Initial board:
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 4 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 5 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 6 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 7 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

Turn 1: firing at (1, 4)
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ O ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 4 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 5 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 6 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 7 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

Turn 2: firing at (3, 5)
    0  1  2  3  4  5  6  7  8  9 10
 0 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 1 ~ ~ ~ ~ O ~ ~ ~ ~ ~ ~
 2 ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
 3 ~ ~ ~ ~ ~ ~