# Programming Exercise: Function Approximation

In [None]:
# Necessary Imports

from typing import Any, Dict, Callable
from dataclasses import dataclass
from functools import cached_property, partial

import numpy as np
import gymnasium as gym
from gymnasium import spaces

***Below is the introduction of RL team***

## Problem statement
The task of the programming exercise is to program an agent that learns to play Tic-Tac-Toe against different opponents. 

## Opponent Policies

We load and initialize four opponent policies of increasing strength from .json-files. They are stored in global variables and can therefore be easily changed at any point of the notebook.

For loading the policies, the folder `Opponent_Policies` containing the .json-files of the policies should lie in the same directory as this notebook.  

In [None]:
"""
Copyright by Reinforcement Learning 23/24 team of Saarland University
"""

# Load opponent policy from .json-file. 

import json
from pathlib import Path

# Directory that holds the opponent policy .json-files; change it to load policies from elsewhere.
opponent_policy_file = Path('Opponent_Policies')

# Every policy file is parsed into its own global variable so that any of them can be selected later.
with (opponent_policy_file / 'policy1.json').open() as json_file:
    opponent_policy_1 = json.load(json_file)

with (opponent_policy_file / 'policy2.json').open() as json_file:
    opponent_policy_2 = json.load(json_file)

with (opponent_policy_file / 'policy3.json').open() as json_file:
    opponent_policy_3 = json.load(json_file)

with (opponent_policy_file / 'policy4.json').open() as json_file:
    opponent_policy_4 = json.load(json_file)

# Active opponent policy: rebind to one of the policies loaded above to play against a different opponent.
opponent_policy_dict = opponent_policy_1

## Gymnasium Environment for Tic-Tac-Toe

We implement a Gymnasium environment simulating a game of Tic-Tac-Toe. We thereby use
- as possible field values  $V = \{0,1,2\}$, whereby $v = 0$ stands for a 'O'-field, $v = 1$ for an empty field, and $v = 2$ for a 'X'-field.
- as state space $S = V^{3 \times 3}$. A state `s` is stored as a `list[list[int]]`, `s[i][j]` refers then to the value in the i-th row in the j-th column.
- action space $A = V \times V = \{(0,0),(0,1),(0,2),(1,0),(1,1),(1,2), (2,0),(2,1),(2,2)\}$.

### Environment Dynamics

We implement Tic-Tac-Toe as a sequential decision problem. The agent plays against a specified opponent policy (see above). One step of the environment looks as follows: 
1. Perform the move of the agent. The agent marks fields with 'X'.
2. Check whether this has finished the game, i.e. win for the agent or draw. If the game is finished, terminate episode and compute the reward. 
3. Perform the move of the opponent. The opponent marks fields with 'O'.
4. Check whether this has finished the game, i.e. win for the opponent or draw. If the game is finished, terminate episode and compute the reward. 

#### Initial state
We randomize whether the agent or the opponent starts with the first move. Hence, the initial state of the sequential decision problem is either
- a completely empty field, for the case that the agent has the first move, or
- a field with one 'O', for the case that the opponent has the first move. 


#### Rewards
Rewards are only gained when the game is finished: 
- Reward of 1, if the agent wins. 
- Reward of 0, if the game ends in a draw. 
- Reward of -1, if the opponent wins. 

#### Executable Actions 
Notice that not all actions are always executable: If a field `s[i][j]` is non-empty, then the action $(i,j)$ is not executable. If the agent tries to perform a non-executable action, the environment raises an Exception. Hence, make sure that the agent only picks executable actions (the opponent policy chooses only executable actions as well).

In [None]:
"""
Copyright by Reinforcement Learning 23/24 team of Saarland University
"""

# Some preliminary and auxiliary definitions 

# Integer codes stored in the board fields: 'O' is 0, an empty field is 1, 'X' is 2
CIRCLE = 0
EMPTY = 1
CROSS = 2

def get_rows(state: list[list[int]]) -> tuple[list[list[int]], list[list[int]], list[list[int]]]:
    """
    Helper function: Returns the rows, the columns, and the two diagonals of a 3x3 board.

    state: board given as three rows of three field values each.

    Return: (rows, columns, diagonals).
    - rows is the `state` object itself (not a copy).
    - columns holds the three columns from left to right, each read top to bottom.
    - diagonals holds the main diagonal (top-left to bottom-right) followed by
      the anti-diagonal read from bottom-left to top-right.

    """

    # The board is already stored row by row
    rows = state

    # Column j collects the j-th entry of every row
    columns = [[state[i][j] for i in range(3)] for j in range(3)]

    # Main diagonal and anti-diagonal
    diagonals = [
        [state[i][i] for i in range(3)],
        [state[2 - i][i] for i in range(3)],
    ]

    return rows, columns, diagonals

# Gymnasium environment for Tic-Tac-Toe
class SysadminEnv(gym.Env):
    """
    Tic-Tac-Toe as a Gymnasium environment: the agent marks fields with 'X' and plays
    against the opponent policy currently stored in the global `opponent_policy_dict`,
    which marks fields with 'O'.

    The board lives in `self._state` as a list of three rows and only exists after the
    first call to `reset`. `reset` and `step` hand out this very list object, and later
    moves mutate it in place.

    All random decisions are drawn from `self.np_random`, the generator that
    `gym.Env.reset(seed=...)` seeds, so passing a seed makes them reproducible.
    """

    def __init__(
        self,
    ) -> None:

        super().__init__()
        self.action_space = spaces.MultiDiscrete([3, 3])  # Action space: (row, column)
        self.observation_space = spaces.MultiDiscrete([[3, 3, 3], [3, 3, 3], [3, 3, 3]])  # State space
        self.reset_counter = 0  # Number of times reset() has been called


    @property
    def get_reset_counter(self):
        """
        Returns how often `reset` has been called on this environment.

        """
        return self.reset_counter


    @property
    def occupied_fields(self) -> int | None:
        """
        Returns the number of occupied fields, or None if the environment has not been reset yet.

        """
        if not hasattr(self, "_state"):
            return None

        return sum(v != EMPTY for row in self._state for v in row)


    @property
    def game_finished(self) -> int | None:
        """
        Returns None if game is not finished.

        Returns 0 if circle wins.
        Returns 1 if it is a draw.
        Returns 2 if crosses wins.

        """
        rows, columns, diagonals = get_rows(self._state)

        for line in rows + columns + diagonals:
            if all(v == CROSS for v in line):
                return 2
            if all(v == CIRCLE for v in line):
                return 0

        # No complete line: a full board is a draw, otherwise the game continues
        if self.occupied_fields == 9:
            return 1
        return None


    def opponent_policy(self) -> list[int]:
        """
        Takes random action from the list of moves of the opponent policy.

        The list is looked up in the global `opponent_policy_dict` under the string
        representation of the current board.

        Raises an Exception if the environment has not been reset yet.
        """

        if not hasattr(self, "_state"):
            raise Exception("Unable to find opponent move in uninitialized environment.")

        opponent_action_list = opponent_policy_dict[str(self._state)]
        # Draw from the environment's own generator so that reset(seed=...) takes effect
        return opponent_action_list[self.np_random.integers(len(opponent_action_list))]


    def perform_move(self, move: tuple[int, int], cross: bool):
        """
        Marks the field move = (row, column) with 'X' if `cross` is True, otherwise with 'O'.

        Raises an Exception if the environment has not been reset yet or if the field
        is already occupied.

        """
        if not hasattr(self, "_state"):
            raise Exception("Unable to perform move in uninitialized environment.")

        row, column = move[0], move[1]
        if self._state[row][column] != EMPTY:
            raise Exception("Unable to perform move on occupied field.")

        self._state[row][column] = CROSS if cross else CIRCLE


    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[list[list[int]], dict[str, Any]]:
        """
        Resets the environment to its initial state.

        With probability 0.5 the opponent makes the first move, so the returned board
        is either empty or contains a single 'O'.

        Return: initial_state, information_dictionary (always empty)

        """

        # Seeds self.np_random when a seed is given
        super().reset(seed=seed)

        # increment reset_counter
        self.reset_counter += 1

        # All fields are empty initially
        self._state = [[EMPTY] * 3 for _ in range(3)]

        # Random choice whether agent or opponent makes the first move.
        # In case of opponent, first move of opponent is performed.
        if self.np_random.random() < 0.5:
            self.perform_move(self.opponent_policy(), False)

        return self._state, dict()


    def step(self, action: tuple[int, int]) -> tuple[list[list[int]], float, bool, bool, dict[str, Any]]:
        """
        Performs a step in the environment given an action of the agent.

        Return: new_state, reward, done, truncated, information_dictionary (last two return values are irrelevant for our purposes)

        The reward is 1 if the agent wins, -1 if the opponent wins, and 0 otherwise.
        Raises an Exception if the chosen field is already occupied.

        """

        # Perform agent's move
        self.perform_move(action, True)

        # Check whether the agent's move finished the game (evaluate the board only once)
        finished = self.game_finished
        if finished == 2:
            return self._state, 1, True, False, dict()
        if finished == 1:
            return self._state, 0, True, False, dict()

        # Perform opponent's move
        self.perform_move(self.opponent_policy(), False)

        # Check whether the opponent's move finished the game
        finished = self.game_finished
        if finished is None:
            return self._state, 0, False, False, dict()
        if finished == 0:
            return self._state, -1, True, False, dict()
        # Only a draw remains; always return a full tuple instead of falling through to None
        return self._state, 0, True, False, dict()


    def display(self):
        """
        Prints the current state of the field to the command line.

        Raises an Exception if the environment has not been reset yet or if the board
        contains a value other than CROSS, EMPTY, or CIRCLE.

        """

        if not hasattr(self, "_state"):
            raise Exception("Unable to visualize uninitialized environment.")

        symbols = {CROSS: "X", EMPTY: " ", CIRCLE: "O"}

        # Translate the whole board first so that nothing is printed for an invalid board
        res = []
        for row in self._state:
            symbols_row = []
            for v in row:
                if v not in symbols:
                    raise Exception("Invalid value in TicTacToe Field")
                symbols_row.append(symbols[v])
            res.append(symbols_row)

        for symbols_row in res:
            print(symbols_row)

        print("\n")

# Register environment
gym.register("Sysadmin-ED", partial(SysadminEnv))
env = gym.make("Sysadmin-ED")

# Example on how to experiment with the environment. 

env.reset(seed=42)

# NOTE: this raises if the opponent made the first move on field (1,2), since that field is then occupied.
print(env.step((1,2)))

# display() is defined on SysadminEnv itself, so access it on the unwrapped environment
# instead of relying on the wrappers returned by gym.make to forward the attribute.
env.unwrapped.display()


***Above is the instruction provided by the RL 23/24 team***

## Exercise (Description) 

Briefly describe the learning algorithm you have implemented. When using function approximation, describe in particular the features that you are using. 

Parameterized function approximators are used to represent the action-value function. For the approximators, instead of the linear ones, we choose to implement deep neural networks (DNNs) to avoid handcrafting a suitable set of features.  

The architecture is shown as below.   

```python
DQN(  
  (layer1): Linear(in_features=9, out_features=32, bias=True)  
  (layer2): Linear(in_features=32, out_features=32, bias=True)  
  (layer3): Linear(in_features=32, out_features=9, bias=True)  
)  
```
The number of parameters:  1673

The input is a 9-dimensional vector, which is the flattened version of the 3x3 board. The output is a 9-dimensional vector containing the Q-values of the 9 actions. To avoid choosing non-executable actions, we mask them by setting their Q-values to -inf (both when selecting actions and while training). With probability 1-epsilon the action with the highest Q-value is chosen; otherwise a random executable action is chosen. 

### Training
Deep learning often requires large amounts of data -- thus we needed to implement bootstrapping to generate more data. We use the following bootstrapping strategy:
- We start with a random policy and play 1000 episodes.
- We then use the trained policy to play 1000 episodes and store the transitions in a replay buffer.
- We use batch gradient descent to train the network on the replay buffer, which speeds up the training process.

We use two networks -- the policy and target. The policy network is updated using the Adam optimizer based on the Q-value errors. The target network is updated as a running average of the policy network. 

The target for the policy network's loss is initialized to 0, which is the value used for terminal next states. For non-terminal next states, the Q-values of all invalid actions are masked to -inf, and the reward plus the best remaining action value (generated using the target network) is used as the target Q-value. The loss is then computed as the `SmoothL1Loss` between the target and the predicted Q-values.

The epsilon-greedy policy is used to balance exploration and exploitation. The epsilon is exponentially annealed from 0.8 to 0.05 during training.

## Exercise (Implementation of DQN)

Implement a learning algorithm that learns to play Tic-Tac-Toe. For function approximation learning, the function `RL_algorithm` should return the learned feature weights.

In the case that you are not using function approximation but a different RL algorithm, the arguments and return types of the functions below are allowed to be changed.

In [None]:
# Given hyperparameter gamma
gamma = 1  # discount factor multiplied with the bootstrapped next-state value in optimize_model

from collections import deque, namedtuple
import random
from itertools import count
import os
# Set before torch is imported; presumably works around the duplicate-OpenMP-runtime abort -- TODO confirm
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

# Run on the GPU if torch reports one, otherwise on the CPU; all tensors and networks below use this device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# NOTE(review): this is set only after torch.cuda.is_available() has been called, so it may come too late to influence device selection -- confirm
os.environ['CUDA_VISIBLE_DEVICES'] ='0'


# One replay-memory record; ReplayMemory.push fills the fields positionally in this order
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'action_mask'],
)

class ReplayMemory:
    """Bounded first-in-first-out store of `Transition` records."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry once `capacity` entries are stored
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Builds a `Transition` from the positional arguments and appends it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Returns `batch_size` stored entries chosen by `random.sample`."""
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Returns the number of entries currently stored."""
        return len(self.memory)
    
class DQN(nn.Module):
    """
    Fully connected Q-network: two ReLU hidden layers of equal width followed by a
    linear output layer with one value per action.

    n_observations: size of the input vector.
    n_actions: size of the output vector.
    hidden_size: width of both hidden layers (defaults to 32).
    """

    def __init__(self, n_observations, n_actions, hidden_size=32):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Maps an input of size n_observations (optionally batched) to n_actions values."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        # No activation on the output layer: the outputs are used as unbounded Q-values
        return self.layer3(x)

    def get_parameter_number(self):
        """Returns a dict with the 'Total' and the 'Trainable' number of parameters."""
        total_num = sum(p.numel() for p in self.parameters())
        trainable_num = sum(p.numel() for p in self.parameters() if p.requires_grad)
        return {'Total': total_num, 'Trainable': trainable_num}
    
# Training hyperparameters
BATCH_SIZE = 80   # number of transitions sampled from the replay memory per update
EPS_START = 0.8   # epsilon of the e-greedy policy at the start of training
EPS_END = 0.05    # value that epsilon decays towards
EPS_DECAY = 1000  # time constant of the exponential epsilon decay; larger means slower decay
TAU = 0.01        # weight of the policy net in the soft update of the target net
LR = 1e-4         # learning rate of the optimizer

# Network dimensions: the flattened 3x3 board as input, one output per field
n_observations = 9
n_actions = 9

def agent_policy(state):
    """
    Policy of the agent: Given the environment state, returns the executable action with
    the highest Q-value according to the global `policy_net` (greedy, no exploration).

    state: the 3x3 board as handed out by the environment.

    Return: (row, column) of the chosen field as plain Python ints.

    """

    state = torch.tensor(state, device=device, dtype=torch.float).flatten()
    # Fields that are not available must never be selected
    blocked = available_actions(state) == 0

    with torch.no_grad():
        q_values = policy_net(state)
        # Mask non-executable actions in one vectorized step instead of rebuilding the tensor element-wise
        q_values = q_values.masked_fill(blocked, -float("Inf"))
        action_index = torch.argmax(q_values).item()

    # The flat index enumerates the board row by row
    return divmod(action_index, 3)
    
def training_algorithm(num_episodes: int, env: gym.Env, training_episode):
    """
    Reinforcement learning algorithm: plays the given number of training episodes, i.e. env.reset() is called num_episodes many times, stores every transition in the global replay `memory`, and trains the global `policy_net` / `target_net`.

    Inputs: 
    num_episodes: number of training episodes
    env: gymnasium environment
    training_episode: number of counted steps (see `step` below) after which the networks start to be updated

    """

    # Counts the non-terminal agent moves made so far (the move that ends an episode
    # is not counted); controls when the network updates start
    step = 0
    with tqdm(range(num_episodes)) as pbar:
        for _ in pbar:
            pbar.set_description("Training: "+str(step))

            # initialize the state and its respective action mask
            state, _ = env.reset()
            state = torch.tensor(state, device=device, dtype=torch.float).flatten()
            # available_actions already returns a tensor on `device`, so it is used as is
            action_mask = available_actions(state)

            while True:
                # generate action and based on that update the env
                action_index, action = select_action(state, action_mask)
                observation, reward, done, _, _ = env.step(action)
                reward = torch.tensor([reward], device=device)

                # if game is not over, generate new action mask for next state
                if done:
                    next_state = None
                    action_mask = None
                else:
                    next_state = torch.tensor(observation, device=device, dtype=torch.float).flatten()
                    action_mask = available_actions(next_state)

                # stack the data into memory; the stored mask belongs to next_state
                memory.push(state, action_index, next_state, reward, action_mask)

                state = next_state

                # Once enough steps were collected: one optimization step, then
                # soft update of the target network's weights
                # θ′ ← τ θ + (1 −τ )θ′
                if step >= training_episode:
                    optimize_model()
                    target_net_state_dict = target_net.state_dict()
                    policy_net_state_dict = policy_net.state_dict()
                    for key in policy_net_state_dict:
                        target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
                    target_net.load_state_dict(target_net_state_dict)

                if done:
                    break
                step += 1

    print('Complete')
    
def select_action(state, available_actions_mask):
    """
    Epsilon-greedy action selection used during training.

    Inputs: 
    state: current (flattened) state that the policy net maps to Q-values
    available_actions_mask: mask with a non-zero entry for every executable action

    Returns: action_index, action 
    (flat index of the chosen field and its [row, column], both as int16 tensors)

    Every call increments the global `steps_done`, which drives the epsilon decay.
    """
    global steps_done

    # epsilon decays exponentially from EPS_START towards EPS_END
    draw = random.random()
    decay = np.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # explore: uniform choice among the executable actions
        candidates = [i for i in range(n_actions) if available_actions_mask[i]!=0 ]
        action_index = np.random.choice(candidates)
    else:
        # exploit: best executable action according to the policy net
        with torch.no_grad():
            q_values = policy_net(state)
            masked_q = [q_values[i] if available_actions_mask[i]!=0. else -float("Inf") for i in range(n_actions) ]
            masked_q = torch.tensor(masked_q, device = device, dtype = torch.float)
            action_index = torch.argmax(masked_q).item()

    # the flat index enumerates the board row by row
    row = np.floor(action_index / 3).astype(int)
    column = int(action_index % 3)
    index_tensor = torch.tensor(action_index, device = device, dtype=torch.int16)
    move_tensor = torch.tensor([row, column], device = device, dtype = torch.int16)
    return index_tensor, move_tensor

def optimize_model():
    """
    Performs one optimization step of the global `policy_net` on a batch sampled from
    the global replay `memory`.

    The target for each sampled transition is
        reward + gamma * max_a Q_target(next_state, a),
    where the maximum only ranges over the actions executable in next_state and the
    bootstrapped term is 0 for terminal transitions (next_state is None).

    Does nothing while the memory holds fewer than BATCH_SIZE transitions.
    """
    # Sample a batch from the memory, if not enough samples, skip the update
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # Mask of the transitions whose next state is not terminal
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)

    # extract the batch elements
    state_batch = torch.cat(batch.state).view(BATCH_SIZE,-1)
    action_batch = torch.tensor(batch.action, device = device, dtype=torch.int64).view(BATCH_SIZE,-1)
    reward_batch = torch.cat(batch.reward).view(BATCH_SIZE,-1)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions which would have been taken according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states; terminal transitions keep the value 0
    next_state_values = torch.zeros(BATCH_SIZE, 1, device=device)

    # Guard against a batch that consists of terminal transitions only:
    # torch.cat / torch.stack on an empty list would raise
    if non_final_mask.any():
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]).view(-1, n_observations)
        # Action masks of exactly the non-terminal transitions, in batch order
        executable = torch.stack(
            [m for s, m in zip(batch.next_state, batch.action_mask) if s is not None]) != 0

        with torch.no_grad():
            target_q = target_net(non_final_next_states)
            # Non-executable actions must not contribute to the maximum
            target_q = target_q.masked_fill(~executable, -float("Inf"))
            next_state_values[non_final_mask] = target_q.max(1, keepdim=True).values

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * gamma) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()

def available_actions(state):
    """
    Builds the mask of executable actions for a flattened board.

    state: indexable object holding the nine field values of the board.

    Return: int16 tensor of length 9 on `device`; entry i is 1 if state[i] equals 1.0
    (the code of an empty field) and 0 otherwise.

    """
    flags = [1 if state[cell] == 1.0 else 0 for cell in range(9)]
    return torch.tensor(flags,device =device, dtype = torch.int16)

## Policy Evaluation 

We evaluate the learned policies multiple times against the different opponent policies using the script below.  

### Opponent 1

In [None]:
def evaluate_policy(eval_episodes: int):
    """
    Evaluates the agent's greedy policy (`agent_policy`) on the global `env` by simulating the given number of episodes. 
    Returns the overall number of wins, draws, losses, and the statistical mean of the episode returns.

    """

    returns = []
    wins, draws, looses = 0,0,0

    for _ in range(eval_episodes):

        state, _ = env.reset()
        done = False

        while not done:
            action = agent_policy(state)
            # Continue from the observation returned by step() instead of relying on the
            # environment mutating the object that reset() handed out
            state, reward, done, _, _ = env.step(action)

        # The reward of the final step is the return of the episode
        if reward == 1:
            wins = wins + 1
        elif reward == 0:
            draws = draws + 1
        elif reward == -1:
            looses = looses + 1

        returns.append(reward)

    return wins, draws, looses, np.mean(returns)


opponent_policy_dict = opponent_policy_1 # Change to play against different opponent policy.

# Policy testing
training_episodes = 5000 # Number of training episodes
test_episodes = 100 # Number of test episodes
test_runs = 5 # Number of test runs

# Every test run trains a fresh agent from scratch and evaluates it afterwards
for i in range (test_runs):
    env = gym.make("Sysadmin-ED") 

    # create the policy and target nets and make them identical
    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    print(policy_net)
    print(policy_net.get_parameter_number())
    optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

    # controls the exponential decay of epsilon
    steps_done = 0
    
    # initialize the replay memory
    memory = ReplayMemory(5000)
    
    # start training; network updates begin after 3500 counted steps
    training_algorithm(training_episodes, env, 3500)
    
    # Check that number of episodes is not exceeded.
    # The counter lives on SysadminEnv itself, so read it from the unwrapped environment
    # instead of relying on the wrappers returned by gym.make to forward the attribute.
    if env.unwrapped.get_reset_counter > training_episodes:
        raise RuntimeError("Exceeded maximal number of calls of reset function")
    
    wins, draws, looses, average_return = evaluate_policy(test_episodes) # evaluate the learned policy
    print(f"Training iteration {i}: Wins: {wins}, Draws: {draws}, Looses: {looses}, Average Return: {average_return}") # print results of the current test run


### Opponent 2

In [None]:
def evaluate_policy(eval_episodes: int):
    """
    Evaluates the agent's greedy policy (`agent_policy`) on the global `env` by simulating the given number of episodes. 
    Returns the overall number of wins, draws, losses, and the statistical mean of the episode returns.

    """

    returns = []
    wins, draws, looses = 0,0,0

    for _ in range(eval_episodes):

        state, _ = env.reset()
        done = False

        while not done:
            action = agent_policy(state)
            # Continue from the observation returned by step() instead of relying on the
            # environment mutating the object that reset() handed out
            state, reward, done, _, _ = env.step(action)

        # The reward of the final step is the return of the episode
        if reward == 1:
            wins = wins + 1
        elif reward == 0:
            draws = draws + 1
        elif reward == -1:
            looses = looses + 1

        returns.append(reward)

    return wins, draws, looses, np.mean(returns)


opponent_policy_dict = opponent_policy_2 # Change to play against different opponent policy.

# Policy testing
training_episodes = 5000 # Number of training episodes
test_episodes = 100 # Number of test episodes
test_runs = 5 # Number of test runs

# Every test run trains a fresh agent from scratch and evaluates it afterwards
for i in range (test_runs):
    env = gym.make("Sysadmin-ED") 

    # create the policy and target nets and make them identical
    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    print(policy_net)
    print(policy_net.get_parameter_number())
    optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

    # controls the exponential decay of epsilon
    steps_done = 0
    
    # initialize the replay memory
    memory = ReplayMemory(5000)
    
    # start training; network updates begin after 3500 counted steps
    training_algorithm(training_episodes, env, 3500)
    
    # Check that number of episodes is not exceeded.
    # The counter lives on SysadminEnv itself, so read it from the unwrapped environment
    # instead of relying on the wrappers returned by gym.make to forward the attribute.
    if env.unwrapped.get_reset_counter > training_episodes:
        raise RuntimeError("Exceeded maximal number of calls of reset function")
    
    wins, draws, looses, average_return = evaluate_policy(test_episodes) # evaluate the learned policy
    print(f"Training iteration {i}: Wins: {wins}, Draws: {draws}, Looses: {looses}, Average Return: {average_return}") # print results of the current test run


### Opponent 3

In [None]:
def evaluate_policy(eval_episodes: int):
    """
    Evaluates the agent's greedy policy (`agent_policy`) on the global `env` by simulating the given number of episodes. 
    Returns the overall number of wins, draws, losses, and the statistical mean of the episode returns.

    """

    returns = []
    wins, draws, looses = 0,0,0

    for _ in range(eval_episodes):

        state, _ = env.reset()
        done = False

        while not done:
            action = agent_policy(state)
            # Continue from the observation returned by step() instead of relying on the
            # environment mutating the object that reset() handed out
            state, reward, done, _, _ = env.step(action)

        # The reward of the final step is the return of the episode
        if reward == 1:
            wins = wins + 1
        elif reward == 0:
            draws = draws + 1
        elif reward == -1:
            looses = looses + 1

        returns.append(reward)

    return wins, draws, looses, np.mean(returns)


opponent_policy_dict = opponent_policy_3 # Change to play against different opponent policy.

# Policy testing
training_episodes = 5000 # Number of training episodes
test_episodes = 100 # Number of test episodes
test_runs = 5 # Number of test runs

# Every test run trains a fresh agent from scratch and evaluates it afterwards
for i in range (test_runs):
    env = gym.make("Sysadmin-ED") 

    # create the policy and target nets and make them identical
    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    print(policy_net)
    print(policy_net.get_parameter_number())
    optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

    # controls the exponential decay of epsilon
    steps_done = 0
    
    # initialize the replay memory
    memory = ReplayMemory(5000)
    
    # start training; network updates begin after 3500 counted steps
    training_algorithm(training_episodes, env, 3500)
    
    # Check that number of episodes is not exceeded.
    # The counter lives on SysadminEnv itself, so read it from the unwrapped environment
    # instead of relying on the wrappers returned by gym.make to forward the attribute.
    if env.unwrapped.get_reset_counter > training_episodes:
        raise RuntimeError("Exceeded maximal number of calls of reset function")
    
    wins, draws, looses, average_return = evaluate_policy(test_episodes) # evaluate the learned policy
    print(f"Training iteration {i}: Wins: {wins}, Draws: {draws}, Looses: {looses}, Average Return: {average_return}") # print results of the current test run


### Opponent 4

In [None]:
def evaluate_policy(eval_episodes: int):
    """
    Evaluates the agent's greedy policy (`agent_policy`) on the global `env` by simulating the given number of episodes. 
    Returns the overall number of wins, draws, losses, and the statistical mean of the episode returns.

    """

    returns = []
    wins, draws, looses = 0,0,0

    for _ in range(eval_episodes):

        state, _ = env.reset()
        done = False

        while not done:
            action = agent_policy(state)
            # Continue from the observation returned by step() instead of relying on the
            # environment mutating the object that reset() handed out
            state, reward, done, _, _ = env.step(action)

        # The reward of the final step is the return of the episode
        if reward == 1:
            wins = wins + 1
        elif reward == 0:
            draws = draws + 1
        elif reward == -1:
            looses = looses + 1

        returns.append(reward)

    return wins, draws, looses, np.mean(returns)


opponent_policy_dict = opponent_policy_4 # Change to play against different opponent policy.

# Policy testing
training_episodes = 5000 # Number of training episodes
test_episodes = 100 # Number of test episodes
test_runs = 5 # Number of test runs

# Every test run trains a fresh agent from scratch and evaluates it afterwards
for i in range (test_runs):
    env = gym.make("Sysadmin-ED") 

    # create the policy and target nets and make them identical
    policy_net = DQN(n_observations, n_actions).to(device)
    target_net = DQN(n_observations, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
    print(policy_net)
    print(policy_net.get_parameter_number())
    # controls the exponential decay of epsilon
    steps_done = 0
    
    # initialize the replay memory
    memory = ReplayMemory(5000)
    
    # start training; network updates begin after 3500 counted steps
    training_algorithm(training_episodes, env, 3500)
    
    # Check that number of episodes is not exceeded.
    # The counter lives on SysadminEnv itself, so read it from the unwrapped environment
    # instead of relying on the wrappers returned by gym.make to forward the attribute.
    if env.unwrapped.get_reset_counter > training_episodes:
        raise RuntimeError("Exceeded maximal number of calls of reset function")
    
    wins, draws, looses, average_return = evaluate_policy(test_episodes) # evaluate the learned policy
    print(f"Training iteration {i}: Wins: {wins}, Draws: {draws}, Looses: {looses}, Average Return: {average_return}") # print results of the current test run
