# Assignment

In [2]:
# Import 

import numpy as np
import matplotlib.pyplot as plt
from degree_freedom_queen import *
from degree_freedom_king1 import *
from degree_freedom_king2 import *
from generate_game import *
from Chess_env import *
from chess_student import *

size_board = 4

## The Environment

You can find the environment in the file Chess_env, which contains the class Chess_env. To define an object, you need to provide the board size considered as input. In our example, size_board=4. 
Chess_env is composed by the following methods:

1. Initialise_game. The method initialises an episode by placing the three pieces considered (Agent's king and queen, enemy's king) in the chess board. The outputs of the method are described below in order.

     S $\;$ A matrix representing the board locations filled with 4 numbers: 0, no piece in that position; 1, location of the 
     agent's king; 2 location of the queen; 3 location of the enemy king.
     
     X $\;$ The features, that is the input to the neural network. See the assignment for more information regarding the            definition of the features adopted. To personalise this, go into the Features method of the class Chess_env() and change        accordingly.
     
     allowed_a $\;$ The allowed actions that the agent can make. The agent is moving a king, with a total number of 8                possible actions, and a queen, with a total number of $(board_{size}-1)\times 8$ actions. The total number of possible actions correspond      to the sum of the two, but not all actions are allowed in a given position (movements to locations outside the borders or      against chess rules). Thus, the variable allowed_a is a vector that is one (zero) for an action that the agent can (can't)      make. Be careful, apply the policy considered on the actions that are allowed only.
     

2. OneStep. The method performs a one step update of the system. Given as input the action selected by the agent, it updates the chess board by performing that action and the response of the enemy king (which is a random allowed action in the settings considered). The first three outputs are the same as for the Initialise_game method, but the variables are computed for the position reached after the update of the system. The fourth and fifth outputs are:

     R $\;$ The reward. To change this, look at the OneStep method of the class where the rewards are set.
     
     Done $\;$ A variable that is 1 if the episode has ended (checkmate or draw).
     
     
3. Features. Given the chessboard position, the method computes the features.

This information and a quick analysis of the class should be all you need to get going. The other functions that the class exploits are uncommented and constitute an example on how not to write a python code. You can take a look at them if you want, but it is not necessary.






In [3]:
## INITIALISE THE ENVIRONMENT

env=Chess_Env(size_board)

In [4]:
## PRINT 5 STEPS OF AN EPISODE CONSIDERING A RANDOM AGENT

S,X,allowed_a=env.Initialise_game()                       # INTIALISE GAME
print(env.Features())

print(S)                                                  # PRINT CHESS BOARD (SEE THE DESCRIPTION ABOVE)

print('check? ',env.check)                                # PRINT VARIABLE THAT TELLS IF ENEMY KING IS IN CHECK (1) OR NOT (0)
print('dofk2 ',np.sum(env.dfk2_constrain).astype(int))    # PRINT THE NUMBER OF LOCATIONS THAT THE ENEMY KING CAN MOVE TO


for i in range(5):
    
    a,_=np.where(allowed_a==1)                  # FIND WHAT THE ALLOWED ACTIONS ARE
    a_agent=np.random.permutation(a)[0]         # MAKE A RANDOM ACTION

    S,X,allowed_a,R,Done=env.OneStep(a_agent)   # UPDATE THE ENVIRONMENT
    
    
    ## PRINT CHESS BOARD AND VARIABLES
    print('')
    print(S)
    print(R,'', Done)
    print('check? ',env.check)
    print('dofk2 ',np.sum(env.dfk2_constrain).astype(int))
    
    
    # TERMINATE THE EPISODE IF Done=True (DRAW OR CHECKMATE)
    if Done:
        break

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[[0 0 0 3]
 [0 0 0 0]
 [0 0 2 0]
 [0 1 0 0]]
check?  0
dofk2  0

[[0 0 0 0]
 [0 0 0 3]
 [2 0 0 0]
 [0 1 0 0]]
0  0
check?  0
dofk2  2

[[0 0 0 0]
 [0 0 0 0]
 [2 1 0 3]
 [0 0 0 0]]
0  0
check?  0
dofk2  2

[[0 0 0 0]
 [0 0 0 3]
 [0 1 0 0]
 [2 0 0 0]]
0  0
check?  0
dofk2  3

[[0 0 0 3]
 [2 0 0 0]
 [0 1 0 0]
 [0 0 0 0]]
0  0
check?  0
dofk2  1

[[0 0 3 0]
 [2 0 0 0]
 [0 0 0 0]
 [0 1 0 0]]
0  0
check?  0
dofk2  1


In [11]:
# INITIALISE THE PARAMETERS OF YOUR NEURAL NETWORK AND...
# PLEASE CONSIDER TO USE A MASK OF ONE FOR THE ACTION MADE AND ZERO OTHERWISE IF YOU ARE NOT USING VANILLA GRADIENT DESCENT...
# WE SUGGEST A NETWORK WITH ONE HIDDEN LAYER WITH SIZE 200.
from Q_values import Q_values

S,X,allowed_a=env.Initialise_game()
N_a=np.shape(allowed_a)[0]   # TOTAL NUMBER OF POSSIBLE ACTIONS

N_in=np.shape(X)[0]    ## INPUT SIZE
N_h=200                ## NUMBER OF HIDDEN NODES

## INITALISE YOUR NEURAL NETWORK...
q_value = Q_values([N_in, N_h, N_a])
# HYPERPARAMETERS SUGGESTED (FOR A GRID SIZE OF 4)

epsilon_0 = 0.2     # STARTING VALUE OF EPSILON FOR THE EPSILON-GREEDY POLICY
beta = 0.00005      # THE PARAMETER SETS HOW QUICKLY THE VALUE OF EPSILON IS DECAYING (SEE epsilon_f BELOW)
gamma = 0.85        # THE DISCOUNT FACTOR
eta = 0.0035        # THE LEARNING RATE

N_episodes = 100000 # THE NUMBER OF GAMES TO BE PLAYED

def agent_action(value_function_model, X, allowed_a):
    (V,_) = value_function_model(X)
    allow_v = np.copy(V)
    allow_v[np.where(allowed_a.flatten()!=1)] = 0
    if (np.max(allow_v) == 0):
        return np.random.permutation(np.where(allowed_a.flatten()==1)[0])[0]
    else:
        return np.argmax(allow_v)

def play_one_game(value_function):
    S,X,allowed_a=env.Initialise_game()     # INITIALISE GAME
    Done=0                                  # SET Done=0 AT THE BEGINNING
    i=1                                     # COUNTER FOR THE NUMBER OF ACTIONS (MOVES) IN AN EPISODE
    while Done==0:
        a_agent = agent_action(value_function, X, allowed_a)
        S,X,allowed_a,R,Done=env.OneStep(a_agent)
        if Done:
            return (R,i)
        if i>16:
            return (R,i)
        i=i+1
def period_validate(function_model, validate_games):
    # SAVING VARIABLES
    R_save_temp = np.zeros([validate_games, 1])
    N_moves_save_temp = np.zeros([validate_games, 1])
    for n in range(validate_games):
        (R, n_moves) = play_one_game(function_model)
        R_save_temp[n]=np.copy(R)
        N_moves_save_temp[n]=np.copy(n_moves)
    return (np.mean(R_save_temp),np.mean(N_moves_save_temp))

def train(N_episodes, epsilon_0):
    # TRAINING LOOP BONE STRUCTURE
    R_seq = []
    N_step_seq = []
    x_axis = []
    dead_seq = []

    for n in range(N_episodes):
        epsilon_f = epsilon_0 / (1 + beta * n)   ## DECAYING EPSILON
        Done=0                                   ## SET DONE TO ZERO (BEGINNING OF THE EPISODE)
        S,X,allowed_a=env.Initialise_game()
        ## INITIALISE GAME
        while Done==0:                           ## START THE EPISODE
            ## THIS IS A RANDOM AGENT, CHANGE IT...
            (qv, neuron_value) = q_value.q_values(X)
            a_agent=epsilon_greedy(qv, np.asarray(allowed_a), epsilon_f)

            S_next,X_next,allowed_a_next,R,Done=env.OneStep(a_agent)

            ## THE EPISODE HAS ENDED, UPDATE...BE CAREFUL, THIS IS THE LAST STEP OF THE EPISODE
            if Done==1:
                q_value.update_q_func(eta, neuron_value, a_agent, R, qv)
                break
            # IF THE EPISODE IS NOT OVER...
            else:
                (Q_next, _) = q_value.q_values(X_next)
                future_R = R + gamma * np.max(Q_next)
                q_value.update_q_func(eta, neuron_value, a_agent, future_R, qv)
            # NEXT STATE AND CO. BECOME ACTUAL STATE...
            S=np.copy(S_next)
            X=np.copy(X_next)
            allowed_a=np.copy(allowed_a_next)

        ## print process
        ## temp validate
        if (n % 1000 == 0 ):
            (avg_R, avg_n) = period_validate(lambda X_in: q_value.q_values(X_in), 100)
            print(f"\r Training process {np.round(n/N_episodes*100,2)}%  "
                  f"avg_R: {avg_R}, avg_steps:{avg_n}", end="", flush=True)

            x_axis.append(n)
            R_seq.append(avg_R)
            N_step_seq.append(avg_n)
            dead_neuron = 0
            for (w,b) in q_value.nn.W_bias:
                for neuron_idx in range(w.shape[1]):
                    if np.where(w[:, neuron_idx]==0)[0].shape[0] == w[:, neuron_idx].shape[0]:
                        dead_neuron+=1

            plt.figure(1)
            plt.plot(n, dead_neuron)

train(N_episodes, epsilon_0)

 Training process 5.0%  avg_R: 0.25, avg_steps:11.57

In [6]:
# PERFORM N_episodes=1000 EPISODES MAKING RANDOM ACTIONS AND COMPUTE THE AVERAGE REWARD AND NUMBER OF MOVES
def random_test():
    N_episodes=1000
    R_save_random = np.zeros([N_episodes, 1])
    N_moves_save_random = np.zeros([N_episodes, 1])
    for n in range(N_episodes):
        (R,i) = play_one_game(lambda X_in: (np.random.random(32), [] ))
        R_save_random[n]=np.copy(R)
        N_moves_save_random[n]=np.copy(i)
    print('Random_Agent, Average reward:',np.mean(R_save_random),'Number of steps: ',np.mean(N_moves_save_random))
random_test()

Random_Agent, Average reward: 0.209 Number of steps:  6.11
