In [18]:
from chess_env import ChessEnv
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import random


def build_model(input_shape, num_actions):
    """Assemble the fully connected Q-value network.

    The network flattens one observation of shape ``input_shape``, feeds it
    through two ReLU hidden layers (128 units, then 64 units) and finishes
    with a linear layer that has ``num_actions`` outputs.

    Args:
        input_shape: Shape of a single observation, excluding the batch axis.
        num_actions: Number of units in the final linear layer.

    Returns:
        An uncompiled ``keras.Sequential`` model.
    """
    hidden_widths = (128, 64)

    # Layers are created in the same order they are stacked.
    stack = [layers.Input(shape=input_shape), layers.Flatten()]
    stack.extend(layers.Dense(width, activation='relu') for width in hidden_widths)
    stack.append(layers.Dense(num_actions, activation='linear'))

    return keras.Sequential(stack)


In [None]:
# Build and compile the Q-network used by the training loop.
input_shape = (8, 8, 6)  # Adjust based on your observation space
# The `chess` module is never imported in this notebook, so `chess.Board()`
# raised NameError on a fresh kernel. Count the legal moves of a freshly
# constructed environment's board instead.
# NOTE(review): this sizes the output layer by the number of legal moves in the
# position ChessEnv starts from — confirm that matches the action indices
# ChessEnv.step() accepts.
num_actions = len(list(ChessEnv().board.legal_moves))  # Adjust based on your action space
model = build_model(input_shape, num_actions)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')


In [22]:
# Sanity check: show the board of a freshly constructed environment and, on the
# following line, how many legal moves that position offers.
chess_env = ChessEnv()
print(chess_env.board, len(list(chess_env.board.legal_moves)), sep="\n")


r n b q k b n r
p p p p p p p p
. . . . . . . .
. . . . . . . .
. . . . . . . .
. . . . . . . .
P P P P P P P P
R N B Q K B N R
20


In [24]:
# Train the Q-network on ChessEnv with one-step temporal-difference updates.

SEED = 42     # fixed so a Restart & Run All reproduces the same random actions
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value

# Seed every RNG this cell (or the libraries it drives) may draw from.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

env = ChessEnv()
num_episodes = 100
input_shape = (8, 8, 6)  # Adjust based on your observation space
# Derive the action count from this cell's own environment rather than from a
# variable created in another cell, so the cell does not rely on hidden state.
# NOTE(review): this is the number of legal moves in the position ChessEnv
# starts from — confirm that matches the action indices env.step() accepts.
num_actions = len(list(env.board.legal_moves))  # Adjust based on your action space
model = build_model(input_shape, num_actions)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')


for episode in range(num_episodes):
    state = env.reset()
    state = np.expand_dims(state, axis=0)  # add the batch axis the model expects

    total_reward = 0
    done = False
    while not done:
        # Pure random exploration: the action index is drawn uniformly and the
        # network's predictions are not consulted when acting.
        action = np.random.choice(num_actions)
        next_state, reward, done, info = env.step(action)
        next_state = np.expand_dims(next_state, axis=0)

        # TD target. A terminal transition has no successor value, so the target
        # is the reward alone; otherwise bootstrap from the best next-state Q.
        if done:
            target = reward
        else:
            # Call the model directly: Model.predict() is meant for large
            # batches and adds per-call overhead inside a step loop.
            next_q_values = model(next_state, training=False)
            target = reward + GAMMA * float(tf.reduce_max(next_q_values))

        with tf.GradientTape() as tape:
            q_values = model(state, training=True)
            # Keep only the Q-value of the action that was actually taken.
            action_mask = tf.one_hot(action, num_actions)
            q_values = tf.reduce_sum(tf.multiply(q_values, action_mask), axis=1)
            loss = keras.losses.mean_squared_error(target, q_values)

        gradients = tape.gradient(loss, model.trainable_variables)
        model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        total_reward += reward
        state = next_state

    print(f'Episode: {episode}, total_reward: {total_reward}')


Episode: 0, total_reward: 180
Episode: 1, total_reward: -691
Episode: 2, total_reward: 1898
Episode: 3, total_reward: -2601
Episode: 4, total_reward: -288
Episode: 5, total_reward: -233
Episode: 6, total_reward: -499
Episode: 7, total_reward: -272
Episode: 8, total_reward: -244
Episode: 9, total_reward: -440
Episode: 10, total_reward: -436
Episode: 11, total_reward: -30
Episode: 12, total_reward: -3727
Episode: 13, total_reward: 3101
Episode: 14, total_reward: -325
Episode: 15, total_reward: 805
Episode: 16, total_reward: -1361
Episode: 17, total_reward: -540
Episode: 18, total_reward: -981
Episode: 19, total_reward: -181
Episode: 20, total_reward: -432
Episode: 21, total_reward: -471
Episode: 22, total_reward: -405
Episode: 23, total_reward: -261
Episode: 24, total_reward: -438
Episode: 25, total_reward: -164
Episode: 26, total_reward: -217
Episode: 27, total_reward: -392


KeyboardInterrupt: 