In [42]:
# Imports that are needed
import torch
import torch.nn as nn
import torch.nn.functional as F
import gym
import gym_chess
import random
import numpy as np
import chess
import chess.svg
from IPython.display import display, SVG
import os

In [43]:
# Actor Network Definition
class Actor(nn.Module):
    """Policy network mapping a flat state vector to action probabilities.

    The network is a stack of two 128-unit ReLU hidden layers followed by a
    linear output layer whose activations are normalised with softmax.

    Args:
        input_size: Number of features in each input state vector.
        output_size: Number of entries in the output probability vector.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        # Attribute names double as state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return probabilities over the last dimension (each row sums to 1)."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        logits = self.fc3(x)
        return F.softmax(logits, dim=-1)

In [44]:
# Critic Network Definition
class Critic(nn.Module):
    """Value network mapping a flat state vector to a single scalar estimate.

    Two 128-unit ReLU hidden layers feed a one-unit linear output layer; no
    activation is applied to the output.

    Args:
        input_size: Number of features in each input state vector.
    """

    def __init__(self, input_size):
        super().__init__()
        width = 128
        # Attribute names double as state_dict keys, so they stay fc1/fc2/fc3.
        self.fc1 = nn.Linear(input_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)

    def forward(self, x):
        """Return the unbounded value estimate; the last dimension has size 1."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        value = self.fc3(second_hidden)
        return value

In [45]:

def board_to_tensor(board):
    """Encode a board as a flat 768-element float tensor of binary piece planes.

    There is one 64-square plane per piece symbol, in the order
    ``p n b r q k P N B R Q K``. The entry for ``(plane, square)`` lives at
    index ``plane * 64 + square`` and is 1 when that piece occupies the square.

    Args:
        board: Object exposing ``piece_at(square)`` for squares 0..63. The
            call must return a falsy value for an empty square, or an object
            whose ``symbol()`` is one of the twelve symbols above.

    Returns:
        A 1-D ``torch.FloatTensor`` of length 768.

    Raises:
        ValueError: If a piece reports a symbol outside the twelve known ones.
    """
    plane_order = ('p', 'n', 'b', 'r', 'q', 'k', 'P', 'N', 'B', 'R', 'Q', 'K')
    planes = np.zeros((len(plane_order), 64))

    for square in range(64):
        occupant = board.piece_at(square)
        if not occupant:
            continue
        planes[plane_order.index(occupant.symbol()), square] = 1

    # Row-major flattening puts (plane, square) at index plane * 64 + square.
    return torch.FloatTensor(planes.ravel())


In [46]:
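# Training Function Definition
#
# Trains a chess AI agent using the OpenAI Gym chess environment.
#
# train_chess_ai sets up the chess environment, runs the requested number of
# training episodes, prints the total reward for each episode, and
# periodically saves the networks and the training history.
# Note: the network update step inside the function is still a placeholder.
#
# Args:
#     num_episodes (int): The number of training episodes to run.
#     save_path (str): Path prefix for the saved model and history files.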

def train_chess_ai(num_episodes=100, save_path='chess_model', save_every=100, render=True):
    """Play episodes in the ``Chess-v0`` gym environment with an Actor-Critic pair.

    For every move the current state is encoded with ``board_to_tensor``, the
    actor's output is restricted to the environment's legal moves, and one
    move is sampled from the renormalised probabilities. Each transition is
    recorded in an in-memory history that is written to disk together with
    both networks' weights.

    NOTE: ``update_networks`` below is still an empty placeholder, so the
    network weights are never modified by this function.

    Args:
        num_episodes (int): Number of episodes to play.
        save_path (str): Path prefix for the output files. Three files are
            written: ``{save_path}_actor.pth``, ``{save_path}_critic.pth``
            and ``{save_path}_history.npy``. A directory component, if
            present, is created when missing.
        save_every (int): Write a checkpoint every this many episodes. A
            falsy value disables periodic checkpoints; a final checkpoint is
            still written after the last episode.
        render (bool): When true, print ``env.render()`` before every move.

    Raises:
        ValueError: If the environment reports no legal moves while the
            episode is not finished.
    """
    # os.path.dirname() is '' for a bare prefix such as the default
    # 'chess_model', and os.makedirs('') raises FileNotFoundError.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Initialize the chess environment
    env = gym.make('Chess-v0')

    # 768 input features match the length of board_to_tensor's output.
    actor_net = Actor(input_size=768, output_size=4672)
    critic_net = Critic(input_size=768)

    actor_optimizer = torch.optim.Adam(actor_net.parameters())
    critic_optimizer = torch.optim.Adam(critic_net.parameters())

    training_history = []

    def select_legal_action(action_probs, legal_moves):
        """Sample one legal move using the actor's probabilities for those moves."""
        legal_moves_list = list(legal_moves)
        if not legal_moves_list:
            raise ValueError("No legal moves available to select from")

        probs = action_probs.detach().numpy()[0]

        # A move is addressed by from_square * 64 + to_square, so only the
        # first 4096 actor outputs are ever read. Moves sharing both squares
        # (e.g. different promotion pieces) share one output entry.
        move_indices = [move.from_square * 64 + move.to_square
                        for move in legal_moves_list]

        # float64 keeps the renormalised vector within np.random.choice's
        # sum-to-one tolerance.
        legal_probs = probs[move_indices].astype(np.float64)
        total = legal_probs.sum()
        if np.isfinite(total) and total > 0:
            legal_probs = legal_probs / total
        else:
            # Underflowed or non-finite mass: fall back to a uniform choice
            # instead of handing NaNs to np.random.choice.
            legal_probs = np.full(len(legal_moves_list), 1.0 / len(legal_moves_list))

        selected_idx = np.random.choice(len(legal_moves_list), p=legal_probs)
        return legal_moves_list[selected_idx]

    def update_networks(actor_net, critic_net, actor_optimizer, critic_optimizer, state, action, reward, done):
        """Placeholder for the actor/critic update step."""
        # TODO: implement the network update logic; until then no learning occurs.
        pass

    def save_checkpoint():
        """Write both networks' weights and the accumulated history to disk."""
        torch.save(actor_net.state_dict(), f'{save_path}_actor.pth')
        torch.save(critic_net.state_dict(), f'{save_path}_critic.pth')
        # The history holds Python objects, so NumPy pickles it; reading it
        # back requires np.load(..., allow_pickle=True).
        np.save(f'{save_path}_history.npy', training_history)

    last_saved_episode = 0

    try:
        for episode in range(num_episodes):
            state = env.reset()
            done = False
            total_reward = 0
            episode_data = []

            while not done:
                if render:
                    # Display current board state
                    print(env.render())

                # Add a batch dimension: (768,) -> (1, 768)
                state_tensor = board_to_tensor(state).unsqueeze(0)

                action_probs = actor_net(state_tensor)
                state_value = critic_net(state_tensor)

                # Sample only among the moves the environment reports as legal
                action = select_legal_action(action_probs, env.legal_moves)

                next_state, reward, done, info = env.step(action)

                episode_data.append({
                    'state': state,
                    'action': action,
                    'reward': reward,
                    'value': state_value.item()
                })

                update_networks(actor_net, critic_net,
                                actor_optimizer, critic_optimizer,
                                state_tensor, action, reward, done)

                total_reward += reward
                state = next_state

            training_history.append({
                'episode': episode + 1,
                'total_reward': total_reward,
                'moves': episode_data
            })

            print(f"Episode {episode + 1}, Total Reward: {total_reward}")

            # Save models and history periodically
            if save_every and (episode + 1) % save_every == 0:
                save_checkpoint()
                last_saved_episode = episode + 1

        # Without this, a run whose length is not a multiple of save_every
        # would finish without writing anything to disk.
        if num_episodes > last_saved_episode:
            save_checkpoint()
    finally:
        env.close()


In [47]:
# Model Initialization and Testing
# Build standalone Actor/Critic instances for the smoke test in the next cell.
# These are separate objects from the networks train_chess_ai creates internally.
input_size = 768  # Length of the flat board vector (12 piece planes x 64 squares, as produced by board_to_tensor)
output_size = 4672  # Width of the actor's output layer; 4672 = 64 * 73 -- presumably an AlphaZero-style move encoding, TODO confirm
actor_net = Actor(input_size=input_size, output_size=output_size)
critic_net = Critic(input_size=input_size)

In [48]:
# Test with Dummy Data
# Smoke test: push a single random input of shape (1, input_size) through both
# networks. The input is uniform noise, not a real board encoding, so only the
# output shapes and the absence of errors are meaningful here.
dummy_state = torch.rand(1, input_size)
action_probs = actor_net(dummy_state)  # softmax output, one probability per actor output unit
print(f"Action probabilities: {action_probs}")
state_value = critic_net(dummy_state)  # value estimate of shape (1, 1)
print(f"State value: {state_value}")

Action probabilities: tensor([[0.0002, 0.0002, 0.0002,  ..., 0.0002, 0.0002, 0.0002]],
       grad_fn=<SoftmaxBackward0>)
State value: tensor([[0.0171]], grad_fn=<AddmmBackward0>)


In [49]:
# Run Training
# Uses the default num_episodes (100) and writes outputs under the 'models/'
# directory with the 'chess_ai' file prefix. The board is printed before every
# move, so expect very long cell output.
train_chess_ai(save_path='models/chess_ai')

♜ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♟ ♟ ♟ ♟ ♟ ♟ ♟ ♟
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
♙ ♙ ♙ ♙ ♙ ♙ ♙ ♙
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
♜ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♟ ♟ ♟ ♟ ♟ ♟ ♟ ♟
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ♙ ♙ ⭘ ♙ ♙ ♙
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
♜ ♞ ♝ ♛ ♚ ♝ ♞ ♜
⭘ ♟ ♟ ♟ ♟ ♟ ♟ ♟
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ♙ ♙ ⭘ ♙ ♙ ♙
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
♜ ♞ ♝ ♛ ♚ ♝ ♞ ♜
⭘ ♟ ♟ ♟ ♟ ♟ ♟ ♟
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ♙
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ♙ ♙ ⭘ ♙ ♙ ⭘
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
⭘ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♜ ♟ ♟ ♟ ♟ ♟ ♟ ♟
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ♙
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ♙ ♙ ⭘ ♙ ♙ ⭘
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
⭘ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♜ ♟ ♟ ♟ ♟ ♟ ♟ ♟
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ♙ ⭘ ⭘ ⭘ ⭘ ♙
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ⭘ ♙ ⭘ ♙ ♙ ⭘
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
⭘ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♜ ♟ ♟ ♟ ♟ ♟ ♟ ⭘
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ♟
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ♙ ⭘ ⭘ ⭘ ⭘ ♙
⭘ ⭘ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ♙ ⭘ ♙ ⭘ ♙ ♙ ⭘
♖ ♘ ♗ ♕ ♔ ♗ ♘ ♖
⭘ ♞ ♝ ♛ ♚ ♝ ♞ ♜
♜ ♟ ♟ ♟ ♟ ♟ ♟ ⭘
♟ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ♟
⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘ ⭘
⭘ ⭘ ♙ ⭘ ⭘ ⭘ ⭘ ♙
⭘ ♙ ⭘ ⭘ ♙ ⭘ ⭘ ⭘
♙ ⭘ ⭘ ♙ 