In [1]:
import gym
from gym_chess import ChessEnvV2
import torch
import random
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical

import matplotlib.pyplot as plt
from IPython import display
from tqdm.notebook import tqdm

In [116]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [267]:
class ChessPolicyNet(nn.Module):
    def __init__(self):
        super(ChessPolicyNet, self).__init__()
        self.conv1 = nn.Conv2d(12, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 8 * 8, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 64)
        self.fc4_start = nn.Linear(64, 64)
        self.fc4_end = nn.Linear(64, 64*64)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x_start = self.fc4_start(x)
        x_end = self.fc4_end(x).view(1, 64, 64)

        print(x_start)
        print(x_end)
        return x_start, x_end

In [268]:
from collections import namedtuple
SavedAction = namedtuple('SavedAction', ['start_prob', 'end_prob'])

class ChessAI:
    def __init__(self):
        self.net = ChessPolicyNet().to(device)
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=5e-3)
        self.mean_reward = None
        self.games = 0
        self.gamma = 0.99
        self.eps = np.finfo(np.float32).eps.item()

    def __call__(self, observation):
        board = np.array(observation['board'])
        one_hot_board = np.zeros((8, 8, 12))
        for i in range(8):
            for j in range(8):
                piece = board[i][j]
                if piece != 0:
                    if piece > 0:
                        one_hot_board[i][j][abs(piece) - 1] = 1
                    else:
                        one_hot_board[i][j][abs(piece) + 5] = 1
        one_hot_board = np.transpose(one_hot_board, (2, 0, 1))
        x = torch.from_numpy(one_hot_board).float().unsqueeze(0)
        x = x.to(device)
        x_start, x_end = self.net(x)

        start_probs = F.softmax(x_start, dim=1).squeeze()
        end_probs = F.softmax(x_end, dim=1).squeeze()


        m_start = Categorical(start_probs)
        start_tensor = m_start.sample(sample_shape=torch.Size([]))
        start_item = start_tensor.item()
        start_pos = (start_item // 8, start_item % 8)

        m_end = Categorical(end_probs[start_item])
        end_tensor = m_end.sample(sample_shape=torch.Size([]))
        end_item = end_tensor.item()
        end_pos = (end_item // 8, end_item % 8)

        self.memory.append(SavedAction(m_start.log_prob(start_tensor), m_end.log_prob(end_tensor)))
        return start_pos, end_pos

    def init_game(self, observation, possible_moves):
        self.memory = []
        self.rewards = []
        self.total_reward = 0

    def update(self, observation, reward, terminated, truncated, info, status):
        self.total_reward += reward
        self.rewards.append(reward)
        if terminated:
            self.games += 1
            if self.mean_reward is None:
                self.mean_reward = self.total_reward
            else:
                self.mean_reward = self.mean_reward * 0.95 + self.total_reward * (1.0 - 0.95)

            self.optimizer.zero_grad()
            # calculate discounted reward and make it normal distributed
            discounted = []
            R = 0
            for r in self.rewards[::-1]:
                R = r + self.gamma * R
                discounted.insert(0, R)
            discounted = torch.Tensor(discounted)
            discounted = (discounted - discounted.mean()) / (discounted.std() + self.eps)

            policy_losses = []
            for mem, discounted_reward in zip(self.memory, discounted):
                start_prob, end_prob = mem
                policy_losses.append(-((start_prob + end_prob) * discounted_reward))

            loss = torch.stack(policy_losses).sum()
            loss.backward()
            self.optimizer.step()

            if self.games % 2000 == 0:
                self.save(f"models/model_{self.games}.pt")

    def load(self, PATH):
        checkpoint = torch.load(PATH)
        self.net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.games = checkpoint['games']
        self.mean_reward = checkpoint['mean_reward']

    def save(self, PATH):
        torch.save({
            'games': self.games,
            'model_state_dict': self.net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'mean_reward': self.mean_reward}, PATH)

In [269]:
from IPython.core.display_functions import clear_output


# Play Game Example etwas mit Gawron's Code gemerged
def play_game_AI_against_random(episodes=2, steps=50, modelpath=None):
    env = gym.make("ChessVsRandomBot-v2", log=False)
    env.moves_max = steps
    total_rewards = 0
    average_rewards = 0
    steps_needed = 0
    did_legal_move = 0
    done = False
    observation = env.reset()
    #policy = ACPolicy()
    policy = ChessAI()
    if modelpath is not None:
        policy.load(modelpath)
    policy.init_game(observation, env.possible_moves)

    for i in range(episodes):
        #print("\n", "=" * 10, "NEW GAME", "=" * 10)
        #env.render()
        episode_reward = 0

        for j in range(steps):

            #moves = env.possible_moves
            #move = random.choice(moves)
            move = policy(observation)
            print(move)
            print("\n\n\n")
            for move in env.possible_moves:
                print(move)
            action = env.move_to_action(move)
            if move in env.possible_moves:
                did_legal_move += 1
            #Clear prints
            #for i in range(20):
            #   clear_output(wait=True)

            # Eigene Aktion an das Spiel weitergeben
            observation, step_reward, done, info = env.step(action)
            episode_reward += step_reward
            policy.update(observation, step_reward, done, False, None, None)

            if done:
                env.render()
                print(">" * 5, "GAME", i, "REWARD:", episode_reward)
                #steps_needed = j
                break

        if not done:
            #print("Not done, updating policy")
            policy.update(observation, 0, True, False, None, None)
        else:
            done = False


        # Episode zu Ende
        observation = env.reset()
        policy.init_game(observation, env.possible_moves)

        total_rewards += episode_reward
        average_rewards = 0.05 * episode_reward + (1 - 0.05) * average_rewards

    #policy.save(f"models/model_{policy.games}.pt")
    print("\n")
    print("#" * 40)
    print("#" * 40)
    print("#" * 40)
    print("\nAVERAGE SCORE: ", average_rewards)
    print("\nTOTAL REWARD: ", total_rewards)
    print("\nTOTAL LEGAL MOVES: ", did_legal_move)
    print("\nLEGAL MOVES AVERAGE: ", did_legal_move / episodes)

In [270]:
device = "cuda"

In [271]:
play_game_AI_against_random(1, 5, "models/model_1000.pt")

tensor([[-15.4474, -25.6004,  16.7986,   0.9408,  -9.4818,   1.4616,  -0.6489,
           8.2739,   4.2529, -10.9186, -22.5281,  -8.9942,   8.2626,  -0.9930,
          -5.1427,   0.6675, -15.3259,  13.1084,  -1.9304,   5.6925, -16.6272,
           2.5188, -18.1875, -25.2354,   1.9778, -13.8683,  10.4875,  -9.1018,
           9.5307, -14.7131,  -8.7599,  16.6326,  -0.6079,  -9.0558, -14.4124,
           6.6649,   2.2202,  -2.0030,  -1.7415, -15.6497,  -4.8630,  12.6843,
         -19.8256, -14.6819, -17.8932,  23.2788,  11.5280,  11.0472,  -8.3830,
          -6.5264, -11.6927, -10.7742,  11.2424,  16.5358,   3.6741,  -4.6306,
           1.3768,   1.5987,  63.1055,   6.0451,  13.0604, -24.9358,  -2.8851,
           0.9793]], device='cuda:0', grad_fn=<AddmmBackward0>)
tensor([[[ -2.0576,   8.7340,  33.7056,  ...,  -2.0465,  -6.9271,   7.6680],
         [  8.7443,   4.6823,  25.1787,  ...,  -3.4114, -12.9048,  -4.7832],
         [-27.0721,   2.4943,  -1.2878,  ...,  -9.5713,  13.3593,  -2.6

  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn("Casting input x to numpy array.")
  logger.warn(f"{pre} is not within the observation space.")
