In [None]:
import gym
from gym_chess import ChessEnvV2
import torch
import random
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical

import matplotlib.pyplot as plt
from IPython import display
from tqdm.notebook import tqdm

In [None]:
# Gawron's code (CartPole training loop)

def render(env, img):
    """Redraw the current environment frame in the notebook output area.

    Args:
        env: environment whose ``render()`` result is the frame to show.
        img: existing image artist; its pixel data is replaced in place via
            ``set_data``.
    """
    frame = env.render()
    img.set_data(frame)
    figure = plt.gcf()
    display.display(figure)
    # wait=True postpones clearing until the next output is available.
    display.clear_output(wait=True)

def play_game_gawron(policy, episodes=2000, do_render=False, seed=100):
    """Run (and train) ``policy`` on CartPole-v1.

    The loop stops when the exponentially smoothed episode reward exceeds
    ``env.spec.reward_threshold``, when the episode budget is used up, or when
    the user interrupts the kernel. The environment is always closed on exit,
    including when the policy or the environment raises.

    Args:
        policy: object used as ``policy(observation) -> action`` that also
            provides ``init_game(observation)`` and
            ``update(observation, reward, terminated, truncated, info, pbar)``.
        episodes: total shown on the progress bar and episode budget.
        do_render: if true, create the env with ``render_mode="rgb_array"``
            and draw every step into the notebook.
        seed: seed for ``random``, ``torch`` and the first ``env.reset``.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    if do_render:
        env = gym.make("CartPole-v1", render_mode="rgb_array")
    else:
        env = gym.make("CartPole-v1")

    # try/finally guarantees env.close() even if the policy or env raises.
    try:
        observation, info = env.reset(seed=seed)
        policy.init_game(observation)

        if do_render:
            plt.ion()
            plt.axis('off')
            img = plt.imshow(env.render())

        status = {'steps': 0, 'episode_reward': 0, 'average_reward': 0}
        episode = 0

        with tqdm(total=episodes) as pbar:
            pbar.set_postfix(status)
            while True:
                try:
                    action = policy(observation)
                    observation, reward, terminated, truncated, info = env.step(action)
                    status['steps'] += 1
                    status['episode_reward'] += reward
                    if do_render:
                        render(env, img)
                    policy.update(observation, reward, terminated, truncated, info, pbar)

                    # NOTE(review): `truncated` is forwarded to the policy but does
                    # not end the episode here; only termination or the hard cap of
                    # 1000 steps does. Confirm this is intended.
                    if terminated or status['steps'] > 1000:
                        episode += 1
                        # NOTE(review): this check runs after the episode was
                        # already played, so one episode more than `episodes`
                        # is executed (the last one is not counted below).
                        if episode > pbar.total:
                            break
                        # Exponential moving average of the episode reward.
                        status['average_reward'] = 0.05 * status['episode_reward'] + (1 - 0.05) * status['average_reward']
                        if status['average_reward'] > env.spec.reward_threshold:
                            print(f"Solved! Running reward is now {status['average_reward']} and "
                                  f"the last episode runs to {status['steps']} time steps!")
                            break

                        # Only force a redraw of the bar every 10th episode.
                        pbar.set_postfix(status, refresh=episode % 10 == 0)
                        pbar.update()

                        # Start the next episode.
                        status['steps'] = 0
                        status['episode_reward'] = 0
                        observation, info = env.reset()
                        policy.init_game(observation)

                except KeyboardInterrupt:
                    # Allow stopping a long run from the notebook without a traceback.
                    break
    finally:
        env.close()

In [None]:
import numpy as np
import torch

class ActorCriticBoardNetwork(nn.Module):
    """Actor-critic network with a shared trunk over a one-hot encoded board.

    The board is encoded as 8 x 8 x 12 indicator planes (768 inputs), passed
    through two fully connected layers, and fed to two heads:

    * ``actor``  - one logit per move slot; the first ``num_moves`` logits are
      turned into a probability distribution.
    * ``critic`` - a single state-value estimate.

    The actor head is created once in ``__init__`` so that it is part of
    ``parameters()`` / ``state_dict()`` and can actually be trained. Output
    slot ``i`` refers to the ``i``-th entry of whatever move list the caller
    uses for the current position.

    Args:
        max_moves: number of move slots of the actor head. The default of 256
            is assumed to be at least as large as any move list the
            environment produces - TODO confirm for gym_chess.
    """

    def __init__(self, max_moves=256):
        super().__init__()
        self.obs_size = 768  # 8 * 8 squares * 12 piece planes
        self.max_moves = max_moves

        # Shared trunk.
        self.fc1 = nn.Linear(self.obs_size, 64)
        self.fc2 = nn.Linear(64, 32)
        # Heads. Both are registered here so optimizers built from
        # ``parameters()`` right after construction include them.
        self.actor = nn.Linear(32, self.max_moves)
        self.critic = nn.Linear(32, 1)

    @staticmethod
    def _encode_board(board):
        """Return the flattened 8 x 8 x 12 one-hot encoding of ``board``.

        Positive piece ids ``p`` set plane ``p - 1``, negative ids set plane
        ``6 + |p| - 1``; empty squares (0) stay all-zero.
        """
        board = np.asarray(board)
        planes = np.zeros((8, 8, 12), dtype=np.float32)
        rows, cols = np.nonzero(board)
        pieces = board[rows, cols]
        channels = (np.abs(pieces) - 1 + np.where(pieces < 0, 6, 0)).astype(np.intp)
        planes[rows, cols, channels] = 1.0
        return planes.reshape(-1)

    def forward(self, observation, num_moves):
        """Compute move probabilities and the state value for one position.

        Args:
            observation: mapping whose ``'board'`` entry is the 8 x 8 grid of
                signed piece ids.
            num_moves: number of currently available moves.

        Returns:
            ``(action_probs, state_value)`` where ``action_probs`` has shape
            ``(num_moves,)`` and sums to 1, and ``state_value`` has shape
            ``(1,)``.

        Raises:
            ValueError: if ``num_moves`` is not in ``1..max_moves``.
        """
        if not 1 <= num_moves <= self.max_moves:
            raise ValueError(
                f"num_moves must be between 1 and {self.max_moves}, got {num_moves}; "
                f"construct the network with a larger max_moves if needed"
            )

        obs_tensor = torch.as_tensor(self._encode_board(observation['board']), dtype=torch.float32)

        x = F.relu(self.fc1(obs_tensor))
        x = F.relu(self.fc2(x))

        # Slice the logits *before* the softmax so the probabilities of the
        # available moves sum to one.
        move_logits = self.actor(x)[:num_moves]
        action_probs = F.softmax(move_logits, dim=-1)
        state_value = self.critic(x)

        return action_probs, state_value

In [None]:
from collections import namedtuple
# One recorded decision: log-probability of the sampled move and the critic's
# value estimate for the position it was sampled in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])


class ACPolicy:
    """Actor-critic policy that learns from complete episodes.

    Usage per episode: call ``init_game`` once, then alternate
    ``policy(observation, possible_moves)`` and ``update(...)``. When
    ``update`` receives ``terminated=True`` the recorded episode is used for
    one optimisation step.
    """

    def __init__(self):
        self.net = ActorCriticBoardNetwork()
        # NOTE(review): the optimizer only tracks parameters that exist on
        # ``self.net`` at this point.
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=5e-3)
        self.mean_reward = None   # exponential moving average of episode rewards
        self.games = 0            # number of terminated episodes seen
        self.gamma = 0.99         # discount factor
        # float32 machine epsilon; not used by update() at the moment.
        self.eps = np.finfo(np.float32).eps.item()
        # Create the episode buffers up front so that using the policy before
        # init_game() does not fail with an AttributeError.
        self._reset_episode()

    def _reset_episode(self):
        """Clear the per-episode buffers."""
        self.memory = []
        self.rewards = []
        self.total_reward = 0

    def __call__(self, observation, possible_moves):
        """Sample one of ``possible_moves`` for ``observation`` and record it.

        Returns:
            The selected element of ``possible_moves``.
        """
        self.ACTIONS = possible_moves

        probs, value = self.net(observation, len(possible_moves))
        m = Categorical(probs)
        action = m.sample()
        self.memory.append(SavedAction(m.log_prob(action), value))
        self.last_observation = observation

        return self.ACTIONS[action.item()]

    def init_game(self, observation, possible_moves):
        """Start a new episode; both arguments are currently unused."""
        self._reset_episode()

    def _discounted_returns(self):
        """Return the discounted return of every step as a float32 tensor."""
        returns = []
        running = 0.0
        # Accumulate back-to-front, then reverse once (linear instead of
        # repeatedly inserting at the front of the list).
        for reward in reversed(self.rewards):
            running = reward + self.gamma * running
            returns.append(running)
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

    def update(self, observation, reward, terminated, truncated, info, status):
        """Record ``reward`` and, if the episode terminated, train on it.

        Only ``reward`` and ``terminated`` are used; the remaining arguments
        are accepted for interface compatibility and ignored.
        """
        self.total_reward += reward
        self.rewards.append(reward)
        if not terminated:
            return

        self.games += 1
        if self.mean_reward is None:
            self.mean_reward = self.total_reward
        else:
            self.mean_reward = self.mean_reward * 0.95 + self.total_reward * (1.0 - 0.95)

        # Without any recorded action there is nothing to optimise
        # (torch.stack would fail on an empty list).
        if self.memory:
            discounted = self._discounted_returns()

            policy_losses = []
            value_losses = []
            # zip() pairs each recorded action with the return of the same step.
            for mem, discounted_reward in zip(self.memory, discounted):
                # .item() detaches the value: the advantage is a constant
                # weight for the policy gradient.
                advantage = discounted_reward - mem.value.item()
                policy_losses.append(-(mem.log_prob * advantage))
                value_losses.append(F.smooth_l1_loss(mem.value, discounted_reward.unsqueeze(0)))

            self.optimizer.zero_grad()
            loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
            loss.backward()
            self.optimizer.step()

        # Periodic checkpoint into the current working directory.
        if self.games % 1000 == 0:
            self.save(f"model_{self.games}.pt")

    def load(self, PATH):
        """Restore network, optimizer and counters from a checkpoint file."""
        # NOTE: torch.load is pickle-based; only load checkpoints from a
        # trusted source.
        checkpoint = torch.load(PATH)
        self.net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.games = checkpoint['games']
        self.mean_reward = checkpoint['mean_reward']

    def save(self, PATH):
        """Write network, optimizer and counters to ``PATH``."""
        torch.save({
            'games': self.games,
            'model_state_dict': self.net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'mean_reward': self.mean_reward}, PATH)

In [None]:
from IPython.core.display_functions import clear_output


# Play-game example, partially merged with Gawron's code
def play_game_AI_against_random(episodes=2, steps=50):
    """Train a fresh ``ACPolicy`` against the random chess bot and print a summary.

    Args:
        episodes: number of games to play.
        steps: maximum number of policy moves per game; also written to
            ``env.moves_max``.

    The environment is closed when the function leaves, also on errors or a
    keyboard interrupt. Nothing is returned; results are printed.
    """
    env = gym.make("ChessVsRandomBot-v2", log=False)
    # try/finally so the environment is released even if a game raises.
    try:
        # NOTE(review): if gym.make returns a wrapper, this sets the attribute
        # on the wrapper only - confirm the env actually picks it up.
        env.moves_max = steps
        total_rewards = 0
        average_rewards = 0
        observation = env.reset()
        policy = ACPolicy()
        policy.init_game(observation, env.possible_moves)

        for i in range(episodes):
            print("\n", "=" * 10, "NEW GAME", "=" * 10)
            episode_reward = 0

            for _ in range(steps):
                move = policy(observation, env.possible_moves)
                action = env.move_to_action(move)

                # Hand our own action to the game.
                observation, step_reward, done, info = env.step(action)
                episode_reward += step_reward
                # `done` is passed as `terminated`; the policy trains only
                # when it is true.
                policy.update(observation, step_reward, done, False, None, None)

                if done:
                    env.render()
                    print(">" * 5, "GAME", i, "REWARD:", episode_reward)
                    break

            # End of episode: reset the env and the policy's episode buffers.
            observation = env.reset()
            policy.init_game(observation, env.possible_moves)

            total_rewards += episode_reward
            # Exponential moving average of the episode reward.
            average_rewards = 0.05 * episode_reward + (1 - 0.05) * average_rewards
            # No reward threshold is checked here, so there is no early
            # "solved" exit.
    finally:
        env.close()

    print("\n")
    print("#" * 40)
    print("#" * 40)
    print("#" * 40)
    print("\nAVERAGE SCORE: ", average_rewards)
    print("\nTOTAL REWARD: ", total_rewards)

In [None]:
# Train against the random bot: 50 games with at most 200 policy moves each.
play_game_AI_against_random(episodes=50, steps=200)

In [None]:
# Smoke test: build an environment and an untrained policy, then sample moves.
# NOTE(review): the environment created here is never closed in this cell.
env = gym.make("ChessVsRandomBot-v2")
observation = env.reset()
policy = ACPolicy()
policy.init_game(observation, env.possible_moves)
# env.step() is never called, so all 100 samples are drawn for the same
# observation; each call appends one entry to policy.memory and the selected
# move is overwritten on every iteration.
for i in range(100):
    move = policy(observation, env.possible_moves)


In [None]:
#policy = ACPolicy()
#play_game_gawron(policy, episodes=10, do_render=True)