In [7]:
import gym
from gym_chess import ChessEnvV2
import torch
import random
import numpy as np
from torch import nn
import torch.nn.functional as F
from torch.distributions import Categorical

from gym import wrappers
import matplotlib.pyplot as plt
from IPython import display
from tqdm.notebook import tqdm

In [115]:
# Example game loop, partly merged with Gawron's code
def play_game_AI_against_random(episodes=2, steps=50, policy=None):
    """Let a policy play chess games against the environment's random bot.

    For every episode the "ChessVsRandomBot-v2" environment is reset, the
    policy is told that a new game starts, and then up to ``steps`` moves are
    played. After all episodes an exponentially weighted running average of
    the episode rewards (weight 0.05 for the newest episode) is printed.

    Args:
        episodes: Number of games to play.
        steps: Maximum number of own moves per game.
        policy: Object providing ``init_game(observation)`` and
            ``__call__(observation, moves) -> move``. When omitted, the
            notebook-level global named ``policy`` is used.

    Raises:
        NameError: If no ``policy`` argument is given and no global ``policy``
            exists.

    Note:
        The loop uses the classic gym API as the original code did
        (``reset()`` returns the observation, ``step()`` returns a 4-tuple);
        confirm this against the installed gym / gym_chess versions.
        The policy's ``update`` method is not called here, so no learning
        takes place in this loop.
    """
    if policy is None:
        # Keep the original behaviour of reading the notebook-level policy.
        try:
            policy = globals()["policy"]
        except KeyError:
            raise NameError(
                "no policy given and no global 'policy' is defined"
            ) from None

    env = gym.make("ChessVsRandomBot-v2")
    average_rewards = 0
    try:
        for i in range(episodes):
            # The first observation must be bound to `observation`, because
            # the policy is queried before the first env.step().
            observation = env.reset()
            policy.init_game(observation)
            print("\n", "=" * 10, "NEW GAME", "=" * 10)
            episode_reward = 0

            for _ in range(steps):
                move = policy(observation, env.possible_moves)
                action = env.move_to_action(move)

                # Hand our own action to the game.
                observation, step_reward, done, _info = env.step(action)
                episode_reward += step_reward

                if done:
                    print(">" * 5, "GAME", i, "REWARD:", episode_reward)
                    break

            # End of episode: update the running average.
            # (The environment declares no reward threshold to compare with.)
            average_rewards = 0.05 * episode_reward + (1 - 0.05) * average_rewards
    finally:
        # Release the environment even if a game raised.
        env.close()

    print("\n")
    print("#" * 40)
    print("#" * 40)
    print("#" * 40)
    print("\nAVERAGE SCORE: ", average_rewards)

In [None]:
# Play five games of at most 200 own moves each against the random bot.
play_game_AI_against_random(episodes=5, steps=200)

In [119]:
# Gawron's code

def render(env, img):
    """Draw the environment's current frame into an existing image artist.

    Args:
        env: Environment whose ``render()`` result is the frame to show.
        img: Image object exposing ``set_data``; it receives the new frame.
    """
    frame = env.render()
    img.set_data(frame)
    figure = plt.gcf()
    display.display(figure)
    # wait=True postpones clearing until the next output arrives.
    display.clear_output(wait=True)

def play_game_gawron(policy, episodes=2000, do_render=False, seed=100, max_steps=1000):
    """Run a policy on CartPole-v1 and let it learn after every step.

    The loop plays at most ``episodes`` episodes. After each environment step
    ``policy.update`` is called; at the end of every episode a running
    average of the episode rewards (weight 0.05 for the newest episode) is
    updated and shown in the progress bar. The loop stops early once that
    average exceeds the environment's reward threshold, or on Ctrl-C.

    Args:
        policy: Object providing ``init_game(observation)``,
            ``__call__(observation, moves) -> action`` and
            ``update(observation, reward, terminated, truncated, info, pbar)``.
            ``moves`` is the list of legal action indices of the environment.
        episodes: Number of episodes to play.
        do_render: If true, render every step into a matplotlib image.
        seed: Seed for ``random``, ``torch`` and the first environment reset.
        max_steps: An episode is cut off once it has run for more than this
            many steps, even if the environment did not end it.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    if do_render:
        env = gym.make("CartPole-v1", render_mode="rgb_array")
    else:
        env = gym.make("CartPole-v1")

    try:
        # Legal actions of the discrete action space, handed to the policy in
        # the same (observation, moves) form that ACPolicy.__call__ expects.
        moves = list(range(env.action_space.n))
        # Not every environment declares a threshold; None disables the check.
        reward_threshold = getattr(env.spec, "reward_threshold", None)

        observation, info = env.reset(seed=seed)
        policy.init_game(observation)

        img = None
        if do_render:
            plt.ion()
            plt.axis('off')
            img = plt.imshow(env.render())

        status = {'steps': 0, 'episode_reward': 0, 'average_reward': 0}
        episode = 0

        with tqdm(total=episodes) as pbar:
            pbar.set_postfix(status)
            try:
                while episode < episodes:
                    action = policy(observation, moves)
                    observation, reward, terminated, truncated, info = env.step(action)
                    status['steps'] += 1
                    status['episode_reward'] += reward
                    if do_render:
                        render(env, img)
                    policy.update(observation, reward, terminated, truncated, info, pbar)

                    # The environment must not be stepped again after it
                    # reported termination or truncation.
                    episode_over = terminated or truncated or status['steps'] > max_steps
                    if not episode_over:
                        continue

                    episode += 1
                    status['average_reward'] = (0.05 * status['episode_reward']
                                                + (1 - 0.05) * status['average_reward'])
                    pbar.set_postfix(status, refresh=episode % 10 == 0)
                    pbar.update()

                    if reward_threshold is not None and status['average_reward'] > reward_threshold:
                        print(f"Solved! Running reward is now {status['average_reward']} and "
                              f"the last episode runs to {status['steps']} time steps!")
                        break

                    # Only start another game if one is still going to be played.
                    if episode < episodes:
                        status['steps'] = 0
                        status['episode_reward'] = 0
                        observation, info = env.reset()
                        policy.init_game(observation)
            except KeyboardInterrupt:
                # Manual interruption ends training without a traceback.
                pass
    finally:
        # Always release the environment, also when a step raised.
        env.close()

In [10]:
class ACNetwork(nn.Module):
    """Actor-critic network: one shared trunk feeding a policy and a value head.

    Args:
        hidden_size: Width of the first hidden layer; the shared trunk output
            is ``hidden_size * 4`` wide.
        max_possible_moves: Number of action probabilities the policy head
            emits.
        input_size: Number of input features. Defaults to
            ``max_possible_moves``, which is the input width the network used
            before this parameter existed.
    """

    def __init__(self, hidden_size=32, max_possible_moves=4, input_size=None):
        super().__init__()
        if input_size is None:
            input_size = max_possible_moves
        trunk_size = hidden_size * 4

        self.fc = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, trunk_size),
            nn.ReLU()
        )
        # One probability per move. A single output followed by softmax would
        # always be 1.0 and could never express a preference between moves.
        self.policy = nn.Sequential(
            nn.Linear(trunk_size, hidden_size),
            nn.Linear(hidden_size, max_possible_moves),
            nn.Softmax(dim=-1)
        )
        # The critic reads the trunk output, so its input width must match it.
        self.critic = nn.Sequential(
            nn.Linear(trunk_size, 1)
        )

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        The probabilities are normalised over the last dimension; the value
        has a trailing dimension of size 1.
        """
        x = self.fc(x)
        p = self.policy(x)
        v = self.critic(x)
        return p, v

In [11]:
from collections import namedtuple
# Per-step record kept for training: log-probability of the chosen action and
# the critic's value estimate for the state it was chosen in.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

class ACPolicy:
    """Actor-critic policy that learns from complete episodes.

    Usage protocol: call ``init_game`` at the start of every episode, call the
    policy object to pick an action, and call ``update`` after every
    environment step. Training happens inside ``update`` when an episode ends.

    Args:
        gamma: Discount factor for future rewards.
        lr: Learning rate of the Adam optimizer.
    """

    def __init__(self, gamma=0.99, lr=5e-3):
        self.net = ACNetwork()
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=lr)
        self.mean_reward = None
        self.games = 0
        self.gamma = gamma
        self.eps = np.finfo(np.float32).eps.item()
        # Make sure the episode buffers exist even if the caller queries the
        # policy before the first init_game().
        self.init_game(None)

    def __call__(self, observation, moves=None):
        """Sample an action for ``observation``.

        Args:
            observation: Numeric observation convertible to a float tensor.
            moves: Optional sequence of currently legal moves. When given,
                sampling is restricted to the first ``len(moves)`` action
                probabilities and the chosen element of ``moves`` is returned.
                When omitted, the sampled action index itself is returned.

        Returns:
            The selected move, or the action index if ``moves`` is ``None``.

        Raises:
            ValueError: If ``moves`` is given but empty.
        """
        self.ACTIONS = moves

        # The network weights are float32, so the input must be as well.
        state = torch.as_tensor(observation, dtype=torch.float32)
        probs, value = self.net(state)
        if moves is not None:
            if len(moves) == 0:
                raise ValueError("moves must contain at least one legal move")
            # Never sample an index that has no corresponding move;
            # Categorical renormalises the remaining probabilities.
            probs = probs[..., :len(moves)]
        m = Categorical(probs)
        action = m.sample()

        self.memory.append(SavedAction(m.log_prob(action), value))
        self.last_observation = observation

        index = action.item()
        return index if moves is None else moves[index]

    def init_game(self, observation):
        """Reset the per-episode buffers; ``observation`` is not used."""
        self.memory = []
        self.rewards = []
        self.total_reward = 0

    def _discounted_returns(self):
        """Return the discounted return of every step of the current episode."""
        returns = []
        R = 0
        for r in reversed(self.rewards):
            R = r + self.gamma * R
            returns.append(R)
        returns.reverse()
        return torch.tensor(returns, dtype=torch.float32)

    def update(self, observation, reward, terminated, truncated, info, status):
        """Record one step's reward and train when the episode is over.

        An episode counts as over when it is ``terminated`` or ``truncated``.
        For truncated episodes the return after the cut-off is treated as 0.
        ``observation``, ``info`` and ``status`` are accepted for interface
        compatibility but not used.
        """
        self.total_reward += reward
        self.rewards.append(reward)
        if not (terminated or truncated):
            return

        self.games += 1
        if self.mean_reward is None:
            self.mean_reward = self.total_reward
        else:
            self.mean_reward = self.mean_reward * 0.95 + self.total_reward * (1.0 - 0.95)

        # Nothing to learn from if no action was recorded in this episode.
        if self.memory:
            discounted = self._discounted_returns()
            # Normalising the returns is intentionally left disabled:
            # discounted = (discounted - discounted.mean()) / (discounted.std() + self.eps)

            policy_losses = []
            value_losses = []
            for mem, discounted_reward in zip(self.memory, discounted):
                # .item() detaches the value: the actor loss must not
                # back-propagate into the critic head.
                advantage = discounted_reward - mem.value.item()
                policy_losses.append(-(mem.log_prob * advantage))
                value_losses.append(F.smooth_l1_loss(mem.value, discounted_reward.unsqueeze(0)))

            self.optimizer.zero_grad()
            loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
            loss.backward()
            self.optimizer.step()

        # Periodic checkpoint, written relative to the working directory.
        if self.games % 1000 == 0:
            self.save(f"model_{self.games}.pt")

    def load(self, PATH):
        """Restore network, optimizer and counters from a checkpoint file.

        NOTE(review): ``torch.load`` unpickles the file, which can execute
        arbitrary code; only load checkpoints from trusted sources.
        """
        checkpoint = torch.load(PATH)
        self.net.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.games = checkpoint['games']
        self.mean_reward = checkpoint['mean_reward']

    def save(self, PATH):
        """Write network, optimizer and counters to a checkpoint file."""
        torch.save({
            'games': self.games,
            'model_state_dict': self.net.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'mean_reward': self.mean_reward}, PATH)

In [12]:
# Create a fresh actor-critic policy (kept as the notebook-level `policy`)
# and run it through Gawron's loop for ten rendered episodes.
policy = ACPolicy()
play_game_gawron(policy=policy, episodes=10, do_render=True)

  logger.warn(
  logger.warn(
  logger.warn(


TypeError: ChessEnvV2.reset() got an unexpected keyword argument 'seed'