In [1]:
import torch
from torch import tensor
from torch import nn

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
from torch import tensor

In [7]:
from kaggle_environments import evaluate, make, utils
from gym import spaces
class ConnectX:
    DRAW = 0.5
    WIN = 1.0
    LOSE = -1.0
    ERROR = -10.0 
    
    def __init__(self, pair=[None, "random"], config = {"rows": 6, "columns": 7, "inarow": 4}):
        self.ks_env = make("connectx", config, debug=True)
        self.pair = pair
        self.env = self.ks_env.train(pair)
        self.config = config
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(config["columns"])
        self.observation_space = spaces.Box(low=0, high=2, 
                                            shape=(config["rows"],config["columns"],1), dtype=np.integer)

        
    def reset(self):
        self.obs = self.env.reset()
        self.obs = self.get_board(self.obs, self.config)
        return self.obs.reshape((-1, 42))
    
    def switch_pair(self):
        self.pair = self.pair[::-1]
        self.env = self.ks_env.train(self.pair)
        
    def change_pair(self, pair):
        self.pair = pair
        self.env = self.ks_env.train(self.pair)
        
    def change_reward(self, reward, done):
        
        if done:
            if reward is None: #Error 
                reward = ConnectX.ERROR
            elif reward == 1:
                reward = ConnectX.WIN
            elif reward == -1:
                reward = ConnectX.LOSE
            elif reward == 0:
                reward = ConnectX.DRAW
        else:
            reward = -1/(self.config['rows'] * self.config['columns'])
            
        return reward
    
    def get_board(self, observation, configuration):
        rows = configuration['rows']
        columns = configuration['columns']

        board = np.array(observation['board']).reshape((rows,columns,1))
        new_board = np.zeros_like(board)

        mark = observation["mark"]
        new_board[board == mark] = 1
        new_board[(board != mark) & (board != 0)] = 2
        return new_board / 2 #normalization
    
    def step(self, action):
        if not np.any(self.obs[:, action] == 0):
            reward, done, _ = ConnectX.ERROR, True, {}
        else:
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
            self.obs = self.get_board(self.obs, self.config)
        
        return self.obs.reshape((-1, 42)), reward, done, _


In [8]:
class Actor:
    def __init__(self, state_dim=42, action_dim=7, clip_value=0.1, hidden_size=256) -> None:
        self.state_dim = state_dim
        self.model = nn.Sequential( nn.Linear(state_dim, hidden_size),
                                    nn.ReLU(),
                                    nn.Linear(hidden_size, hidden_size),
                                    nn.ReLU(),
                                    nn.Linear(hidden_size, action_dim),
                                    nn.Softmax()
                                    )
        self.clip_value = clip_value
        self.optimizer = torch.optim.Adagrad(self.model.parameters())
        
    def forward(self, states):
        states = torch.tensor(states, requires_grad=True).float()
        return self.model(states)

    def actor_loss(self, new_policy, old_policy, actions, advantages):
        old_policy = tensor(old_policy, requires_grad=True)

        old_policy_action = (old_policy * actions).sum()
        new_policy_action = (new_policy * actions).sum()
        div_policy = new_policy_action / old_policy_action
        
        clipped = torch.clamp(div_policy, 1 - self.clip_value, 1 + self.clip_value)
        surrogate_loss = torch.min(div_policy * advantages, clipped * advantages)
        return torch.mean(surrogate_loss)
    
    def train(self, old_policy, states, actions, advantages):
        states = tensor(states, requires_grad=True).float()
        actions = tensor(actions).float()
        advantages = tensor(advantages, requires_grad=True).float()

        new_policy = self.forward(states.view(-1, self.state_dim))

        self.optimizer.zero_grad()
        loss = self.actor_loss(new_policy, old_policy, actions, advantages)
    
        loss.backward()
        self.optimizer.step()
        return loss

class Critic:
    def __init__(self, state_dim, hidden_size=256) -> None:
        self.state_dim = state_dim
        self.model =  nn.Sequential(nn.Linear(42, hidden_size),
                                    nn.ReLU(),
                                    nn.Linear(hidden_size, hidden_size),
                                    nn.ReLU(),
                                    nn.Linear(hidden_size, 1),
                                    )

        self.optimizer = torch.optim.Adagrad(self.model.parameters())

    def forward(self, states):
        states = torch.tensor(states, requires_grad=True).float()
        return self.model(states)

    def critic_loss(self, eval_q, expect_reward):
        expect_reward = torch.tensor(expect_reward, requires_grad=True).float()
        
        return nn.functional.mse_loss(eval_q.squeeze(1), expect_reward)
    
    def train(self, states, except_reward):
        states = tensor(states, requires_grad=True).float()
        eval_q = self.model(states.view(-1, 42))
        
        self.optimizer.zero_grad()
        loss = self.critic_loss(eval_q, except_reward)

        loss.backward()
        self.optimizer.step
        return loss

class PPO_Agent:
    def __init__(self, discount, lambd, batch_size, epochs) -> None:
        self.discount = discount
        self.lambd = lambd
        self.batch_size = batch_size
        self.epochs = epochs 
        self.env = ConnectX()
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n

        self.actor = Actor(42, 7, 0.1)
        self.critic = Critic(self.state_dim)
    
    def advantage_estimator(self, rewards, states, next_states, is_done):
        eval_q_current = self.critic.forward(states.reshape((-1, 42)))
        eval_q_next = self.critic.forward(next_states.reshape((-1, 42)))
        advantages = torch.zeros(len(rewards))
        n_step_targets = np.zeros(len(rewards))
        gae_cumulative = 0
        forward_val = 0

        if not is_done:
            forward_val = eval_q_next

        for k in reversed(range(0, len(rewards))):
            
            delta = float(np.array(rewards[k])) + float(self.discount) * float(forward_val) - float(eval_q_current[k])
            
            gae_cumulative = self.discount * self.lambd * gae_cumulative + delta
            advantages[k] = gae_cumulative
            forward_val = eval_q_current[k]
            n_step_targets[k] = advantages[k] + eval_q_current[k]
        return advantages, n_step_targets
    
    def train(self):
        results = []
        total_rewards = []

        for _ in range(self.epochs):
            state = self.env.reset()
            states = []
            actions = []
            rewards = []
            old_policys = []
            done = False
            episode_reward = 0
            while not done:
                                
                probs = self.actor.forward(state.reshape((-1, 42))) # получаем вероятности действий в данной ситуации
                action = np.random.choice(self.action_dim, p=probs[0].detach().numpy()) # выбираем действия в соответстии с хорошестью 
                next_state, reward, done, _ = self.env.step(action)
            
                state = np.reshape(state, [1, *self.state_dim])
                new_action = np.zeros(self.action_dim)
                new_action[action] = 1
                action = new_action
                next_state = np.reshape(next_state, [1, *self.state_dim])
                reward = np.reshape(reward, [1, 1])

                states.append(state.reshape(-1))
                actions.append(action)
                rewards.append(reward)
                old_policys.append(probs.detach().numpy())

                if len(states) >= self.batch_size or done:
                    states = np.array(states)
                    eval_q = self.critic.forward(states.reshape((-1, 42)))
                    next_eval_q = self.critic.forward(next_state.reshape((-1, 42)))
                    advantages, td_targets = self.advantage_estimator(
                        rewards, states, next_state, done)

                    for epoch in range(10):
                        self.actor.train(
                            old_policys, states, actions, advantages)
                        self.critic.train(states, td_targets)

                    states = []
                    actions = []
                    rewards = []
                    old_policys = []

                episode_reward += reward[0][0]
                state = next_state[0]
            
            
            total_rewards.append(episode_reward)
            results.append(reward[0][0])
            
                
        return total_rewards      

In [9]:
agent = PPO_Agent(0.99, 0.99, 3, 10)
agent.train()

  self.dtype = np.dtype(dtype)
  advantages = tensor(advantages, requires_grad=True).float()
  states = torch.tensor(states, requires_grad=True).float()


[0.19047619047619047,
 -10.214285714285714,
 0.8571428571428572,
 -10.214285714285714,
 0.8809523809523809,
 0.8333333333333334,
 -10.166666666666666,
 0.8095238095238095,
 -10.142857142857142,
 0.9285714285714286]