# **SEGUNDO PARCIAL DE SIS420**

## Nombre: Gonzales Suyo Franz Reinaldo
## C.U. 35-5335
## Carrera: Ing. Sistemas

### 2. Desarrolle un cuadernillo aplicando aprendizaje por refuerzo, para que un agente pueda jugar cuatro en raya.

# **APRENDIZAJE POR REFUERZO**

## Ejemplo de Aplicación: Cuatro en Raya.

In [12]:
import numpy as np

class Board():
    """Connect Four grid of 6 rows by 7 columns.

    Cells hold 0 when empty and a player's symbol (1 or -1) when occupied.
    Row 0 is the top of the board: a dropped disc settles on the
    highest-indexed empty row of its column.
    """

    def __init__(self):
        self.rows = 6
        self.cols = 7
        self.state = np.zeros((self.rows, self.cols))

    def valid_moves(self):
        """Return the column indices whose top cell is still empty."""
        return [col for col in range(self.cols) if self.state[0, col] == 0]

    def update(self, symbol, col):
        """Drop ``symbol`` into column ``col``.

        Args:
            symbol: value written into the landing cell (1 or -1 in a game).
            col: index of the column to play.

        Raises:
            ValueError: if the column has no empty cell. A move into a full
                column used to be discarded silently, which made the caller
                lose a turn without any signal.
        """
        # Scan from the bottom row upwards; the first empty cell is the landing spot.
        for row in range(self.rows - 1, -1, -1):
            if self.state[row, col] == 0:
                self.state[row, col] = symbol
                return
        raise ValueError(f"column {col} is full")

    @staticmethod
    def _line_owner(cells):
        """Return 1 or -1 if every value in ``cells`` equals it, else None."""
        for symbol in (1, -1):
            if np.all(cells == symbol):
                return symbol
        return None

    def is_game_over(self):
        """Evaluate the current position.

        Returns:
            1 or -1 when that symbol fills four aligned cells (row, column
            or either diagonal), 0 when the board is full with no winner
            (draw), or None while the game can continue.
        """
        # Horizontal windows of four cells.
        for row in range(self.rows):
            for col in range(self.cols - 3):
                owner = self._line_owner(self.state[row, col:col + 4])
                if owner is not None:
                    return owner

        # Vertical windows of four cells.
        for row in range(self.rows - 3):
            for col in range(self.cols):
                owner = self._line_owner(self.state[row:row + 4, col])
                if owner is not None:
                    return owner

        # Both diagonals of every 4x4 window anchored at (row, col).
        steps = np.arange(4)
        for row in range(self.rows - 3):
            for col in range(self.cols - 3):
                falling = self.state[row + steps, col + steps]
                rising = self.state[row + 3 - steps, col + steps]
                for line in (falling, rising):
                    owner = self._line_owner(line)
                    if owner is not None:
                        return owner

        if not self.valid_moves():
            return 0  # Draw: no empty column left.

        return None  # Game continues.

    def reset(self):
        """Replace the grid with a fresh all-empty one."""
        self.state = np.zeros((self.rows, self.cols))

In [13]:
from tqdm import tqdm

class Game():
    """Runs Connect Four matches between two players on one shared Board.

    The first player is assigned symbol 1 and the second symbol -1. Each
    player is expected to expose ``reset()``, ``move(board)``,
    ``update(board)`` and ``reward(value)``.
    """

    def __init__(self, player1, player2):
        player1.symbol = 1
        player2.symbol = -1
        self.players = [player1, player2]
        self.board = Board()

    def selfplay(self, rounds=100):
        """Play ``rounds`` complete games and reward the players after each.

        Args:
            rounds: number of games to play.

        Returns:
            A two-item list with the number of wins of player 1 and
            player 2 (draws are not counted).
        """
        wins = [0, 0]
        for _ in tqdm(range(rounds)):
            self.board.reset()
            for player in self.players:
                player.reset()

            # ``result`` stays None until the board reports a terminal state.
            result = None
            while result is None:
                for mover in self.players:
                    action = mover.move(self.board)
                    self.board.update(mover.symbol, action)
                    # Both players record every position, not only their own.
                    # A separate name is used so the outer loop variable is
                    # not rebound by this inner loop.
                    for observer in self.players:
                        observer.update(self.board)
                    result = self.board.is_game_over()
                    if result is not None:
                        break

            self.reward()
            # Reuse the final result instead of rescanning the board per player.
            for ix, player in enumerate(self.players):
                if result == player.symbol:
                    wins[ix] += 1
        return wins

    def reward(self):
        """Hand out end-of-game rewards based on the current board result.

        A draw gives 0.5 to both players; otherwise the player whose symbol
        matches the result gets 1 and the other gets 0.
        """
        winner = self.board.is_game_over()
        if winner == 0:  # Draw
            for player in self.players:
                player.reward(0.5)
        else:  # Reward the winner
            for player in self.players:
                if winner == player.symbol:
                    player.reward(1)
                else:
                    player.reward(0)

In [14]:
class Agent():
    """Tabular value-learning player.

    Attributes set here:
        value_function: dict mapping ``str(board.state)`` to a learned value.
        alpha: learning rate used when backing up rewards.
        positions: stringified board states recorded during the current game.
        prob_exp: probability of picking a random move while exploring.

    ``symbol`` is not set by the constructor; it must be assigned before
    ``move`` is used in exploitation mode.
    """

    def __init__(self, alpha=0.5, prob_exp=0.55):
        self.alpha = alpha
        self.prob_exp = prob_exp
        self.value_function = {}
        self.positions = []

    def reset(self):
        """Forget the positions recorded for the previous game."""
        self.positions = []

    def move(self, board, explore=True):
        """Choose a column to play on ``board``.

        With ``explore`` enabled, a random valid column is returned with
        probability ``prob_exp``. Otherwise each valid column is simulated
        on a copy of the grid and the one leading to the highest stored
        value wins; unseen states count as 0 and ties go to the last
        candidate examined.
        """
        candidates = board.valid_moves()

        # Exploration branch.
        if explore and np.random.uniform(0, 1) < self.prob_exp:
            return np.random.choice(candidates)

        # Exploitation branch.
        chosen = None
        top_score = -float('inf')
        for column in candidates:
            preview = board.state.copy()
            # Lowest empty row of the column, searching bottom-up.
            landing = next(
                (r for r in reversed(range(board.rows)) if preview[r, column] == 0),
                None,
            )
            if landing is not None:
                preview[landing, column] = self.symbol
            score = self.value_function.get(str(preview), 0)
            if score >= top_score:
                top_score, chosen = score, column
        return chosen

    def update(self, board):
        """Record the current board state as a visited position."""
        snapshot = str(board.state)
        self.positions.append(snapshot)

    def reward(self, reward):
        """Back up ``reward`` through the recorded positions, newest first.

        Each state's value moves towards the current target by a fraction
        ``alpha``; the updated value then becomes the target for the
        preceding state.
        """
        target = reward
        for state_key in self.positions[::-1]:
            current = self.value_function.setdefault(state_key, 0)
            revised = current + self.alpha * (target - current)
            self.value_function[state_key] = revised
            target = revised

In [15]:
# Two tabular agents that learn by playing each other. agent1 passes the
# exploration probability explicitly; 0.55 is also Agent's default value.
agent1 = Agent(prob_exp=0.55)
agent2 = Agent()

# agent1 is passed first, so Game assigns it symbol 1 and agent2 symbol -1.
game = Game(agent1, agent2)
# Play 300000 self-play games; selfplay returns the win count of each player.
game.selfplay(300000)

  7%|▋         | 20140/300000 [10:22<2:24:15, 32.33it/s]


KeyboardInterrupt: 

In [None]:
import pandas as pd

# Rank every state stored by agent1 from highest to lowest learned value.
funcion_de_valor = sorted(
    agent1.value_function.items(),
    key=lambda par: par[1],
    reverse=True,
)
# Two-column table: the stringified board state and its value.
tabla = pd.DataFrame({
    'estado': [estado for estado, _ in funcion_de_valor],
    'valor': [valor for _, valor in funcion_de_valor],
})

tabla

In [None]:
import pickle

# Persist only agent1's state -> value table (the dict), not the Agent object.
# NOTE(review): HIGHEST_PROTOCOL depends on the running Python version, so the
# file may not load on an older interpreter — confirm where it will be read.
with open('agente.pickle', 'wb') as handle:
    pickle.dump(agent1.value_function, handle, protocol=pickle.HIGHEST_PROTOCOL)