IMPLEMENTACIÓN

In [3]:
import numpy as np
import random
from tqdm import tqdm

class Board():
    """Connect-four grid of 6 rows by 7 columns backed by a NumPy array.

    Empty cells hold 0; a played cell holds whatever symbol was passed to
    ``update``. Pieces are placed starting from row 5 and stack towards row 0,
    so a column is full once its row-0 cell is non-zero.
    """

    def __init__(self):
        self.state = np.zeros((6, 7))  # 6 rows, 7 columns

    def valid_moves(self):
        """Return the list of column indices whose row-0 cell is still empty."""
        top_row = self.state[0]
        open_cols = []
        for idx in range(7):
            if top_row[idx] == 0:
                open_cols.append(idx)
        return open_cols

    def update(self, symbol, col):
        """Drop ``symbol`` into ``col`` on the lowest empty cell of that column.

        Raises:
            ValueError: if ``col`` is not among ``valid_moves()``.
        """
        if col not in self.valid_moves():
            raise ValueError("Movimiento ilegal!")

        # Walk from row 5 towards row 0 and stop at the first free cell
        for row in reversed(range(6)):
            if self.state[row, col] != 0:
                continue
            self.state[row, col] = symbol
            return

    def is_game_over(self):
        """Report the state of the game.

        Returns:
            The value stored in the winning line's first cell if four equal
            non-zero cells are aligned, ``0`` if no column is playable (draw),
            or ``None`` while the game can continue.
        """
        # (start rows, start columns, row step, column step); the order of the
        # entries is the order in which the line directions are checked.
        scans = (
            (range(6), range(4), 0, 1),      # horizontal
            (range(3), range(7), 1, 0),      # vertical
            (range(3), range(4), 1, 1),      # descending diagonal
            (range(3, 6), range(4), -1, 1),  # ascending diagonal
        )
        for start_rows, start_cols, d_row, d_col in scans:
            for r in start_rows:
                for c in start_cols:
                    if self.state[r, c] == 0:
                        continue
                    aligned = all(
                        self.state[r + k * d_row, c + k * d_col] == self.state[r, c]
                        for k in range(4)
                    )
                    if aligned:
                        return self.state[r, c]

        # Draw: nobody won and there is no playable column left
        if not self.valid_moves():
            return 0

        # Game continues
        return None

    def reset(self):
        """Replace the grid with a fresh all-zero 6x7 array."""
        self.state = np.zeros((6, 7))


class Game():
    """Runs connect-four matches between two players on one shared Board.

    The first player is assigned symbol 1 and the second symbol -1. Each
    player object must provide ``reset()``, ``move(board)``, ``update(board)``
    and ``reward(value)``.
    """

    def __init__(self, player1, player2):
        player1.symbol = 1
        player2.symbol = -1
        self.players = [player1, player2]
        self.board = Board()

    def selfplay(self, rounds=100):
        """Play ``rounds`` complete games and reward the players after each one.

        Returns:
            list: ``[wins_player1, wins_player2]``; draws count for neither.
        """
        wins = [0, 0]
        for _ in tqdm(range(1, rounds + 1)):
            self.board.reset()
            for player in self.players:
                player.reset()

            # Result of the latest is_game_over() call; None while play continues
            outcome = None
            while outcome is None:
                for player in self.players:
                    action = player.move(self.board)
                    self.board.update(player.symbol, action)
                    # Every player records the new position, not only the mover.
                    # A distinct loop name keeps the outer `player` intact.
                    for observer in self.players:
                        observer.update(self.board)
                    outcome = self.board.is_game_over()
                    if outcome is not None:
                        break

            self.reward()
            # Reuse the outcome computed above instead of rescanning the board
            for ix, player in enumerate(self.players):
                if outcome == player.symbol:
                    wins[ix] += 1
        return wins

    def reward(self):
        """Give end-of-game rewards: 0.5 to both on a draw, else 1 to the winner and 0 to the other.

        If the board reports no result (``None``), no symbol matches and every
        player receives 0.
        """
        winner = self.board.is_game_over()
        if winner == 0:  # draw
            for player in self.players:
                player.reward(0.5)
        else:  # only the player whose symbol matches the winner gets 1
            for player in self.players:
                player.reward(1 if winner == player.symbol else 0)


class Agent():
    """Tabular value-learning player for connect four.

    Keeps a dictionary from board-state strings to estimated values and
    refreshes it at the end of each game from the positions recorded during
    play. ``self.symbol`` is not set here; it is assigned from outside before
    ``move`` is used.
    """

    def __init__(self, alpha=0.5, prob_exp=0.3):
        self.alpha = alpha        # learning rate
        self.prob_exp = prob_exp  # probability of playing a random move
        self.value_function = {}  # state string -> estimated value
        self.positions = []       # positions recorded during the current game

    def reset(self):
        """Forget the positions recorded so far."""
        self.positions = []

    def move(self, board, explore=True):
        """Choose a column to play on ``board``.

        With ``explore`` enabled, a random legal column is returned with
        probability ``prob_exp``. Otherwise each legal column is simulated and
        the one leading to the highest stored value wins; unseen positions
        count as 0 and ties go to the column examined last.
        """
        candidates = board.valid_moves()

        # Exploration: play any legal column at random
        if explore and np.random.uniform(0, 1) < self.prob_exp:
            return random.choice(candidates)

        # Exploitation: score the position reached through each legal column
        top_value = -1000
        chosen = candidates[0]
        for col in candidates:
            # Simulate the drop on a copy so the real board is untouched
            preview = board.state.copy()
            for row in reversed(range(6)):
                if preview[row, col] == 0:
                    preview[row, col] = self.symbol
                    break

            stored = self.value_function.get(str(preview.reshape(42)))
            score = 0 if stored is None else stored
            if score >= top_value:
                top_value, chosen = score, col
        return chosen

    def update(self, board):
        """Record the current board as a flattened-array string."""
        state_key = str(board.state.reshape(42))
        self.positions.append(state_key)

    def reward(self, reward):
        """Propagate the final ``reward`` backwards through the recorded positions.

        Each position's value moves towards the running target by a fraction
        ``alpha``; the updated value then becomes the target for the position
        recorded before it.
        """
        for state_key in self.positions[::-1]:
            previous = self.value_function.get(state_key)
            if previous is None:
                previous = 0
            reward = previous + self.alpha * (reward - previous)
            self.value_function[state_key] = reward


# Seed both random generators that Agent.move draws from (np.random.uniform
# and random.choice) so re-running this training cell gives the same results
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Create the agents; agent1 explores more often than agent2
agent1 = Agent(prob_exp=0.3)
agent2 = Agent(prob_exp=0.1)

# Create the game and train through self-play
game = Game(agent1, agent2)
print("Entrenando agente...")
wins = game.selfplay(50000)

print("Resultados:")
print(f"Agente 1: {wins[0]} victorias")
print(f"Agente 2: {wins[1]} victorias")
print(f"Estados aprendidos por Agente 1: {len(agent1.value_function)}")

# Save the trained agent: only agent1's value table is written to disk
# NOTE(review): this import would normally live with the other imports at the top
import pickle
with open('agente_cuatro_en_raya.pickle', 'wb') as handle:
    pickle.dump(agent1.value_function, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Agente guardado en 'agente_cuatro_en_raya.pickle'")

Entrenando agente...


100%|██████████| 50000/50000 [30:31<00:00, 27.30it/s]


Resultados:
Agente 1: 33190 victorias
Agente 2: 16324 victorias
Estados aprendidos por Agente 1: 314795
Agente guardado en 'agente_cuatro_en_raya.pickle'


JUGAR CONTRA EL AGENTE

In [None]:
def jugar_humano_vs_agente():
    """Play an interactive console game of connect four against the saved agent.

    Loads the value table from 'agente_cuatro_en_raya.pickle', builds a
    non-exploring Agent playing symbol -1 ("O"), and alternates turns with the
    human (symbol 1, "X", moving first) until the board reports a result.
    Invalid human input is reported and asked for again.
    """
    # Load the trained value table.
    # NOTE(review): pickle.load can execute arbitrary code; only open a file
    # that this notebook produced.
    with open('agente_cuatro_en_raya.pickle', 'rb') as handle:
        funcion_valor = pickle.load(handle)

    agente = Agent(prob_exp=0)  # no exploration
    agente.value_function = funcion_valor
    agente.symbol = -1

    board = Board()

    def mostrar_tablero():
        """Print the column header and the grid ('.' empty, 'X' for 1, 'O' otherwise)."""
        print("\n  0 1 2 3 4 5 6")
        for fila in board.state:
            print("  " + " ".join(['.' if x == 0 else 'X' if x == 1 else 'O' for x in fila]))

    print("¡Juega contra el agente!")
    print("Tú eres X, el agente es O")

    turno = 0  # 0 = human, 1 = agent

    while board.is_game_over() is None:
        mostrar_tablero()

        if turno == 0:  # human turn
            # Catch only the number-parsing failure. A bare except would also
            # swallow KeyboardInterrupt/EOFError and make the loop unstoppable.
            try:
                col = int(input("Elige columna (0-6): "))
            except ValueError:
                print("Entrada inválida!")
                continue
            if col not in board.valid_moves():
                print("Movimiento inválido!")
                continue
            board.update(1, col)
        else:  # agent turn
            col = agente.move(board, explore=False)
            print(f"Agente elige columna: {col}")
            board.update(-1, col)

        # Hand the turn over only after a move was actually played
        turno = 1 - turno

    mostrar_tablero()
    resultado = board.is_game_over()
    if resultado == 1:
        print("¡Ganaste!")
    elif resultado == -1:
        print("¡El agente ganó!")
    else:
        print("¡Empate!")

# Starts an interactive game immediately and waits for keyboard input;
# comment out the call below to skip it when running every cell
jugar_humano_vs_agente()

¡Juega contra el agente!
Tú eres X, el agente es O

  0 1 2 3 4 5 6
  . . . . . . .
  . . . . . . .
  . . . . . . .
  . . . . . . .
  . . . . . . .
  . . . . . . .
