# Aprendizaje por Refuerzo - Cuatro en Raya

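Este cuaderno implementa un entorno de Cuatro en Raya (6x7) y un agente tabular que aprende a jugar mediante autojuego (self-play): el agente estima una función de valor sobre los estados del tablero y la actualiza hacia atrás al final de cada partida. El mismo entorno puede reutilizarse con Q-learning u otro algoritmo de aprendizaje por refuerzo.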



## 🎯 Objetivo del juego
Conseguir cuatro fichas del mismo color seguidas (horizontal, vertical o diagonal) antes que el oponente.

## 🧩 Características del juego
Jugadores: 2

Tablero: 6 filas × 7 columnas

Turnos: alternados (por lo general comienza el jugador 1)

Fichas: cada jugador tiene fichas de un color (por ejemplo, rojo y amarillo)

## 🧱 Mecánica de juego
En su turno, el jugador elige una columna (de la 0 a la 6).

La ficha cae hasta la fila más baja disponible de esa columna (como una gravedad simulada).

Se actualiza el tablero y se verifica si algún jugador ha formado 4 fichas consecutivas en alguna de estas direcciones:


🔹 Horizontal (→)

🔹 Vertical (↓)

🔹 Diagonal ascendente (↗)

🔹 Diagonal descendente (↘)

Si alguien logra una conexión de 4 fichas seguidas, gana la partida.

Si el tablero se llena sin ganador, es un empate.



In [3]:
import numpy as np
import random
import matplotlib.pyplot as plt

## Entorno del juego - Connect Four

In [4]:
import numpy as np

class Board():
    """Connect Four grid of 6 rows by 7 columns.

    ``state`` is a float NumPy array in which 0 marks an empty cell and any
    other value is the symbol of the player owning that cell. Row 0 is the
    top of the grid and row 5 the bottom, so pieces settle at the highest
    free row index of the chosen column.
    """

    def __init__(self):
        self.state = np.zeros((6, 7))  # 6 rows, 7 columns, all empty

    def valid_moves(self):
        """Return the list of columns whose top cell is still empty."""
        top_row = self.state[0]
        return [c for c in range(7) if top_row[c] == 0]

    def update(self, symbol, col):
        """Drop ``symbol`` into ``col``; it lands on the lowest empty cell.

        Raises:
            ValueError: if every cell of the column is already occupied.
        """
        row = 5
        while row >= 0:  # walk from the bottom row up to the top row
            if self.state[row, col] == 0:
                self.state[row, col] = symbol
                return
            row -= 1
        raise ValueError("Columna llena!")

    def is_game_over(self):
        """Report the outcome of the current position.

        Returns:
            The winner's symbol as an ``int`` when four equal pieces are
            aligned, ``0`` when the board is full without a winner (draw),
            or ``None`` while the game is still in progress.
        """
        grid = self.state
        # Each entry: (row step, column step, rows and columns from which a
        # run of four cells in that direction still fits on the board).
        directions = (
            (0, 1, range(6), range(4)),      # horizontal
            (1, 0, range(3), range(7)),      # vertical
            (1, 1, range(3), range(4)),      # diagonal, row and column grow
            (-1, 1, range(3, 6), range(4)),  # diagonal, row shrinks
        )
        for r in range(6):
            for c in range(7):
                owner = grid[r, c]
                if owner == 0:
                    continue
                for d_row, d_col, start_rows, start_cols in directions:
                    if r not in start_rows or c not in start_cols:
                        continue
                    if all(grid[r + k * d_row, c + k * d_col] == owner
                           for k in range(4)):
                        return int(owner)
        if not self.valid_moves():
            return 0  # draw: no free column left
        return None  # game continues

    def reset(self):
        """Replace the grid with a fresh, empty one."""
        self.state = np.zeros((6, 7))


In [6]:

from tqdm import tqdm

class Game():
    """Self-play driver that pits two players against each other.

    Each player object must provide ``move(board)``, ``update(board)``,
    ``reset()`` and ``reward(value)``. The constructor assigns symbol ``1`` to
    ``player1`` and ``-1`` to ``player2``; ``player1`` always moves first.
    Relies on ``Board`` and ``tqdm`` being defined/imported in this file.
    """

    def __init__(self, player1, player2):
        player1.symbol = 1
        player2.symbol = -1
        self.players = [player1, player2]
        self.board = Board()

    def selfplay(self, rounds=100):
        """Play ``rounds`` complete games and reward the players after each.

        Returns:
            ``[wins_player1, wins_player2]``; draws are counted for neither.
        """
        wins = [0, 0]
        for _ in tqdm(range(1, rounds + 1)):
            self.board.reset()
            for player in self.players:
                player.reset()
            winner = None
            while winner is None:
                for player in self.players:
                    col = player.move(self.board)  # chosen column only
                    self.board.update(player.symbol, col)  # piece falls
                    # Both players record every position, not just their own.
                    for p in self.players:
                        p.update(self.board)
                    winner = self.board.is_game_over()
                    if winner is not None:
                        break
            self.reward()
            # Reuse the outcome computed in the loop instead of rescanning
            # the whole board once per player.
            for ix, player in enumerate(self.players):
                if winner == player.symbol:
                    wins[ix] += 1
        return wins

    def reward(self):
        """Hand out end-of-game rewards: 1 win, 0 loss, 0.5 to both on a draw.

        Does nothing when the game on the board is not finished yet, so an
        early call cannot push zero rewards into the players' value tables.
        """
        winner = self.board.is_game_over()
        if winner is None:  # game still in progress: nothing to reward
            return
        if winner == 0:  # draw
            for player in self.players:
                player.reward(0.5)
        else:  # one player won
            for player in self.players:
                player.reward(1 if winner == player.symbol else 0)



In [7]:
class Agent():
    """Tabular value-learning player.

    The agent keeps a dictionary mapping a board state (the string form of
    the flattened grid) to an estimated value, picks moves epsilon-greedily
    with respect to that table, and updates it backwards over the visited
    states once a final reward is known.

    Args:
        alpha: learning rate used in the value update.
        prob_exp: probability of playing a random legal column in ``move``.
        symbol: value this agent writes on the board. ``Game`` overwrites it
            (1 for the first player, -1 for the second); the default lets the
            agent be used on its own without raising ``AttributeError``.
    """

    def __init__(self, alpha=0.5, prob_exp=0.5, symbol=1):
        self.value_function = {}  # state key -> estimated value
        self.alpha = alpha
        self.positions = []  # state keys visited during the current game
        self.prob_exp = prob_exp  # exploration probability
        self.symbol = symbol

    @staticmethod
    def _state_key(state):
        """Encode a grid as the dictionary key used by ``value_function``."""
        return str(state.reshape(-1))

    @staticmethod
    def _drop(state, col, symbol):
        """Return a copy of ``state`` with ``symbol`` dropped into ``col``."""
        result = state.copy()
        for row in range(result.shape[0] - 1, -1, -1):  # bottom to top
            if result[row, col] == 0:
                result[row, col] = symbol
                break
        return result

    def reset(self):
        """Forget the states visited in the previous game."""
        self.positions = []

    def move(self, board, explore=True):
        """Choose a column for the next piece.

        With probability ``prob_exp`` (only when ``explore`` is true) a
        uniformly random legal column is returned; otherwise the column whose
        resulting state has the highest known value is returned. Unknown
        states count as 0 and ties go to the lowest column index.

        Raises:
            ValueError: if the board offers no legal move.
        """
        valid_moves = board.valid_moves()
        if len(valid_moves) == 0:
            # Fail loudly instead of returning None from the greedy path.
            raise ValueError("No hay columnas disponibles!")

        # Random exploration
        if explore and np.random.uniform(0, 1) < self.prob_exp:
            return np.random.choice(valid_moves)

        # Exploitation: evaluate the state reached by each legal column
        max_value = -np.inf
        best_action = None
        for col in valid_moves:
            next_state = self._state_key(
                self._drop(board.state, col, self.symbol)
            )
            value = self.value_function.get(next_state, 0)
            if value > max_value:
                max_value = value
                best_action = col
        return best_action

    def update(self, board):
        """Record the current board state as visited in this game."""
        self.positions.append(self._state_key(board.state))

    def reward(self, reward):
        """Propagate ``reward`` backwards through the visited states.

        Going from the last visited state to the first, each value moves a
        fraction ``alpha`` towards the current target, and the updated value
        becomes the target for the preceding state.
        """
        for p in reversed(self.positions):
            current = self.value_function.get(p, 0)
            updated = current + self.alpha * (reward - current)
            self.value_function[p] = updated
            reward = updated


In [8]:
# Train two agents against each other through self-play
# (Game and Board must already be defined at this point).
agent1, agent2 = Agent(prob_exp=0.5), Agent()
game = Game(agent1, agent2)
resultados = game.selfplay(rounds=1000)  # can be raised to 300000 if desired
resultados

100%|██████████| 1000/1000 [00:45<00:00, 22.03it/s]


[503, 492]

In [11]:
import pandas as pd

# Rank every state stored by agent1 from highest to lowest learned value.
funcion_de_valor = sorted(
    agent1.value_function.items(),
    key=lambda par: par[1],
    reverse=True,
)
tabla = pd.DataFrame({
    'estado': [estado for estado, _ in funcion_de_valor],
    'valor': [valor for _, valor in funcion_de_valor],
})

tabla

Unnamed: 0,estado,valor
0,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.9375
1,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.8125
2,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.7500
3,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.7500
4,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.7500
...,...,...
17988,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0000
17989,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0000
17990,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0000
17991,[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,0.0000


In [12]:
import pickle

# Serialize only agent1's state -> value dictionary (not the Agent object)
# to 'agente.pickle', a path relative to the current working directory.
# NOTE: pickle files can execute code when loaded; only unpickle this file
# from a trusted source.
with open('agente.pickle', 'wb') as handle:
    pickle.dump(agent1.value_function, handle, protocol=pickle.HIGHEST_PROTOCOL)