In [1]:
import numpy as np
import random

In [10]:
board_size = 4
num_pairs = 8
total_states = board_size * board_size
episodes = 200

In [11]:
learning_rate = 0.1
discount_factor = 0.9
epsilon = 1.0
epsilon_decay = 0.995
min_epsilon = 0.01

In [12]:
def create_board():
    pairs = list(range(1, num_pairs + 1)) * 2
    random.shuffle(pairs)
    return np.array(pairs).reshape(board_size, board_size)

In [13]:
def get_reward(board, pos1, pos2):
    if board[pos1] == board[pos2]:
        return 10
    else:
        return -1

In [14]:
def choose_action(state, q_table, epsilon):
    if random.uniform(0, 1) < epsilon:
        return random.randint(0, total_states - 1), random.randint(0, total_states - 1)
    else:
        return divmod(np.argmax(q_table[state]), board_size)

In [15]:
q_table = np.zeros((total_states, total_states))

In [16]:
for episode in range(episodes):
    board = create_board()
    revealed = np.zeros_like(board)
    state = random.randint(0, total_states - 1)
    total_reward = 0

    while not np.all(revealed):
        pos1, pos2 = choose_action(state, q_table, epsilon)

        while pos1 == pos2:
            pos1, pos2 = choose_action(state, q_table, epsilon)

        reward = get_reward(board, np.unravel_index(pos1, (board_size, board_size)),
                            np.unravel_index(pos2, (board_size, board_size)))

        new_state = random.randint(0, total_states - 1)
        q_table[state, pos1] += learning_rate * (
            reward + discount_factor * np.max(q_table[new_state]) - q_table[state, pos1]
        )
        q_table[state, pos2] += learning_rate * (
            reward + discount_factor * np.max(q_table[new_state]) - q_table[state, pos2]
        )

        if reward == 10:
            revealed[np.unravel_index(pos1, (board_size, board_size))] = 1
            revealed[np.unravel_index(pos2, (board_size, board_size))] = 1

        total_reward += reward
        state = new_state

    epsilon = max(min_epsilon, epsilon * epsilon_decay)

    if episode % 20 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")

print("Training selesai.")

Episode 0, Total Reward: -131
Episode 20, Total Reward: 13
Episode 40, Total Reward: -177
Episode 60, Total Reward: -262
Episode 80, Total Reward: -332
Episode 100, Total Reward: -28
Episode 120, Total Reward: -113
Episode 140, Total Reward: -396
Episode 160, Total Reward: -177
Episode 180, Total Reward: -528
Training selesai.
