# In [11]:
import numpy as np
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

class TicTacToe:
    """Two-player tic-tac-toe environment on a ``size`` x ``size`` grid.

    Board cells hold 0 (empty), 1 (first player) or -1 (second player).
    ``current_player`` is the mark that the next call to :meth:`step` places.
    """

    def __init__(self, size=3):
        """Create an empty ``size`` x ``size`` board with player 1 to move."""
        self.size = size
        self.reset()

    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.board = np.zeros((self.size, self.size))
        self.current_player = 1

    def is_winner(self, player):
        """Return True if ``player`` fills a complete row, column or diagonal."""
        owned = self.board == player
        return bool(
            owned.all(axis=1).any()             # any full row
            or owned.all(axis=0).any()          # any full column
            or np.diag(owned).all()             # main diagonal
            or np.diag(np.fliplr(owned)).all()  # anti-diagonal
        )

    def is_full(self):
        """Return a truthy value when no empty (zero) cell remains."""
        return np.all(self.board != 0)

    def step(self, action):
        """Place the current player's mark and hand the turn to the other player.

        Args:
            action: ``(row, col)`` pair indexing into the board.

        Returns:
            1 if the move wins the game for the player who made it, 0.5 if it
            fills the board without a winner (draw), 0 otherwise.

        Raises:
            ValueError: if the targeted cell is already occupied. The board and
                the turn are left untouched in that case.
        """
        x, y = action
        # Reject illegal moves instead of silently overwriting a mark, which
        # would corrupt the position and still flip the turn.
        if self.board[x, y] != 0:
            raise ValueError(f"cell ({x}, {y}) is already occupied")

        self.board[x, y] = self.current_player
        reward = 0
        if self.is_winner(self.current_player):
            reward = 1  # Current player wins
        elif self.is_full():
            reward = 0.5  # Draw
        self.current_player *= -1  # Switch player
        return reward

class FunctionApproximationAgent:
    """Epsilon-greedy Q-learning agent backed by a small dense network.

    The network maps a flattened board to one Q-value per cell.

    Args:
        board_size: side length of the square board. The network has
            ``board_size ** 2`` inputs and outputs. Defaults to 3.
    """

    def __init__(self, board_size=3):
        self.board_size = board_size
        self.n_cells = board_size * board_size
        self.model = self.build_model()
        self.epsilon = 0.1  # Exploration rate
        self.alpha = 0.1    # Step size used when moving a Q-value toward its target

    def build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output)."""
        model = Sequential()
        model.add(Dense(64, input_dim=self.n_cells, activation='relu'))  # Flattened board
        model.add(Dense(64, activation='relu'))
        model.add(Dense(self.n_cells, activation='linear'))  # One Q-value per cell
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        return model

    def get_state_key(self, state):
        """Return the board flattened to a 1-D array (the network's input)."""
        return state.flatten()

    def choose_action(self, state):
        """Pick a move for ``state`` with an epsilon-greedy policy.

        With probability ``epsilon`` a random empty cell is chosen; otherwise
        the empty cell with the highest predicted Q-value is chosen.

        Args:
            state: 2-D board array in which 0 marks an empty cell.

        Returns:
            ``(row, col)`` tuple of plain ints.

        Raises:
            ValueError: if the board has no empty cell.
        """
        flat_state = self.get_state_key(state)
        available_actions = np.flatnonzero(flat_state == 0)
        if available_actions.size == 0:
            raise ValueError("no empty cells left to choose from")

        if np.random.rand() < self.epsilon:
            # Explore: uniform choice among the empty cells.
            action = available_actions[np.random.choice(len(available_actions))]
        else:
            # Exploit: best predicted value, restricted to legal moves.
            q_values = self.model.predict(flat_state.reshape(1, -1), verbose=0)[0]
            action = available_actions[np.argmax(q_values[available_actions])]

        action = int(action)
        return (action // self.board_size, action % self.board_size)

    def learn(self, state, action, reward, next_state, done=None):
        """Move Q(state, action) toward its one-step target and fit the network.

        Args:
            state: board before the move.
            action: ``(row, col)`` of the move that was played.
            reward: reward received for the move.
            next_state: board after the move.
            done: whether the move ended the episode. When left as ``None`` the
                transition is treated as terminal if ``reward`` is non-zero or
                ``next_state`` has no empty cell, which matches an environment
                that only hands out a non-zero reward when the game ends.
        """
        state_key = self.get_state_key(state)
        next_state_key = self.get_state_key(next_state)
        legal_next = np.flatnonzero(next_state_key == 0)
        if done is None:
            done = reward != 0 or legal_next.size == 0

        # Predict Q-values for the current state
        q_values = self.model.predict(state_key.reshape(1, -1), verbose=0)[0]

        if done or legal_next.size == 0:
            # Terminal transition: nothing follows, so the target is the reward.
            future_value = 0.0
        else:
            next_q_values = self.model.predict(next_state_key.reshape(1, -1), verbose=0)[0]
            # Only legal moves count; outputs for occupied cells are never
            # trained and would otherwise leak arbitrary values into the target.
            future_value = np.max(next_q_values[legal_next])
            # NOTE(review): when one agent plays both sides, next_state is the
            # position the opponent moves from; a sign-flipped (negamax-style)
            # target may be more appropriate - confirm the intended scheme.

        # Update Q-value for the taken action
        action_index = action[0] * self.board_size + action[1]
        td_target = reward + future_value
        q_values[action_index] += self.alpha * (td_target - q_values[action_index])

        # Train the model with the updated Q-values
        self.model.fit(state_key.reshape(1, -1), q_values.reshape(1, -1), verbose=0)

def train_agent(episodes):
    """Run self-play episodes with a fresh agent and tally how the games end.

    One agent chooses the moves for both players and learns after every move.

    Args:
        episodes: number of games to play.

    Returns:
        ``(wins, draws)``: the number of games that ended with a winning move
        (by either player) and the number that ended in a draw.
    """
    learner = FunctionApproximationAgent()
    decided = 0
    tied = 0

    for _ in range(episodes):
        env = TicTacToe()
        env.reset()

        outcome = 0
        # Keep playing until a move produces a win (1) or a draw (0.5).
        while outcome not in (1, 0.5):
            before = env.board.copy()
            move = learner.choose_action(before)
            outcome = env.step(move)

            after = env.board.copy()
            learner.learn(before, move, outcome, after)

        if outcome == 1:
            decided += 1
        else:
            tied += 1

    return decided, tied

def main():
    """Train for 100 episodes and print the fraction of games that ended in a win."""
    n_won, n_drawn = train_agent(100)
    finished = n_won + n_drawn

    # Guard the division for the case where no game was completed.
    if finished > 0:
        win_rate = n_won / finished
    else:
        win_rate = 0

    print(f"Win rate for the agent: {win_rate:.2f}")

# Run the training demo only when this module is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()


# Output: Win rate for the agent: 0.94
