In [None]:
# ml6

import random
import numpy as np

# --- Environment ---
class TicTacToe:
    """Tic-tac-toe environment on a flat 9-cell board.

    Cells hold 0 (empty), 1 (rendered as 'X') or -1 (rendered as 'O').

    Attributes:
        state: numpy int array of length 9, indexed row-major (0..8).
        done: True once the game has ended with a win or a draw.
        winner: 1 or -1 for a win, 0 for a draw, None while in progress.
    """

    # The eight winning lines: three rows, three columns, two diagonals.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),
        (0, 3, 6), (1, 4, 7), (2, 5, 8),
        (0, 4, 8), (2, 4, 6),
    )

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board and return a snapshot of the empty state."""
        self.state = np.zeros(9, dtype=int)
        self.done = False
        self.winner = None
        # Return a copy so later moves cannot mutate what the caller holds.
        return self.state.copy()

    def available_moves(self):
        """Return the indices of all empty cells, in ascending order."""
        return [i for i in range(9) if self.state[i] == 0]

    def step(self, action, player):
        """Place ``player``'s mark on cell ``action``.

        Args:
            action: cell index 0..8.
            player: mark to place (1 or -1).

        Returns:
            ``(state, reward, done)`` where ``state`` is a snapshot of the
            board. An invalid move (game already over, index outside 0..8,
            or occupied cell) leaves the board untouched and returns
            ``(state, -10, True)``. Otherwise the reward is 1 if player 1
            has won, -1 if player -1 has won, and 0 for a draw or an
            unfinished game.
        """
        # The explicit range check stops negative indices from wrapping
        # around to the other end of the board.
        if self.done or not 0 <= action < 9 or self.state[action] != 0:
            return self.state.copy(), -10, True   # invalid move

        self.state[action] = player
        self.winner = self.check_winner()
        if self.winner is None:
            return self.state.copy(), 0, False

        # check_winner reports both wins (+1/-1) and draws (0), and that
        # value is exactly the reward from player 1's point of view.
        self.done = True
        return self.state.copy(), self.winner, True

    def check_winner(self):
        """Return 1 or -1 for a completed line, 0 for a full board, else None."""
        for a, b, c in self._WIN_LINES:
            total = self.state[a] + self.state[b] + self.state[c]
            # Cells are in {-1, 0, 1}, so |sum| == 3 means one player owns the line.
            if abs(total) == 3:
                return 1 if total > 0 else -1
        if 0 not in self.state:
            return 0
        return None

    def render(self):
        """Print the board with a 1-9 cell-number guide next to each row."""
        symbols = {1:'X', -1:'O', 0:' '}
        print("\nCurrent Board (You are O, Agent is X):")
        for i in range(0,9,3):
            row = f" {symbols[self.state[i]]} | {symbols[self.state[i+1]]} | {symbols[self.state[i+2]]} "
            guide = f"   ({i+1} | {i+2} | {i+3})"
            print(row + guide)
            if i < 6: print("---+---+---")
        print()

# --- Helper for checking wins manually ---
def check_win_manual(state, player):
    """Return True if ``player`` occupies all three cells of any line.

    ``state`` is an indexable board of nine cells (row-major); the lines
    checked are the three rows, three columns and two diagonals.
    """
    rows = [(start, start + 1, start + 2) for start in (0, 3, 6)]
    columns = [(start, start + 3, start + 6) for start in (0, 1, 2)]
    diagonals = [(0, 4, 8), (2, 4, 6)]
    return any(
        state[a] == player and state[b] == player and state[c] == player
        for a, b, c in rows + columns + diagonals
    )

# --- Q-learning Agent ---
class QAgent:
    """Tabular epsilon-greedy Q-learning agent.

    ``q_table`` maps a board key (tuple of cell values) to a dict of
    ``{action: value}``; actions that were never updated count as 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.2):
        self.alpha = alpha      # learning rate
        self.gamma = gamma      # discount applied to the next state's best value
        self.epsilon = epsilon  # probability of picking a random move
        self.q_table = {}

    def get_state_key(self, state):
        """Turn a board array into a hashable tuple usable as a dict key."""
        cells = state.tolist()
        return tuple(cells)

    def choose_action(self, state, available):
        """Pick one move out of ``available`` for the given board.

        A random move is returned with probability ``epsilon`` or whenever
        the board has no entry in the table; otherwise the available move
        with the highest stored value wins (first one on ties).
        """
        key = self.get_state_key(state)
        explore = random.random() < self.epsilon
        if explore or key not in self.q_table:
            return random.choice(available)

        known = self.q_table[key]

        def value_of(move):
            return known.get(move, 0)

        return max(available, key=value_of)

    def update(self, old_state, action, reward, new_state, done):
        """Move Q(old_state, action) a step of size ``alpha`` toward the target.

        The target is ``reward`` for a terminal transition, otherwise
        ``reward + gamma * (best value stored for new_state, or 0)``.
        """
        source_key = self.get_state_key(old_state)
        next_key = self.get_state_key(new_state)

        action_values = self.q_table.setdefault(source_key, {})
        current = action_values.get(action, 0)

        target = reward
        if not done:
            successors = self.q_table.get(next_key, {})
            best_next = max(successors.values(), default=0)
            target = reward + self.gamma * best_next

        action_values[action] = current + self.alpha * (target - current)

# --- Smart move selector (win/block logic) ---
def smart_choose_action(agent, env):
    """Pick the agent's move: win if possible, else block, else ask the Q-table.

    The agent's mark is 1 and the opponent's is -1. Each empty cell is
    tried on a copy of the board, so ``env.state`` itself is not modified.
    """
    board = env.state
    candidates = env.available_moves()

    def finishing_move(mark):
        # First empty cell that would complete a line for `mark`, or None.
        for cell in candidates:
            trial = board.copy()
            trial[cell] = mark
            if check_win_manual(trial, mark):
                return cell
        return None

    # Step 1: take a winning cell for the agent (mark 1).
    # Step 2: otherwise occupy a cell that would win for the opponent (mark -1).
    for mark in (1, -1):
        found = finishing_move(mark)
        if found is not None:
            return found

    # Step 3: no immediate win or threat, so defer to the learned policy.
    return agent.choose_action(board, candidates)

# --- Train the agent (self-play) ---
def train_agent(episodes=50000):
    """Train a QAgent by self-play and return it in greedy mode.

    One Q-table is shared by both sides; every value is stored from the
    point of view of the player about to move in that state.

    Args:
        episodes: number of self-play games to run.

    Returns:
        The trained QAgent with ``epsilon`` set to 0.0.
    """
    agent = QAgent()
    for _ in range(episodes):
        env = TicTacToe()
        # Work on a private copy: the environment mutates its board in place,
        # so an aliased array would make old and new state the same object.
        state = env.reset().copy()
        # Either side may open the game, so the table also covers positions
        # where player 1 replies to an opening move by -1 (the situation the
        # agent faces in play_with_user, where the human moves first).
        player = random.choice((1, -1))
        # Each side's latest (state, action), waiting to be updated once the
        # opponent's reply shows the state that side will face next.
        pending = {1: None, -1: None}
        done = False
        while not done:
            available = env.available_moves()
            action = agent.choose_action(state, available)
            new_state, reward, done = env.step(action, player)
            new_state = new_state.copy()
            # env rewards are from player 1's perspective; flip for player -1.
            mover_reward = reward if player == 1 else -reward
            opponent = -player

            if done:
                # The mover's action ended the game ...
                agent.update(state, action, mover_reward, new_state, True)
                # ... and the opponent's previous action led to that outcome.
                if pending[opponent] is not None:
                    prev_state, prev_action = pending[opponent]
                    agent.update(prev_state, prev_action, -mover_reward,
                                 new_state, True)
            else:
                # new_state is the position the opponent must now move from,
                # so it is the correct successor for the opponent's last
                # action (bootstrapping stays within one side's perspective).
                if pending[opponent] is not None:
                    prev_state, prev_action = pending[opponent]
                    agent.update(prev_state, prev_action, 0, new_state, False)
                pending[player] = (state, action)

            state = new_state
            player = opponent
    agent.epsilon = 0.0  # play greedily after training
    return agent

# --- Play with the agent ---
def play_with_user(agent):
    """Play one interactive game on stdin/stdout: the user is O (-1), the agent is X (1).

    The user moves first and enters a cell number 1-9. Input that is not a
    number, or that names an occupied or out-of-range cell, is rejected and
    the prompt is shown again. The result is printed when the game ends.

    Args:
        agent: object whose ``choose_action(state, available)`` is used by
            ``smart_choose_action`` when no immediate win or block exists.
    """
    env = TicTacToe()
    env.render()

    while not env.done:
        # --- User Move ---
        raw = input("Enter your move (1-9): ")
        try:
            move = int(raw) - 1
        except ValueError:
            # Non-numeric input is just another invalid move, not a crash.
            print("❌ Invalid move! Try again.")
            continue
        if move not in env.available_moves():
            print("❌ Invalid move! Try again.")
            continue
        env.step(move, -1)
        env.render()
        if env.done:
            break

        # --- Agent Move ---
        print("Agent is thinking...")
        action = smart_choose_action(agent, env)
        env.step(action, 1)
        env.render()

    if env.winner == 1:
        print("Agent (X) wins 😎")
    elif env.winner == -1:
        print("You (O) win 🎉")
    else:
        print("It's a draw 🤝")

# --- Run the game ---
# Show which side the user plays and how the numbers 1-9 map onto the grid.
print("You are O, Agent is X.\nGrid mapping:")
print(" 1 | 2 | 3\n---+---+---\n 4 | 5 | 6\n---+---+---\n 7 | 8 | 9\n")
# Train with 2000 self-play episodes (train_agent's default is 50000),
# then start one interactive game against the trained agent.
agent = train_agent(episodes=2000)
play_with_user(agent)


You are O, Agent is X.
Grid mapping:
 1 | 2 | 3
---+---+---
 4 | 5 | 6
---+---+---
 7 | 8 | 9


Current Board (You are O, Agent is X):
   |   |      (1 | 2 | 3)
---+---+---
   |   |      (4 | 5 | 6)
---+---+---
   |   |      (7 | 8 | 9)



Enter your move (1-9):  1



Current Board (You are O, Agent is X):
 O |   |      (1 | 2 | 3)
---+---+---
   |   |      (4 | 5 | 6)
---+---+---
   |   |      (7 | 8 | 9)

Agent is thinking...

Current Board (You are O, Agent is X):
 O |   |      (1 | 2 | 3)
---+---+---
 X |   |      (4 | 5 | 6)
---+---+---
   |   |      (7 | 8 | 9)



Enter your move (1-9):  5



Current Board (You are O, Agent is X):
 O |   |      (1 | 2 | 3)
---+---+---
 X | O |      (4 | 5 | 6)
---+---+---
   |   |      (7 | 8 | 9)

Agent is thinking...

Current Board (You are O, Agent is X):
 O |   |      (1 | 2 | 3)
---+---+---
 X | O |      (4 | 5 | 6)
---+---+---
   |   | X    (7 | 8 | 9)



Enter your move (1-9):  3



Current Board (You are O, Agent is X):
 O |   | O    (1 | 2 | 3)
---+---+---
 X | O |      (4 | 5 | 6)
---+---+---
   |   | X    (7 | 8 | 9)

Agent is thinking...

Current Board (You are O, Agent is X):
 O | X | O    (1 | 2 | 3)
---+---+---
 X | O |      (4 | 5 | 6)
---+---+---
   |   | X    (7 | 8 | 9)



Enter your move (1-9):  6



Current Board (You are O, Agent is X):
 O | X | O    (1 | 2 | 3)
---+---+---
 X | O | O    (4 | 5 | 6)
---+---+---
   |   | X    (7 | 8 | 9)

Agent is thinking...

Current Board (You are O, Agent is X):
 O | X | O    (1 | 2 | 3)
---+---+---
 X | O | O    (4 | 5 | 6)
---+---+---
 X |   | X    (7 | 8 | 9)



Enter your move (1-9):  8



Current Board (You are O, Agent is X):
 O | X | O    (1 | 2 | 3)
---+---+---
 X | O | O    (4 | 5 | 6)
---+---+---
 X | O | X    (7 | 8 | 9)

It's a draw 🤝
