In [1]:
import numpy as np

dimension = 3

#Mendefinisikan environment
class Environment():
    def __init__(self):
        self.board = np.zeros((dimension, dimension))
        self.x = -1
        self.o = 1
        self.winner = None
        self.ended = False
        self.num_states = 3**(dimension*dimension)
    
    def empty(self, i, j):
        return self.board[i, j] == 0
    
    def reward(self, symbol):
        if not self.game_over():
            return 0
        
        return 1 if self.winner == symbol else 0

    def access_state(self):
        k = 0
        h = 0
        for i in range(dimension):
            for j in range(dimension):
                if self.board[i, j] == 0:
                    v = 0
                elif self.board[i, j] == self.x:
                    v = 1
                elif self.board[i, j] == self.o:
                    v = 2
                h += (3**k) * v
                k += 1
        return h

    def game_over(self, recalculation=False):
        if not recalculation and self.ended:
            return self.ended
        
        for i in range(dimension):
            for player in (self.x, self.o):
                if self.board[i].sum() == player*dimension:
                    self.winner = player
                    self.ended = True
                    return True
        
        for j in range(dimension):
            for player in (self.x, self.o):
                if self.board[:, j].sum() == player*dimension:
                    self.winner = player
                    self.ended = True
                    return True
                
        for player in (self.x, self.o):
            if self.board.trace() == player*dimension: #diagonal kiri atas ke kanan bawah
                self.winner = player
                self.ended = True
                return True
            if np.fliplr(self.board).trace() == player*dimension: #diagonal kanan atas ke kiri bawah
                self.winner = player
                self.ended = True
                return True
        
        if np.all((self.board == 0) == False): #draw
            self.winner = None
            self.ended = True
            return True
        
        self.winner = None
        return False
    
    def is_draw(self):
        return self.ended and self.winner is None
    
    # Contoh papan TTT
    # -------------
    # | X |   |   |
    # -------------
    # |   |   |   |
    # -------------
    # |   |   |   |
    # -------------
    
    def display_board(self):
        for i in range(dimension):
            print("------------")
            for j in range(dimension):
                print("  ", end="")
                if self.board[i, j] == self.x:
                    print("x ", end="")
                elif self.board[i, j] == self.o:
                    print("o ", end="")
                else:
                    print("  ", end="")
            print("")
        print("------------")
            

#Mendefinisikan agen
class Agent():
    def __init__(self, epsilon=0.1, alpha=0.5): #alpha = learning rate
        self.epsilon = epsilon
        self.alpha = alpha
        self.verbose = False
        self.state_history = []
        
    def set_V(self, V):
        self.V = V
        
    def set_symbol(self, symbol):
        self.symbol = symbol
        
    def set_verbose(self, verbose):
        self.verbose = verbose
        
    def reset_history(self):
        self.state_history = []
        
    def begin_action(self, env):
        r = np.random.rand()
        best_state = None
        if r < self.epsilon:
            if self.verbose:
                print("AI Memilih action secara random")
                
            possible_moves = []
            for i in range(dimension):
                for j in range(dimension):
                    if env.empty(i, j):
                        possible_moves.append((i, j))
            
            idx = np.random.choice(len(possible_moves))
            next_move = possible_moves[idx]
        else:
            pos2value = {}
            next_move = None
            best_value = -1
            for i in range(dimension):
                for j in range(dimension):
                    if env.empty(i, j):
                        env.board[i, j] = self.symbol
                        state = env.access_state()
                        env.board[i, j] = 0
                        pos2value[(i, j)] = self.V[state]
                        if self.V[state] > best_value:
                            best_value = self.V[state]
                            best_state = state
                            next_move = (i, j)
            if self.verbose: #mencetak informasi jika diperlukan
                print("AI Memilih action berdasarkan epsilon greedy")
                for i in range(dimension):
                    print("------------")
                    for j in range(dimension):
                        if env.empty(i, j):
                            print(" %.2f|" % pos2value[(i,j)], end="")
                        else:
                            print("  ", end="")
                            if env.board[i, j] == env.x:
                                print("x  |", end="")
                            elif env.board[i, j] == env.o:
                                print("o  |", end="")
                            else:
                                print("   |", end="")
                    print("")
                print("------------")
        env.board[next_move[0], next_move[1]] =self.symbol
        
    def update_state(self, s):
        self.state_history.append(s)
        
    def update(self, env):
        reward = env.reward(self.symbol)
        target = reward
        for prev in reversed(self.state_history):
            value = self.V[prev] + self.alpha*(target - self.V[prev])
            self.V[prev] = value
            target = value
        self.reset_history()
        
        
#Mendefinisikan human
class Human:
    def __init__(self):
        pass

    def set_symbol(self, symbol):
        self.symbol = symbol
    
    def begin_action(self, env):
        while True:
            move = input("Masukkan koordinat pergerakan selanjutnya (i, j = 0-2): ")
            i, j = move.split(",")
            i, j = int(i), int(j)
            if env.empty(i, j):
                env.board[i, j] = self.symbol
                break
                
    def update(self, env):
        pass

    def update_state(self, s):
        pass


def access_state_winner_end(env, i=0, j=0):
    results = []
    for v in (0, env.x, env.o):
        env.board[i, j] = v
        if j == 2:
            if i == 2:
                state = env.access_state()
                ended = env.game_over(recalculation = True)
                winner = env.winner
                results.append((state, winner, ended))
            else:
                results += access_state_winner_end(env, i+1, 0)
        else:
            results += access_state_winner_end(env, i, j+1)
    return results


def awal_V_x(env, state_winner_triples):
    V = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if ended:
            if winner == env.x:
                v = 1
            else:
                v = 0
        else:
            v = 0.5
        V[state] = v
    return V


def awal_V_o(env, state_winner_triples):
    V = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if ended:
            if winner == env.o:
                v = 1
            else:
                v = 0
        else:
            v = 0.5
        V[state] = v
    return V


def play(p1, p2, env, draw=False):
    current_player = None
    while not env.game_over():
        if current_player == p1:
            current_player = p2
        else:
            current_player = p1
        if draw:
            if draw == 1 and current_player == p1:
                env.display_board()
            if draw == 2 and current_player == p2:
                env.display_board()
        current_player.begin_action(env)
        state = env.access_state()
        p1.update_state(state)
        p2.update_state(state)
    if draw:
        env.display_board()
    p1.update(env)
    p2.update(env)
    
    
if __name__ == "__main__":
    p1 = Agent()
    p2 = Agent()
    
    env = Environment()
    state_winner_triples = access_state_winner_end(env)
    
    V_x = awal_V_x(env, state_winner_triples)
    p1.set_V(V_x)
    V_o = awal_V_o(env, state_winner_triples)
    p2.set_V(V_o)
    
    p1.set_symbol(env.x)
    p2.set_symbol(env.o)
    
    T = 300
    for t in range(T):
        print("Latihan AI vs AI ke-",t)
        play(p1, p2, Environment())
        
    human = Human()
    human.set_symbol(env.o)
    while True:
        p1.set_verbose(True)
        play(p1, human, Environment(), draw=2) #draw = 2 maka AI(p1) mulai dulu, draw = 1 maka human mulai dulu
        answer = input("Ingin bermain lagi? [y/n]: ")
        if answer and answer.lower()[0] == "n":
            break

#Jika kita ingi human mulai dulu
#    human = Human()
#    human.set_symbol(env.x)
#    while True:
#        p2.set_verbose(True)
#        play(human, p2, Environment(), draw=1)
#        answer = input("Ingin bermain lagi? [y/n]: ")
#        if answer and answer.lower()[0] == "n":
#            break

Latihan AI vs AI ke- 0
Latihan AI vs AI ke- 1
Latihan AI vs AI ke- 2
Latihan AI vs AI ke- 3
Latihan AI vs AI ke- 4
Latihan AI vs AI ke- 5
Latihan AI vs AI ke- 6
Latihan AI vs AI ke- 7
Latihan AI vs AI ke- 8
Latihan AI vs AI ke- 9
Latihan AI vs AI ke- 10
Latihan AI vs AI ke- 11
Latihan AI vs AI ke- 12
Latihan AI vs AI ke- 13
Latihan AI vs AI ke- 14
Latihan AI vs AI ke- 15
Latihan AI vs AI ke- 16
Latihan AI vs AI ke- 17
Latihan AI vs AI ke- 18
Latihan AI vs AI ke- 19
Latihan AI vs AI ke- 20
Latihan AI vs AI ke- 21
Latihan AI vs AI ke- 22
Latihan AI vs AI ke- 23
Latihan AI vs AI ke- 24
Latihan AI vs AI ke- 25
Latihan AI vs AI ke- 26
Latihan AI vs AI ke- 27
Latihan AI vs AI ke- 28
Latihan AI vs AI ke- 29
Latihan AI vs AI ke- 30
Latihan AI vs AI ke- 31
Latihan AI vs AI ke- 32
Latihan AI vs AI ke- 33
Latihan AI vs AI ke- 34
Latihan AI vs AI ke- 35
Latihan AI vs AI ke- 36
Latihan AI vs AI ke- 37
Latihan AI vs AI ke- 38
Latihan AI vs AI ke- 39
Latihan AI vs AI ke- 40
Latihan AI vs AI ke- 41
La

Masukkan koordinat pergerakan selanjutnya (i, j = 0-2): 0,1
AI Memilih action berdasarkan epsilon greedy
------------
  x  |  o  |  x  |
------------
 0.38|  o  |  x  |
------------
 0.25| 0.50|  o  |
------------
------------
  x   o   x 
------------
      o   x 
------------
      x   o 
------------
Masukkan koordinat pergerakan selanjutnya (i, j = 0-2): 2,0
AI Memilih action secara random
------------
  x   o   x 
------------
  x   o   x 
------------
  o   x   o 
------------
Ingin bermain lagi? [y/n]: y
AI Memilih action berdasarkan epsilon greedy
------------
 0.48| 0.44| 0.55|
------------
 0.46| 0.49| 0.48|
------------
 0.47| 0.50| 0.49|
------------
------------
          x 
------------
            
------------
            
------------
Masukkan koordinat pergerakan selanjutnya (i, j = 0-2): 0,0
AI Memilih action berdasarkan epsilon greedy
------------
  o  | 0.44|  x  |
------------
 0.45| 0.65| 0.38|
------------
 0.50| 0.50| 0.50|
------------
------------
  o       x

IndexError: index 3 is out of bounds for axis 0 with size 3