Germ Game
---
Two players compete against each other on a 7×7 board.

In [1]:
import numpy as np
import pickle
import sys
import random
from copy import deepcopy

In [2]:
# Board geometry: a square grid with the same number of rows and columns.
BOARD_ROWS = BOARD_COLS = 7
# Sentinel used as +/- infinity when the move search starts its min/max scans.
INF = 1e9

### Board State
---
Represent the board state and judge when the game is over.

There are two players, p1 and p2: p1 uses symbol 1, p2 uses symbol 2, and an empty cell is 0.

In [17]:
class State:
    def __init__(self, p1, p2):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board[0, 0] = self.board[BOARD_ROWS-1, BOARD_COLS-1] = 1
        self.board[BOARD_ROWS-1, 0] = self.board[0, BOARD_COLS-1] = 2
        self.p1 = p1
        self.p2 = p2
        self.turn = 0
        self.isEnd = False
        self.boardHash = None
        # init p1 plays first
        self.playerSymbol = 1
    
    # get unique hash of current board state
    def getHash(self, position):
        next_board = deepcopy(self.board)
        p = position.copy()
        
        if p[2]-p[0]<0:
            for i in range(3):
                for j in range(3):
                    next_board[i][j],next_board[6-i][j] = next_board[6-i][j],next_board[i][j]
            p[0]=6-p[0]
            p[2]=6-p[2]

        if p[3]-p[1]<0:
            for i in range(7):
                for j in range(7):
                    next_board[i][j],next_board[i][6-j] = next_board[i][6-j],next_board[i][j]
            p[1]=6-p[1]
            p[3]=6-p[3]

        if p[2]-p[0]<p[3]-p[1]:
            for i in range(7):
                for j in range(i+1,7):
                    next_board[i][j],next_board[j][i] = next_board[j][i],next_board[i][j]
            p[0],p[1] = p[1],p[0]
            p[2],p[3] = p[3],p[2]
            
        s1 = s2 = s3 = 0
        e2 = e3 = 0
        shape = 0

        for dx in range(-3,4):
            for dy in range(-3,4):
                if not dx and not dy:
                    continue
                if p[0]+dx<0 or p[0]+dx>=BOARD_ROWS or p[1]+dy<0 or p[1]+dy>=BOARD_COLS:
                    continue

                if next_board[p[0]+dx][p[1]+dy]!=3-self.playerSymbol:
                    continue
                if max(abs(dx),abs(dy))==1:
                    s1=1
                if max(abs(dx),abs(dy))==2:
                    s2=1
                if max(abs(dx),abs(dy))==3:
                    s3=1

        for dx in range(-3,4):
            for dy in range(-3,4):
                if abs(dx)<=1 and abs(dy)<=1:
                    continue
                if p[2]+dx<0 or p[2]+dx>=BOARD_ROWS or p[3]+dy<0 or p[3]+dy>=BOARD_COLS:
                    continue

                if next_board[p[2]+dx][p[3]+dy]!=3-self.playerSymbol:
                    continue
                if max(abs(dx),abs(dy))==2:
                    e2=1
                if max(abs(dx),abs(dy))==3:
                    e3=1

        for dx in range(-1,2):
            for dy in range(-1,2):
                if not dx and not dy:
                    continue

                shape = shape<<1
                if p[2]+dx<0 or p[2]+dx>=BOARD_ROWS or p[3]+dy<0 or p[3]+dy>=BOARD_COLS:
                    shape = shape|1
                elif next_board[p[2]+dx][p[3]+dy]:
                    shape = shape|1

        # print(typ,s1,s2,s3,e2,e3,shape)

        if(p[3]-p[1]<2):
            return ((p[3]-p[1])<<12)+((s1|s2)<<11)+(s3<<10)+(e2<<9)+(e3<<8)+shape+1
        return ((p[3]-p[1])<<12)+(s1<<11)+(s2<<10)+(e2<<9)+(e3<<8)+shape+1
    
    def cantmove(self):
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                if self.board[i, j] == 0:
                    self.board[i, j] = 3 - self.playerSymbol
        return None
    
    def winner(self):
        # end
        if sum(map(sum,list(map(lambda row: list(map(lambda x: abs(2*x-3) if x!=0 else 0, row)), self.board)))) == BOARD_ROWS*BOARD_COLS or self.turn>=256:
            self.isEnd = True
            if sum(map(sum,list(map(lambda row: list(map(lambda x: 2*x-3 if x!=0 else 0, row)), self.board)))) < 0:
                return 1
            else:
                return 2
        # not end
        self.isEnd = False
        return None
    
    def next_board(self, A, p, player):
        AA = deepcopy(A)
        if p[0] == -1:
            for x in range(BOARD_ROWS):
                for y in range(BOARD_COLS):
                    if AA[x][y] == 0:
                        AA[x][y] = 3 - player
            return AA

        AA[p[2]][p[3]] = player
        if max(abs(p[2] - p[0]), abs(p[3] - p[1])) == 2:
            AA[p[0]][p[1]] = 0
        for x in range(max(p[2] - 1, 0), min(p[2] + 2, BOARD_ROWS)):
            for y in range(max(p[3] - 1, 0), min(p[3] + 2, BOARD_COLS)):
                if AA[x][y] == 3 - player:
                    AA[x][y] = player
        return AA


    def score(self, A, player):
        scr = 0
        for x in range(BOARD_ROWS):
            for y in range(BOARD_COLS):
                if A[x][y] == player:
                    scr += 1
                elif A[x][y] == 3 - player:
                    scr -= 1
        return scr


    def all_move(self, A, player):
        plist = []
        for x1 in range(BOARD_ROWS):
            for y1 in range(BOARD_COLS):
                if A[x1][y1] != player:
                    continue
                for x2 in range(max(x1 - 2, 0), min(x1 + 3, BOARD_ROWS)):
                    for y2 in range(max(y1 - 2, 0), min(y1 + 3, BOARD_COLS)):
                        if A[x2][y2] == 0:
                            plist.append([x1, y1, x2, y2])
        return plist


    def rand_move(self, player):
        plist = self.all_move(self.board, player)
        if not plist:
            return [-1, -1, -1, -1]
        return random.choice(plist)


    def get_phase(self, player):
        cnt = 0
        for x in range(BOARD_ROWS):
            for y in range(BOARD_COLS):
                if self.board[x][y] != 0:
                    cnt += 1
        if cnt >= BOARD_ROWS * BOARD_COLS - 10:
            return 3

        for x1 in range(BOARD_ROWS):
            for y1 in range(BOARD_COLS):
                if self.board[x1][y1] != player:
                    continue
                for x2 in range(max(x1 - 3, 0), min(x1 + 4, BOARD_ROWS)):
                    for y2 in range(max(y1 - 3, 0), min(y1 + 4, BOARD_COLS)):
                        if self.board[x2][y2] == 3 - player:
                            return 2
        return 1


    def phase1(self, player):
        cntu = cntd = cntl = cntr = 0
        for x in range(BOARD_ROWS):
            for y in range(BOARD_COLS):
                if self.board[x][y] != 3 - player:
                    continue
                if x >= y:
                    if 6-x >= y:
                        cntl += 1
                    if 6-x <= y:
                        cntd += 1
                if x <= y:
                    if 6-x >= y:
                        cntu += 1
                    if 6-x <= y:
                        cntr += 1

        p = []
        mx = max(max(cntu, cntd), max(cntl, cntr))
        if player == 1:
            if mx == cntu:
                if self.board[1][1] == 0:
                    p = [0, 0, 1, 1]
                elif self.board[2][2] == 0:
                    p = [1, 1, 2, 2]
                elif self.board[1][2] == 0:
                    p = [2, 2, 1, 2]
                elif self.board[2][1] == 0:
                    p = [2, 2, 2, 1]
                elif self.board[5][5] == 0:
                    p = [6, 6, 5, 5]
                elif self.board[4][4] == 0:
                    p = [5, 5, 4, 4]
                elif self.board[4][5] == 0:
                    p = [4, 4, 4, 5]
                elif self.board[5][4] == 0:
                    p = [4, 4, 5, 4]
                else:
                    p = self.rand_move(player)
            elif mx == cntd:
                if self.board[5][5] == 0:
                    p = [6, 6, 5, 5]
                elif self.board[4][4] == 0:
                    p = [5, 5, 4, 4]
                elif self.board[5][4] == 0:
                    p = [4, 4, 5, 4]
                elif self.board[4][5] == 0:
                    p = [4, 4, 4, 5]
                elif self.board[1][1] == 0:
                    p = [0, 0, 1, 1]
                elif self.board[2][2] == 0:
                    p = [1, 1, 2, 2]
                elif self.board[2][1] == 0:
                    p = [2, 2, 2, 1]
                elif self.board[1][2] == 0:
                    p = [2, 2, 1, 2]
                else:
                    p = self.rand_move(player)
            elif mx == cntl:
                if self.board[1][1] == 0:
                    p = [0, 0, 1, 1]
                elif self.board[2][2] == 0:
                    p = [1, 1, 2, 2]
                elif self.board[2][1] == 0:
                    p = [2, 2, 2, 1]
                elif self.board[1][2] == 0:
                    p = [2, 2, 1, 2]
                elif self.board[5][5] == 0:
                    p = [6, 6, 5, 5]
                elif self.board[4][4] == 0:
                    p = [5, 5, 4, 4]
                elif self.board[5][4] == 0:
                    p = [4, 4, 5, 4]
                elif self.board[4][5] == 0:
                    p = [4, 4, 4, 5]
                else:
                    p = self.rand_move(player)
            elif mx == cntr:
                if self.board[5][5] == 0:
                    p = [6, 6, 5, 5]
                elif self.board[4][4] == 0:
                    p = [5, 5, 4, 4]
                elif self.board[4][5] == 0:
                    p = [4, 4, 4, 5]
                elif self.board[5][4] == 0:
                    p = [4, 4, 5, 4]
                elif self.board[1][1] == 0:
                    p = [0, 0, 1, 1]
                elif self.board[2][2] == 0:
                    p = [1, 1, 2, 2]
                elif self.board[1][2] == 0:
                    p = [2, 2, 1, 2]
                elif self.board[2][1] == 0:
                    p = [2, 2, 2, 1]
                else:
                    p = self.rand_move(player)
        else:
            if mx == cntu:
                if self.board[1][5] == 0:
                    p = [0, 6, 1, 5]
                elif self.board[2][4] == 0:
                    p = [1, 5, 2, 4]
                elif self.board[1][4] == 0:
                    p = [2, 4, 1, 4]
                elif self.board[2][5] == 0:
                    p = [2, 4, 2, 5]
                elif self.board[5][1] == 0:
                    p = [6, 0, 5, 1]
                elif self.board[4][2] == 0:
                    p = [5, 1, 4, 2]
                elif self.board[4][1] == 0:
                    p = [4, 2, 4, 1]
                elif self.board[5][2] == 0:
                    p = [4, 2, 5, 2]
                else:
                    p = self.rand_move(player)
            elif mx == cntd:
                if self.board[5][1] == 0:
                    p = [6, 0, 5, 1]
                elif self.board[4][2] == 0:
                    p = [5, 1, 4, 2]
                elif self.board[5][2] == 0:
                    p = [4, 2, 5, 2]
                elif self.board[4][1] == 0:
                    p = [4, 2, 4, 1]
                elif self.board[1][5] == 0:
                    p = [0, 6, 1, 5]
                elif self.board[2][4] == 0:
                    p = [1, 5, 2, 4]
                elif self.board[2][5] == 0:
                    p = [2, 4, 2, 5]
                elif self.board[1][4] == 0:
                    p = [2, 4, 1, 4]
                else:
                    p = self.rand_move(player)
            elif mx == cntl:
                if self.board[5][1] == 0:
                    p = [6, 0, 5, 1]
                elif self.board[4][2] == 0:
                    p = [5, 1, 4, 2]
                elif self.board[4][1] == 0:
                    p = [4, 2, 4, 1]
                elif self.board[5][2] == 0:
                    p = [4, 2, 5, 2]
                elif self.board[1][5] == 0:
                    p = [0, 6, 1, 5]
                elif self.board[2][4] == 0:
                    p = [1, 5, 2, 4]
                elif self.board[1][4] == 0:
                    p = [2, 4, 1, 4]
                elif self.board[2][5] == 0:
                    p = [2, 4, 2, 5]
                else:
                    p = self.rand_move(player)
            elif mx == cntr:
                if self.board[1][5] == 0:
                    p = [0, 6, 1, 5]
                elif self.board[2][4] == 0:
                    p = [1, 5, 2, 4]
                elif self.board[2][5] == 0:
                    p = [2, 4, 2, 5]
                elif self.board[1][4] == 0:
                    p = [2, 4, 1, 4]
                elif self.board[5][1] == 0:
                    p = [6, 0, 5, 1]
                elif self.board[4][2] == 0:
                    p = [5, 1, 4, 2]
                elif self.board[5][2] == 0:
                    p = [4, 2, 5, 2]
                elif self.board[4][1] == 0:
                    p = [4, 2, 4, 1]
                else:
                    p = self.rand_move(player)
        p = [p]
        return p

    def phase23(self, A, dep, player):
        if dep == 0 :
            return [ [[-1, -1, -1, -1]], self.score(A, player)]

        mx1 = -INF
        mxp1 = []
        plist1 = self.all_move(A, player)
        if not plist1:
            p1 = [-1, -1, -1, -1]
            AA = self.next_board(A, p1, player)
            p1 = [[p1]]
            p1.append(self.score(AA, player))
            return p1

        for p1 in plist1:
            AA = self.next_board(A, p1, player)
            mn2 = INF
            plist2 = self.all_move(AA, 3 - player)
            if not plist2:
                p2 = [-1, -1, -1, -1]
                AAA = self.next_board(AA, p2, 3 - player)
                mn2 = self.score(AAA, player)

            for p2 in plist2:
                AAA = self.next_board(AA, p2, 3 - player)
                res = self.phase23(AAA, dep - 1, player)
                mn2 = min(mn2, res[1])
            if mx1 < mn2:
                mx1 = mn2
                mxp1 = [p1]
            elif mx1 == mn2:
                mxp1.append(p1)
        
        mxp1 = [mxp1]
        mxp1.append(mx1)
        return mxp1

    def availablePositions(self):
        idx = self.get_phase(self.playerSymbol)
        positions = []
        if idx == 1:
            positions = self.phase1(self.playerSymbol)
        elif idx == 2:
            positions = self.phase23(self.board, 1, self.playerSymbol)[0]
        elif idx == 3:
            positions = self.phase23(self.board, 1, self.playerSymbol)[0]

        if positions[0][0] == -1:
            positions = []
        
        return positions
    
    def updateState(self, position):
        ii = position[2] - position[0]
        jj = position[3] - position[1]
        if max(abs(ii), abs(jj)) == 2:
            self.board[position[0], position[1]] = 0
        
        dx1 = [-1, -1, -1, 0, 0, 1, 1, 1]
        dy1 = [-1, 0, 1, -1, 1, -1, 0, 1]
        i, j = position[2:4]
        self.board[i, j] = self.playerSymbol
        for ii, jj in zip(dx1, dy1):
            if i + ii < 0 or i + ii >= BOARD_ROWS or j + jj < 0 or j + jj >= BOARD_COLS:
                continue
            if self.board[i + ii, j + jj] == 3 - self.playerSymbol:
                self.board[i + ii, j + jj] = self.playerSymbol
            
        # switch to another player
        self.playerSymbol = 3 - self.playerSymbol
        self.turn = self.turn + 1
    
    # only when game ends
    def giveReward(self):
        result = self.winner()
        # backpropagate reward
        """self.p1.feedReward(-result)
        self.p2.feedReward(result)"""
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        else:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
    
    # board reset
    def reset(self):
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.board[0, 0] = self.board[BOARD_ROWS-1, BOARD_COLS-1] = 1
        self.board[BOARD_ROWS-1, 0] = self.board[0, BOARD_COLS-1] = 2
        self.turn = 0
        self.isEnd = False
        self.boardHash = None
        self.playerSymbol = 1
    
    def play(self, rounds=100):
        for i in range(rounds):
            if i%10 == 0:
                print("Rounds {}".format(i))
            while not self.isEnd:
                # Player 1
                positions = self.availablePositions()
                if not positions:
                    self.cantmove()
                else:
                    p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
                    # take action and upate board state
                    self.updateState(p1_action)
                    # print("yay")
                    # print(positions)
                    # print(self.playerSymbol)
                    # print(p1_action)
                # self.showBoard()
                board_hash = self.getHash(p1_action)
                self.p1.addState(board_hash)
                # check board status if it is end
                win = self.winner()
                if win is not None:
                    # self.showBoard()
                    # ended with p1 either win or draw
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    break

                else:
                    # Player 2
                    positions = self.availablePositions()
                    if not positions:
                        self.cantmove()
                    else:
                        p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
                        self.updateState(p2_action)
                        # print("aya")
                        # print(positions)
                        # print(self.playerSymbol)
                        # print(p2_action)
                    # self.showBoard()
                    board_hash = self.getHash(p2_action)
                    self.p2.addState(board_hash)
                    
                    win = self.winner()
                    if win is not None:
                        # self.showBoard()
                        # ended with p2 either win or draw
                        self.giveReward()
                        self.p1.reset()
                        self.p2.reset()
                        self.reset()
                        break
    
    # play with human
    def play2(self):
        while not self.isEnd:
            # Player 1
            positions = self.availablePositions()
            if not positions:
                self.cantmove()
            else:
                # human
                # p1_action = self.p1.chooseAction(positions)
                # computer
                p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
                # take action and upate board state
                self.updateState(p1_action)
            self.showBoard()
            # check board status if it is end
            win = self.winner()
            if win is not None:
                if win == 1:
                    print(self.p1.name, "wins!")
                else:
                    print(self.p2.name, "wins!")
                self.reset()
                break

            else:
                # Player 2
                positions = self.availablePositions()
                if not positions:
                    self.cantmove()
                else:
                    # human
                    # p2_action = self.p2.chooseAction(positions)
                    # computer
                    p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
                    # take action and upate board state
                    self.updateState(p2_action)
                self.showBoard()
                win = self.winner()
                if win is not None:
                    if win == 1:
                        print(self.p1.name, "wins!")
                    else:
                        print(self.p2.name, "wins!")
                    self.reset()
                    break
    
    # play `rounds` evaluation games between p1 and p2 and report the win tally
    def play3(self, rounds=100):
        win1 = win2 = 0
        for i in range(rounds):
            if i%10 == 0:
                print("Rounds {} : {} vs {}".format(i, win1, win2))
            while not self.isEnd:
                # Player 1
                positions = self.availablePositions()
                if not positions:
                    self.cantmove()
                else:
                    p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
                    # take action and upate board state
                    self.updateState(p1_action)
                # self.showBoard()
                board_hash = self.getHash(p1_action)
                self.p1.addState(board_hash)
                # check board status if it is end
                win = self.winner()
                if win is not None:
                    # self.showBoard()
                    # ended with p1 either win or draw
                    if win == 1:
                        win1 = win1 + 1
                        # print(self.p1.name, "wins!")
                    else:
                        win2 = win2 + 1
                        # print(self.p2.name, "wins!")
                    self.reset()
                    break

                else:
                    # Player 2
                    positions = self.availablePositions()
                    if not positions:
                        self.cantmove()
                    else:
                        p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
                        self.updateState(p2_action)
                    # self.showBoard()
                    board_hash = self.getHash(p2_action)
                    self.p2.addState(board_hash)
                    
                    win = self.winner()
                    if win is not None:
                        # self.showBoard()
                        # ended with p2 either win or draw
                        if win == 1:
                            win1 = win1 + 1
                            # print(self.p1.name, "wins!")
                        else:
                            win2 = win2 + 1
                            # print(self.p2.name, "wins!")
                        self.reset()
                        break
        print(win1, " vs ", win2)

    def showBoard(self):
        # p1: x  p2: o
        for i in range(0, BOARD_ROWS):
            print('------------------------------')
            out = '| '
            for j in range(0, BOARD_COLS):
                if self.board[i, j] == 1:
                    token = 'x'
                if self.board[i, j] == 2:
                    token = 'o'
                if self.board[i, j] == 0:
                    token = ' '
                out += token + ' | '
            print(out)
        print('------------------------------')    

In [18]:
class Player:
    def __init__(self, name, exp_rate=0.1):
        self.name = name
        self.states = []  # record all positions taken
        self.lr = 0.2
        self.exp_rate = exp_rate
        self.decay_gamma = 0.95
        self.states_value = {}  # state -> value
    
    def getHash(self, current_board, position, symbol):
        next_board = deepcopy(current_board)
        p = position.copy()
        
        if p[2]-p[0]<0:
            for i in range(3):
                for j in range(3):
                    next_board[i][j],next_board[6-i][j] = next_board[6-i][j],next_board[i][j]
            p[0]=6-p[0]
            p[2]=6-p[2]

        if p[3]-p[1]<0:
            for i in range(7):
                for j in range(7):
                    next_board[i][j],next_board[i][6-j] = next_board[i][6-j],next_board[i][j]
            p[1]=6-p[1]
            p[3]=6-p[3]

        if p[2]-p[0]<p[3]-p[1]:
            for i in range(7):
                for j in range(i+1,7):
                    next_board[i][j],next_board[j][i] = next_board[j][i],next_board[i][j]
            p[0],p[1] = p[1],p[0]
            p[2],p[3] = p[3],p[2]
            
        s1 = s2 = s3 = 0
        e2 = e3 = 0
        shape = 0

        for dx in range(-3,4):
            for dy in range(-3,4):
                if not dx and not dy:
                    continue
                if p[0]+dx<0 or p[0]+dx>=BOARD_ROWS or p[1]+dy<0 or p[1]+dy>=BOARD_COLS:
                    continue

                if next_board[p[0]+dx][p[1]+dy]!=3-symbol:
                    continue
                if max(abs(dx),abs(dy))==1:
                    s1=1
                if max(abs(dx),abs(dy))==2:
                    s2=1
                if max(abs(dx),abs(dy))==3:
                    s3=1

        for dx in range(-3,4):
            for dy in range(-3,4):
                if abs(dx)<=1 and abs(dy)<=1:
                    continue
                if p[2]+dx<0 or p[2]+dx>=BOARD_ROWS or p[3]+dy<0 or p[3]+dy>=BOARD_COLS:
                    continue

                if next_board[p[2]+dx][p[3]+dy]!=3-symbol:
                    continue
                if max(abs(dx),abs(dy))==2:
                    e2=1
                if max(abs(dx),abs(dy))==3:
                    e3=1

        for dx in range(-1,2):
            for dy in range(-1,2):
                if not dx and not dy:
                    continue

                shape = shape<<1
                if p[2]+dx<0 or p[2]+dx>=BOARD_ROWS or p[3]+dy<0 or p[3]+dy>=BOARD_COLS:
                    shape = shape|1
                elif next_board[p[2]+dx][p[3]+dy]:
                    shape = shape|1

        # print(typ,s1,s2,s3,e2,e3,shape)

        if(p[3]-p[1]<2):
            return ((p[3]-p[1])<<12)+((s1|s2)<<11)+(s3<<10)+(e2<<9)+(e3<<8)+shape+1
        return ((p[3]-p[1])<<12)+(s1<<11)+(s2<<10)+(e2<<9)+(e3<<8)+shape+1
    
    def chooseAction(self, positions, current_board, symbol):
        if np.random.uniform(0, 1) <= self.exp_rate:
            # take random action
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            value_max = -999
            for p in positions:
                # print("p", p)
                boardHash = self.getHash(current_board, p, symbol)
                # print("boardHash", boardHash, self.states_value.get(boardHash))
                value = 0 if self.states_value.get(boardHash) is None else self.states_value.get(boardHash)
                # print("value", value, value_max)
                if value >= value_max:
                    value_max = value
                    action = p
        # print("{} takes action {}".format(self.name, action))
                    # print("action", action)
        return action
    
    # append a hash state
    def addState(self, state):
        self.states.append(state)
    
    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr*(self.decay_gamma*reward - self.states_value[st])
            reward = self.states_value[st]
            
    def reset(self):
        self.states = []
        
    def savePolicy(self, rounds):
        fw = open('policy3_legr_' + str(self.lr) + '_' + str(self.exp_rate) + '_' + str(self.decay_gamma) + '_' + str(rounds) + '_' + str(self.name), 'wb')
        pickle.dump(self.states_value, fw)
        fw.close()

    def loadPolicy(self, file):
        fr = open(file,'rb')
        self.states_value = pickle.load(fr)
        fr.close()

In [19]:
class HumanPlayer:
    """Interactive player that reads its moves from standard input.

    It offers the methods the game loop calls on a player; the learning hooks
    (``addState``, ``feedReward``, ``reset``) do nothing.
    """

    def __init__(self, name):
        self.name = name

    def chooseAction(self, positions, current_board=None, symbol=None):
        """Ask the user for a move until one of ``positions`` is entered.

        Args:
            positions: the moves on offer, each ``[r1, c1, r2, c2]``.
            current_board: accepted so the call matches the one made by the
                game loop; not used.
            symbol: accepted for the same reason; not used.

        Returns:
            The chosen move as a list ``[r1, c1, r2, c2]``.
        """
        while True:
            try:
                row1 = int(input("Input your action row1:"))
                col1 = int(input("Input your action col1:"))
                row2 = int(input("Input your action row2:"))
                col2 = int(input("Input your action col2:"))
            except ValueError:
                print("Please enter whole numbers.")
                continue

            # Moves are offered as lists, so the entered move is compared as a
            # list; a tuple would never compare equal to any of them.
            action = [row1, col1, row2, col2]
            for candidate in positions:
                if list(candidate) == action:
                    return action
            print("That move is not available, try again.")

    # append a hash state
    def addState(self, state):
        pass

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        pass

    def reset(self):
        pass

### Training

In [6]:
# Two fresh learning agents with the default exploration rate.
p1, p2 = Player("p1"), Player("p2")

# The State object referees the games played between them.
st = State(p1, p2)

In [15]:
# Training: p1 and p2 play each other through State.play.
print("training...")
st.play(90)

# NOTE(review): the label passed to savePolicy (100) differs from the number of
# games played above (90). savePolicy only uses it in the file name, and later
# cells load files carrying the "_100_" label - confirm which number is intended.
p1.savePolicy(100)
p2.savePolicy(100)

training...
Rounds 0
Rounds 1
Rounds 2
Rounds 3
Rounds 4
Rounds 5
Rounds 6
Rounds 7
Rounds 8
Rounds 9
Rounds 10
Rounds 11
Rounds 12
Rounds 13
Rounds 14
Rounds 15
Rounds 16
Rounds 17
Rounds 18
Rounds 19
Rounds 20
Rounds 21
Rounds 22
Rounds 23
Rounds 24
Rounds 25
Rounds 26
Rounds 27
Rounds 28
Rounds 29
Rounds 30
Rounds 31
Rounds 32
Rounds 33
Rounds 34
Rounds 35
Rounds 36
Rounds 37
Rounds 38
Rounds 39
Rounds 40
Rounds 41
Rounds 42
Rounds 43
Rounds 44
Rounds 45
Rounds 46
Rounds 47
Rounds 48
Rounds 49
Rounds 50
Rounds 51
Rounds 52
Rounds 53
Rounds 54
Rounds 55
Rounds 56
Rounds 57
Rounds 58
Rounds 59
Rounds 60
Rounds 61
Rounds 62
Rounds 63
Rounds 64
Rounds 65
Rounds 66
Rounds 67
Rounds 68
Rounds 69
Rounds 70
Rounds 71
Rounds 72
Rounds 73
Rounds 74
Rounds 75
Rounds 76
Rounds 77
Rounds 78
Rounds 79
Rounds 80
Rounds 81
Rounds 82
Rounds 83
Rounds 84
Rounds 85
Rounds 86
Rounds 87
Rounds 88
Rounds 89


In [12]:
# Restore saved value tables into the existing p1 / p2 agents. The file names
# follow savePolicy's pattern:
# policy3_legr_<lr>_<exp_rate>_<decay_gamma>_<rounds>_<name>
p1.loadPolicy("policy3_legr_0.2_0.1_0.95_100_p1")
p2.loadPolicy("policy3_legr_0.2_0.1_0.95_100_p2")

### Computer vs Computer

In [22]:
# Evaluation: two agents with a lower exploration rate (0.05), each loading a
# saved value table.
# NOTE(review): p1 loads a file labelled "_10_", whereas the training cell in
# this notebook saves files labelled "_100_" - confirm that file exists and
# that comparing the two policies is intended.
p1 = Player("computer1", exp_rate=0.05)
p1.loadPolicy("policy3_legr_0.2_0.1_0.95_10_p1")
p2 = Player("computer2", exp_rate=0.05)
p2.loadPolicy("policy3_legr_0.2_0.1_0.95_100_p2")

# Alternative: interactive players reading their moves from standard input.
# p1 = HumanPlayer("human1")
# p2 = HumanPlayer("human2")

st = State(p1, p2)
# st.play2() would play one game and print the board after every turn;
# play3 plays 100 games and prints the win tally.
# st.play2()
st.play3(100)

Rounds 0 : 0 vs 0
Rounds 10 : 7 vs 3
Rounds 20 : 15 vs 5
Rounds 30 : 20 vs 10
Rounds 40 : 27 vs 13
Rounds 50 : 35 vs 15
Rounds 60 : 43 vs 17
Rounds 70 : 46 vs 24
Rounds 80 : 52 vs 28
Rounds 90 : 59 vs 31
63  vs  37
