## Regret Matching Minimization

In [106]:
from random import random
import numpy as np
from time import time

In [107]:
class RPS:
    """Constants describing the Rock-Paper-Scissors game."""

    # Number of distinct actions a player can choose from
    # (0: Rock, 1: Paper, 2: Scissors).
    n_actions: int = 3

In [120]:
class Player:
    """A Rock-Paper-Scissors player that learns through regret matching.

    Actions are encoded as integers: 0 = Rock, 1 = Paper, 2 = Scissors.

    Attributes:
        name: Label identifying the player.
        strategy: Mixed strategy that getAction() samples from by default.
        avg_strategy: Normalized ``strategy_sum`` (see getAverageStrategy()).
        strategy_sum: Running sum of every strategy built by getStrategy().
        regret_sum: Cumulative regret of each action.
    """

    def __init__(self, name):
        self.name = name
        # Start from the uniform distribution rather than all zeros: with a
        # zero vector, getAction() could never pass its cumulative-probability
        # test and would always return the last action.
        self.strategy = np.repeat(1 / RPS.n_actions, RPS.n_actions)
        self.avg_strategy = np.repeat(1 / RPS.n_actions, RPS.n_actions)
        self.strategy_sum = np.zeros(RPS.n_actions)
        self.regret_sum = np.zeros(RPS.n_actions)

    def getStrategy(self):
        """Build the current mixed strategy through regret matching.

        Each action gets a probability proportional to its positive
        cumulative regret; when no action has positive regret the strategy
        is uniform. The result is stored in ``self.strategy`` and added to
        ``self.strategy_sum``.

        Returns:
            The updated ``self.strategy`` array.
        """
        # Negative regrets carry no weight in regret matching.
        positive_regret = np.maximum(self.regret_sum, 0)
        total = np.sum(positive_regret)
        if total > 0:
            self.strategy = positive_regret / total
        else:
            self.strategy = np.repeat(1 / RPS.n_actions, RPS.n_actions)
        self.strategy_sum += self.strategy
        return self.strategy

    def getAverageStrategy(self):
        """Compute the average mixed strategy over all getStrategy() calls.

        The result is stored in ``self.avg_strategy``; it is uniform while
        ``strategy_sum`` is still all zeros.

        Returns:
            The updated ``self.avg_strategy`` array.
        """
        total = np.sum(self.strategy_sum)
        if total > 0:
            self.avg_strategy = self.strategy_sum / total
        else:
            self.avg_strategy = np.repeat(1 / RPS.n_actions, RPS.n_actions)
        return self.avg_strategy

    def getAction(self, strategy=None):
        """Sample a random action from a mixed-strategy distribution.

        Args:
            strategy: Optional sequence of action probabilities to sample
                from. Defaults to ``self.strategy``.

        Returns:
            The index of the sampled action.
        """
        probabilities = self.strategy if strategy is None else strategy
        r = random()
        last_action = len(probabilities) - 1
        cumulative_proba = 0
        # Walk the cumulative distribution; the last action absorbs whatever
        # probability mass remains, so it needs no explicit test.
        for action in range(last_action):
            cumulative_proba += probabilities[action]
            if cumulative_proba > r:
                return action
        return last_action

    def regret(self, my_action, opp_action):
        """Accumulate the regret of one played round into ``regret_sum``.

        Args:
            my_action: Index of the action this player played.
            opp_action: Index of the action the opponent played.
        """
        n = RPS.n_actions
        # Utility of each of our actions against opp_action: 0 for a tie,
        # +1 for the next action in the cycle (it beats opp_action) and
        # -1 for the previous one (it loses to opp_action).
        action_utility = np.zeros(n)
        action_utility[(opp_action + 1) % n] = 1
        action_utility[(opp_action - 1) % n] = -1
        # Regret = what each action would have earned minus what we earned.
        self.regret_sum += action_utility - action_utility[my_action]

In [121]:
# p1 = Player("Nicho")
# print(p1.getAction())
# p1.getStrategy()
# p1.regret(2,0)
# print(p1.regret_sum)
# p1.getStrategy()

In [122]:
class Two_Player_Game:
    """Self-play training of two regret-matching Rock-Paper-Scissors players."""

    def __init__(self, max_iter=10000):
        """Create the two players.

        Args:
            max_iter: Number of rounds played by each call to train().
        """
        self.p1 = Player("Player1")
        self.p2 = Player("Player2")
        self.max_iter = max_iter

    def train(self, avg_regret_matching=False):
        """Play ``max_iter`` rounds, updating both players' regrets.

        This is a two-player adaptation of the usual single-learner
        regret-matching training loop: both players learn simultaneously.

        Args:
            avg_regret_matching: When False, each player samples its action
                from the current regret-matching strategy. When True, each
                player samples from its average strategy instead.

        Returns:
            A dict counting the outcomes, keyed by ``self.p1``, ``self.p2``
            and ``'Draw'``. The same statistics are printed.
        """
        # Tally of how many rounds each player won.
        num_wins = {
            self.p1: 0,
            self.p2: 0,
            'Draw': 0
        }
        players = (self.p1, self.p2)
        for _ in range(self.max_iter):
            # Step 1: Get regret-matching mixed-strategy actions.
            for player in players:
                # Always run regret matching so strategy_sum keeps growing;
                # otherwise the average strategy would never change.
                player.getStrategy()
                if avg_regret_matching:
                    player.getAverageStrategy()
                    # getAction() samples from `strategy`, so point it at a
                    # copy of the freshly computed average for this round.
                    player.strategy = player.avg_strategy.copy()
            a1 = self.p1.getAction()
            a2 = self.p2.getAction()
            # Step 2&3: Compute action utilities.
            # Player.regret() accumulates the action regrets internally.
            self.p1.regret(a1, a2)
            self.p2.regret(a2, a1)

            num_wins[self.winner(a1, a2)] += 1

        print("Statistics over {} runs:\n".format(self.max_iter))
        print("Player1 wins {} times".format(num_wins[self.p1]))
        print("Player2 wins {} times".format(num_wins[self.p2]))
        print("Draw {} times".format(num_wins['Draw']))
        return num_wins

    def winner(self, a1, a2):
        """Return the winner of one round.

        Args:
            a1: Action played by ``self.p1``.
            a2: Action played by ``self.p2``.

        Returns:
            ``'Draw'`` when both actions are equal, otherwise the winning
            player (``self.p1`` or ``self.p2``).
        """
        if a1 == a2:
            return 'Draw'
        # Player 1 wins with Paper over Rock (1, 0), Scissors over Paper
        # (2, 1) or Rock over Scissors (0, 2).
        if a1 - a2 in (1, -2):
            return self.p1
        return self.p2

In [None]:
# Train the same pair of players twice: first sampling from the plain
# regret-matching strategy, then with the averaged-strategy flag enabled,
# timing each run.
game = Two_Player_Game(max_iter=10000)
for banner, use_average in (
    ("==== With simple regret-matching strategy === ", False),
    ("==== With averaged regret-matching strategy === ", True),
):
    print(banner)
    t0 = time()
    game.train(avg_regret_matching=use_average)
    elapsed = time() - t0
    print("----------------------------------------------\n Done in {0:.3} s".format(elapsed))