# Nash Q learning basic implementation

In [2]:
import numpy as np
import nashpy as nash

In [3]:
Q = 4#Number of games
N = 2 #Number of players
A = 2 #Number of actions per player

In [4]:
# Player1 action / Player2 action / starting state / ending state / probability
#Player 1 - column player - 0: .3, 1: .4
#Player 2 - row player - 0: .1, 1: .2
TRANSITION_MATRIX = np.array(
    [
        # Player 1 - Action 0
        [
            # Player 2 - Action 0
            [
                
                [0, 0.5, 0, 0.5],
                [0, 1, 0, 0],
                [0, 0, 1, 0],
                [0.5, 0, 0, 0.5]
                
            ],

            # Player 2 - Action 1
            [
                
                [1, 0, 0, 0],
                [0, 0, 1, 0],
                [0, 0, 0, 1],
                [0.5, 0, 0, 0.5]
                
            ]
        ],

        # Player 1 - Action 1
        [
            # Player 2 - Action 0
            [
                
                [1, 0, 0, 0],
                [0, 0, 0, 1],
                [0, 0, 0, 1],
                [0.5, 0, 0, 0.5]
                
            ],

            # Player 2 - Action 1
            [
                
                [0, 0.5, 0, 0.5],
                [0, 0, 1, 0],
                [0, 0, 0, 1],
                [0.5, 0, 0, 0.5]
                
            ]
        ]
    ]
)

In [5]:
TRANSITION_MATRIX[0, 1, 2, 3]

1.0

In [6]:
# state / player1 action / player2 action / [player1 reward, player2 reward]
PAYOFF_MATRIX = np.array([
    # State 0
    [
        [ [2, 1], [0, 0] ],
        [ [0, 0], [1, 2] ]
    ],
    # State 1
    [
        [ [1, 1], [3, 0] ],
        [ [0, 3], [2, 2] ]
    ],
    # State 
    [
        [ [2, 0], [0, 2] ],
        [ [0, 1], [1, 0] ]
    ],
    # State 33
    [
        [ [1, 1], [0, 0] ],
        [ [0, 0], [2, 2] ]
    ],
])

In [7]:
PAYOFF_MATRIX[0, :, :, 0]

array([[2, 0],
       [0, 1]])

In [8]:
def transition_probability(state, player1_action, player2_action, next):
    return TRANSITION_MATRIX[player1_action, player2_action, state, next]

In [9]:
def reward(state, player1_action, player2_action):
    return PAYOFF_MATRIX[state, player1_action, player2_action]

In [45]:
state = 0
totalReward = np.array([0, 0])
n_games = 1000
for _ in range(n_games):
    print("State:", state)
    player1_action = np.random.choice(A, p=[0.5, 0.5])
    player2_action = np.random.choice(A, p=[0.5, 0.5])
    print("Player 1 action:", player1_action)
    print("Player 2 action:", player2_action)
    next_state = np.random.choice(range(Q), p=TRANSITION_MATRIX[player1_action, player2_action, state])
    print("Next state:", next_state)
    r = reward(state, player1_action, player2_action)
    print("Reward:", r)
    state = next_state
    totalReward += r
print("Total reward:", totalReward / n_games)

State: 0
Player 1 action: 0
Player 2 action: 0
Next state: 3
Reward: [2 1]
State: 3
Player 1 action: 1
Player 2 action: 1
Next state: 3
Reward: [2 2]
State: 3
Player 1 action: 1
Player 2 action: 1
Next state: 0
Reward: [2 2]
State: 0
Player 1 action: 0
Player 2 action: 1
Next state: 0
Reward: [0 0]
State: 0
Player 1 action: 0
Player 2 action: 0
Next state: 3
Reward: [2 1]
State: 3
Player 1 action: 0
Player 2 action: 1
Next state: 3
Reward: [0 0]
State: 3
Player 1 action: 1
Player 2 action: 1
Next state: 0
Reward: [2 2]
State: 0
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [2 1]
State: 1
Player 1 action: 1
Player 2 action: 1
Next state: 2
Reward: [2 2]
State: 2
Player 1 action: 0
Player 2 action: 1
Next state: 3
Reward: [0 2]
State: 3
Player 1 action: 1
Player 2 action: 0
Next state: 0
Reward: [0 0]
State: 0
Player 1 action: 1
Player 2 action: 1
Next state: 3
Reward: [1 2]
State: 3
Player 1 action: 0
Player 2 action: 0
Next state: 0
Reward: [1 1]
State: 0
Player 1 action:

In [36]:
def computeNashEq(state, payoff_matrix):
    game = nash.Game(payoff_matrix[state, :, :, 0], PAYOFF_MATRIX[state, :, :, 1])
    eqs = game.vertex_enumeration()

    try:
        eq = next(eqs)
        return eq
    except StopIteration:
        a = np.random.rand()
        return [[a, 1 - a], [a, 1 - a]]
    

In [37]:
computeNashEq(3, PAYOFF_MATRIX)

(array([0., 1.]), array([0., 1.]))

In [38]:
#Simulate plays with nash policy
state = 0
totalReward = np.array([0, 0])
for _ in range(n_games):
    print("State:", state)
    nashEq = computeNashEq(state, PAYOFF_MATRIX)
    print("Nash equilibrium:", nashEq)
    player1_action = np.random.choice(A, p=nashEq[0])
    player2_action = np.random.choice(A, p=nashEq[1])
    print("Player 1 action:", player1_action)
    print("Player 2 action:", player2_action)
    next_state = np.random.choice(range(Q), p=TRANSITION_MATRIX[player1_action, player2_action, state])
    print("Next state:", next_state)
    r = reward(state, player1_action, player2_action)
    print("Reward:", r)
    state = next_state
    totalReward += r
print("Total reward:", totalReward/n_games)

State: 0
Nash equilibrium: (array([0., 1.]), array([0., 1.]))
Player 1 action: 1
Player 2 action: 1
Next state: 3
Reward: [1 2]
State: 3
Nash equilibrium: (array([0., 1.]), array([0., 1.]))
Player 1 action: 1
Player 2 action: 1
Next state: 0
Reward: [2 2]
State: 0
Nash equilibrium: (array([0., 1.]), array([0., 1.]))
Player 1 action: 1
Player 2 action: 1
Next state: 1
Reward: [1 2]
State: 1
Nash equilibrium: (array([1., 0.]), array([1., 0.]))
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [1 1]
State: 1
Nash equilibrium: (array([1., 0.]), array([1., 0.]))
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [1 1]
State: 1
Nash equilibrium: (array([1., 0.]), array([1., 0.]))
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [1 1]
State: 1
Nash equilibrium: (array([1., 0.]), array([1., 0.]))
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [1 1]
State: 1
Nash equilibrium: (array([1., 0.]), array([1., 0.]))
Player 1 action: 0
Player 2 action: 0
Next

In [14]:
qTable = np.zeros((Q, A, A, N))

In [15]:
computeNashEq(0, qTable)

[]


  hs = HalfspaceIntersection(halfspaces, feasible_point)
  hs = HalfspaceIntersection(halfspaces, feasible_point)


[[0.17496329090135054, 0.8250367090986495],
 [0.8250367090986495, 0.17496329090135054]]

In [16]:
def expectedPayoff(payoff_matrix, player1_strategy, player2_strategy):
    expected_payoff = np.dot(np.dot(player1_strategy, payoff_matrix), player2_strategy)
    return expected_payoff
    

In [17]:
print(PAYOFF_MATRIX[0, :, :])

[[[2 1]
  [0 0]]

 [[0 0]
  [1 2]]]


In [24]:
expectedPayoff(PAYOFF_MATRIX[2, :, :, 1], np.array([0.5, .5]), np.array([1, 0]))

0.5

In [44]:
#Simulate plays with nash policy
ALPHA = 0.5
GAMMA = 0.8
EPSILON = 0.1

qTable = np.zeros((Q, A, A, N))

state = 0
totalReward = np.array([0, 0])
for _ in range(n_games):
    print("State:", state)
    nashEq = np.abs(computeNashEq(state, qTable))
    print("Nash equilibrium:", nashEq)
    player1_action = np.random.choice(A, p=nashEq[0]) if np.random.rand() > EPSILON else np.random.choice(A)
    player2_action = np.random.choice(A, p=nashEq[1]) if np.random.rand() > EPSILON else np.random.choice(A)
    print("Player 1 action:", player1_action)
    print("Player 2 action:", player2_action)
    next_state = np.random.choice(range(Q), p=TRANSITION_MATRIX[player1_action, player2_action, state])
    print("Next state:", next_state)
    r = reward(state, player1_action, player2_action)
    print("Reward:", r)

    next_NashEq = computeNashEq(next_state, qTable)
    next_qVal_0 = expectedPayoff(qTable[next_state, :, :, 0], next_NashEq[0], next_NashEq[1])
    next_qVal_1 = expectedPayoff(qTable[next_state, :, :, 1], next_NashEq[0], next_NashEq[1])
    qTable[state, player1_action, player2_action, 0] = (1 - ALPHA) * qTable[state, player1_action, player2_action, 0] + ALPHA * (r[0] + GAMMA * next_qVal_0)
    qTable[state, player1_action, player2_action, 1] = (1 - ALPHA) * qTable[state, player1_action, player2_action, 1] + ALPHA * (r[1] + GAMMA * next_qVal_1)

    print("QTable:", qTable[state])
    state = next_state
    totalReward += r
print("Expected reward:", totalReward/n_games)
print("QTable:", qTable)

State: 0
Nash equilibrium: [[0.96317722 0.03682278]
 [0.96317722 0.03682278]]
Player 1 action: 0
Player 2 action: 0
Next state: 1
Reward: [2 1]
QTable: [[[1.  0.5]
  [0.  0. ]]

 [[0.  0. ]
  [0.  0. ]]]
State: 1
Nash equilibrium: [[0.05712217 0.94287783]
 [0.05712217 0.94287783]]
Player 1 action: 1
Player 2 action: 1
Next state: 2
Reward: [2 2]
QTable: [[[0. 0.]
  [0. 0.]]

 [[0. 0.]
  [1. 1.]]]
State: 2
Nash equilibrium: [[0.86805056 0.13194944]
 [0.86805056 0.13194944]]
Player 1 action: 0
Player 2 action: 1
Next state: 3
Reward: [0 2]
QTable: [[[0. 0.]
  [0. 1.]]

 [[0. 0.]
  [0. 0.]]]
State: 3
Nash equilibrium: [[0.61789309 0.38210691]
 [0.61789309 0.38210691]]
Player 1 action: 0
Player 2 action: 1
Next state: 0
Reward: [0 0]
QTable: [[[0.  0. ]
  [0.4 0.2]]

 [[0.  0. ]
  [0.  0. ]]]
State: 0
Nash equilibrium: [[1.00000000e+00 0.00000000e+00]
 [1.00000000e+00 1.11022302e-16]]
Player 1 action: 0
Player 2 action: 0
Next state: 3
Reward: [2 1]
QTable: [[[1.51071006 0.75535503]
  [0. 