In [79]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict


In [80]:

def epsilon_greedy_search(Epsilon, qtable, state):
    if (random.random() < Epsilon):
        # 0 is 'apply matrix A', 1 is 'apply matrix B'
        # 2 is 'apply matrix C', 3 is 'apply matrix D'
        return random.choice([0, 1, 2, 3])
    else:
        # get the best move for the current state
        return best_move_for_a_state(Q_table=qtable, state=state)
    
# I would like to return the best move for a given state
def best_move_for_a_state(Q_table, state):
    vals = Q_table[str(state)]

    # if we haven't visited this state before, return a random choice of 0, 1, 2, or 3
    if vals==[0, 0, 0, 0]:
        return random.choice([0, 1, 2, 3])
    
    # if we have visited this state before, return the current best choice
    return np.argmax(vals)

# over a given state, return the maximum value of the table for that state
def max_a_prime(Q_table, state):
    return max(Q_table[str(state)])

In [81]:
# B is the inverse of A
A = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
B = np.array([[1, -1, 0], [0, 1, 0], [0, 0, 1]])

# C is the inverse of D
C = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]])
D = np.array([[1, 0, 0], [0, 1, -1], [0, 0, 1]])

# together, A, B, C, and D generate the heisenberg group

In [82]:
def getReward(matrix):
    if (matrix==A).all() or (matrix==B).all() or (matrix==C).all() or (matrix==D).all():
        return 20
    else:
        return 1/(2 + 8*(matrix[0][1] + matrix[0][2] + matrix[1][2]))

In [83]:
df = pd.read_csv("heisenberg_data.csv")

In [84]:
def get_next_step(oldObs, action):
    # action is always either 0, 1, 2, or 3
    next_state = []
    if action==0:
        next_state = oldObs @ A
    elif action==1:
        next_state = oldObs @ B
    elif action==2:
        next_state = oldObs @ C
    else:
        next_state = oldObs @ D
    curReward = getReward(next_state)
    done = curReward==20
    return (next_state, curReward, done)
    

In [94]:
# adapted from CS 540 Spring 2023 HW 10
EPISODES = 30000
LEARNING_RATE = .1
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .999

random.seed(1)

# starts with an estimate of zero reward for each state.
# adapted from ChatGPT
# the outer dictionary has keys of the string version of the given array, and 
# values of a dictionary for each of the actions that we could take at that state
Q_table = defaultdict(lambda: [0, 0, 0, 0]) 

episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [1, int(cur_row['val1']), int(cur_row['val2'])], 
        [0, 1, int(cur_row['val3'])], 
        [0, 0, 1]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs,reward,done = get_next_step(oldObs, action)
        Q_table[str(oldObs)][action] = (1-LEARNING_RATE) * Q_table[str(oldObs)][action] + (LEARNING_RATE) * (reward + DISCOUNT_FACTOR * (max_a_prime(Q_table, obs)))

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if i%100 ==0 and i>0:
        print("Average reward for the last 100 iterations: " + str(sum(list(episode_reward_record))/100))
        print("epsilon: " + str(EPSILON) )



Average reward for the last 100 iterations: 0.7625689057100687
epsilon: 0.9038873549665959
Average reward for the last 100 iterations: 1.122294535878787
epsilon: 0.8178301806491574
Average reward for the last 100 iterations: 1.5109757440332867
epsilon: 0.7399663251239436
Average reward for the last 100 iterations: 2.217182842266635
epsilon: 0.6695157201007336
Average reward for the last 100 iterations: 2.551385459414723
epsilon: 0.6057725659163237
Average reward for the last 100 iterations: 2.334864818526633
epsilon: 0.548098260578011
Average reward for the last 100 iterations: 3.7174336591315744
epsilon: 0.4959150020176678
Average reward for the last 100 iterations: 4.127263400875193
epsilon: 0.44869999946146477
Average reward for the last 100 iterations: 4.94052638931124
epsilon: 0.4059802359226587
Average reward for the last 100 iterations: 4.55530565587314
epsilon: 0.36732772934619257
Average reward for the last 100 iterations: 6.824266225848251
epsilon: 0.33235524492954527
Average

In [86]:
Q_table[str(B@B)]

[19.54943200910122,
 -0.004545454545454546,
 -0.016026333486903677,
 0.10067506440476193]

In [87]:
# test with the other dataframe. 
test_df = pd.read_csv("heisenberg_data_test.csv")

In [88]:
test_df
cur_row = test_df.sample(1)

In [89]:
cur_matrix = np.array([[1, int(cur_row['val1']), int(cur_row['val2'])], [0, 1, int(cur_row['val3'])], [0, 0, 1]])

In [90]:
outputs = Q_table[str(cur_matrix)]
if outputs==[0, 0, 0, 0]:
    print("Problem.")
index = np.argmax(outputs)
new_matrix = cur_matrix
if index==0:
    new_matrix = cur_matrix @ A
elif index==1:
    new_matrix = cur_matrix @ B
elif index==2:
    new_matrix = cur_matrix @ C
elif index==3:
    new_matrix = cur_matrix @ D

cur_matrix = new_matrix
new_matrix

array([[ 1, -1,  3],
       [ 0,  1,  0],
       [ 0,  0,  1]])

In [91]:
def matrix_to_num_steps(cur_matrix):
    index = 1
    for i in range(50):
        if (cur_matrix==A).all() or (cur_matrix==B).all() or (cur_matrix==C).all() or (cur_matrix==D).all():
            return i
        outputs = Q_table[str(cur_matrix)]
        print(outputs)
        if outputs==[0, 0, 0, 0]:
            # this is a problem because we haven't seen this state before 
            # in training so we have no idea how to handle it
            print("Problem.")
        index = np.argmax(outputs)
        if index==0:
            cur_matrix = cur_matrix @ A
        elif index==1:
            cur_matrix = cur_matrix @ B
        elif index==2:
            cur_matrix = cur_matrix @ C
        elif index==3:
            cur_matrix = cur_matrix @ D
    return 100

In [106]:
matrix_to_num_steps(B@B@C@A)

[-0.022075447666666664, 1.970309192037845, -0.01845063867985161, -0.025014285714285718]
[-0.007142857142857143, 5.420267532376387, -0.00708411353928299, -0.007142857142857143]
[-0.004545454545454546, -0.002631578947368421, -0.004130434782608695, 11.177454481998897]
[17.30788139938405, -0.004545454545454546, -0.0033333333333333335, 0]
[20.596962041860664, 0.02542174685491623, -0.012726423165358955, 1.466495498641827]
[3.360395381780317, 8.301699192682085, 20.299999999999965, 9.162808517793957]
[19.99999999999999, 7.879772801556271, 3.9843510100090107, 13.306415398562999]


7

In [105]:
mymat = B@B@C@B@D@A@A@C@A
print(mymat)

Q_table[str(mymat)]

# cur_matrix = D@D@B@D@A

# matrix_to_num_steps(B@B@C)

[[1 0 0]
 [0 1 1]
 [0 0 1]]


[0, 0, 0, 0]

In [101]:
B@B@C@B@D@A

array([[ 1, -2,  1],
       [ 0,  1,  0],
       [ 0,  0,  1]])