In [427]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict


In [428]:
def matrix_to_tuple(matrix):
    return (matrix[0][1], matrix[0][2], matrix[1][2])

# B is the inverse of A
A = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
B = np.array([[1, -1, 0], [0, 1, 0], [0, 0, 1]])

# C is the inverse of D
C = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]])
D = np.array([[1, 0, 0], [0, 1, -1], [0, 0, 1]])

# together, A, B, C, and D generate the heisenberg group

In [429]:

def epsilon_greedy_search(Epsilon, qtable, state):
    if (random.random() < Epsilon):
        # 0 is 'apply matrix A', 1 is 'apply matrix B'
        # 2 is 'apply matrix C', 3 is 'apply matrix D'
        return random.choice([0, 1, 2, 3])
    else:
        # get the best move for the current state
        return best_move_for_a_state(Q_table=qtable, state=state)
    
# I would like to return the best move for a given state
def best_move_for_a_state(Q_table, state):
    # vals = Q_table[(state[0][1], state[0][2], state[1][2])]

    apply_A = state @ A
    apply_B = state @ B
    apply_C = state @ C
    apply_D = state @ D

    vals = [0, 0, 0, 0]
    vals[0] = Q_table[matrix_to_tuple(apply_A)]
    vals[1] = Q_table[matrix_to_tuple(apply_B)]
    vals[2] = Q_table[matrix_to_tuple(apply_C)]
    vals[3] = Q_table[matrix_to_tuple(apply_D)]

    # if we haven't visited this state before, return a random choice of 0, 1, 2, or 3
    if vals==[0, 0, 0, 0]:
        return random.choice([0, 1, 2, 3])
    
    # if we have visited this state before, return the current best choice
    return np.argmax(vals)

# over a given state, return the maximum value of the table for that state
def max_a_prime(Q_table, state):
    apply_A = state @ A
    apply_B = state @ B
    apply_C = state @ C
    apply_D = state @ D

    vals = [0, 0, 0, 0]
    vals[0] = Q_table[matrix_to_tuple(apply_A)]
    vals[1] = Q_table[matrix_to_tuple(apply_B)]
    vals[2] = Q_table[matrix_to_tuple(apply_C)]
    vals[3] = Q_table[matrix_to_tuple(apply_D)]
    
    return max(vals)

In [430]:
def getReward(matrix):
    if (matrix==A).all() or (matrix==B).all() or (matrix==C).all() or (matrix==D).all():
        return 20
    else:
        # return -1 + 1/(2 + abs(matrix[0][1]) + abs(matrix[0][2]) + abs(matrix[1][2]))
        return -1

In [431]:
# adapted from ChatGPT
class CustomDefaultDict(dict):
    def __init__(self, default_factory, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        # Compute the default value based on the missing key
        default_value = self.default_factory(key)
        self[key] = default_value  # Cache the default value for future lookups
        return default_value

def default_value_for_key(key):
    # Define a function that computes the default value based on the key
    val1 = key[0]
    val2 = key[1]
    val3 = key[2]
    cur_matrix = np.array([[1, val1, val2], [0, 1, val3], [0, 0, 1]])
    output1 = getReward(cur_matrix @ A)
    output2 = getReward(cur_matrix @ B)
    output3 = getReward(cur_matrix @ C)
    output4 = getReward(cur_matrix @ D)
    return [output1, output2, output3, output4]

custom_dict = CustomDefaultDict(default_value_for_key)

cur_mat = A@A@B

custom_dict[(cur_mat[0][1], cur_mat[0][2], cur_mat[1][2])]

[-1, -1, -1, -1]

In [432]:
df = pd.read_csv("heisenberg_data.csv")

In [433]:
def get_next_step(oldObs, action):
    # action is always either 0, 1, 2, or 3
    next_state = []
    if action==0:
        next_state = oldObs @ A
    elif action==1:
        next_state = oldObs @ B
    elif action==2:
        next_state = oldObs @ C
    else:
        next_state = oldObs @ D
    curReward = getReward(next_state)
    done = curReward==20
    return (next_state, curReward, done)
    

In [434]:
# adapted from CS 540 Spring 2023 HW 10
EPISODES = 300000
LEARNING_RATE = .9
DISCOUNT_FACTOR = .8
EPSILON = 1
EPSILON_DECAY = .9999

random.seed(1)

# starts with an estimate of zero reward for each state.
# adapted from ChatGPT
# the outer dictionary has keys of the string version of the given array, and 
# values of a dictionary for each of the actions that we could take at that state
# Q_table = defaultdict(lambda: [0, 0, 0, 0])
# Q_table = CustomDefaultDict(default_value_for_key)
Q_table = defaultdict(lambda: 0)

episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [1, int(cur_row['val1']), int(cur_row['val2'])], 
        [0, 1, int(cur_row['val3'])], 
        [0, 0, 1]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs,reward,done = get_next_step(oldObs, action)
        
        Q_table[matrix_to_tuple(obs)] = (1-LEARNING_RATE) * Q_table[matrix_to_tuple(obs)] + (LEARNING_RATE) * (reward + DISCOUNT_FACTOR * (max_a_prime(Q_table, obs)))

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if i%500 ==0 and i>0:
        print("Average reward for the last 500 iterations: " + str(sum(list(episode_reward_record))/100))
        print("epsilon: " + str(EPSILON) )



Average reward for the last 500 iterations: -98.04
epsilon: 0.9511319235669539
Average reward for the last 500 iterations: -93.78
epsilon: 0.9047424102692004
Average reward for the last 500 iterations: -92.78
epsilon: 0.8606154505570021
Average reward for the last 500 iterations: -85.28
epsilon: 0.8186406930090225
Average reward for the last 500 iterations: -78.6
epsilon: 0.7787131683686925
Average reward for the last 500 iterations: -63.22
epsilon: 0.7407330270401349
Average reward for the last 500 iterations: -53.46
epsilon: 0.7046052893871948
Average reward for the last 500 iterations: -44.82
epsilon: 0.6702396082111141
Average reward for the last 500 iterations: -50.46
epsilon: 0.6375500428128791
Average reward for the last 500 iterations: -41.4
epsilon: 0.6064548440752141
Average reward for the last 500 iterations: -28.0
epsilon: 0.576876250026757
Average reward for the last 500 iterations: -21.98
epsilon: 0.548740291377179
Average reward for the last 500 iterations: -25.16
epsilo

In [435]:
Q_table[matrix_to_tuple(A@A@A@A@A@A@A@A@A)]

4.786709333333334

In [436]:
print(A)
print(A@A)
print(A@A@A)
print(A@A@A@C)

[[1 1 0]
 [0 1 0]
 [0 0 1]]
[[1 2 0]
 [0 1 0]
 [0 0 1]]
[[1 3 0]
 [0 1 0]
 [0 0 1]]
[[1 3 3]
 [0 1 1]
 [0 0 1]]


In [437]:
# test with the other dataframe. 
test_df = pd.read_csv("heisenberg_data_test.csv")

In [438]:
cur_matrix = np.array([[1, int(cur_row['val1']), int(cur_row['val2'])], [0, 1, int(cur_row['val3'])], [0, 0, 1]])

In [439]:
def matrix_to_num_steps(cur_matrix):
    index = 1
    for i in range(50):
        if (cur_matrix==A).all() or (cur_matrix==B).all() or (cur_matrix==C).all() or (cur_matrix==D).all():
            return i
        outputs = [0, 0, 0, 0]
        outputs[0] = Q_table[matrix_to_tuple(cur_matrix@ A)]
        outputs[1] = Q_table[matrix_to_tuple(cur_matrix@ B)]
        outputs[2] = Q_table[matrix_to_tuple(cur_matrix@ C)]
        outputs[3] = Q_table[matrix_to_tuple(cur_matrix@ D)]
        index = np.argmax(outputs)
        if index==0:
            cur_matrix = cur_matrix @ A
        elif index==1:
            cur_matrix = cur_matrix @ B
        elif index==2:
            cur_matrix = cur_matrix @ C
        elif index==3:
            cur_matrix = cur_matrix @ D
    return 100

In [440]:
def test_Q_learning(row):
    cur_matrix = np.array([[1, int(row['val1']), int(row['val2'])], [0, 1, int(row['val3'])], [0, 0, 1]])
    return matrix_to_num_steps(cur_matrix)

test_df['num_moves_Q_learning_needs'] = test_df.apply(test_Q_learning, axis=1)

In [441]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
sum(test_df['num_moves_Q_learning_needs']!=100)/test_df.shape[0]

The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: 


0.918091809180918

In [442]:
print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin 
sum(test_df['num_moves_Q_learning_needs']<20)/sum(test_df['num_moves_Q_learning_needs']!=100)

Of these, the proportion of times where we learned a path that was < 20 moves: 


0.9996732026143791

In [443]:
filtered_df = test_df[test_df['num_moves_Q_learning_needs']!=100]

In [444]:
def first_matrix_to_apply(row):
    outputs = [0, 0, 0, 0]
    cur_matrix = np.array([
        [1, int(row['val1']), int(row['val2'])], 
        [0, 1, int(row['val3'])], 
        [0, 0, 1]
        ])
    outputs[0] = Q_table[matrix_to_tuple(cur_matrix@ A)]
    outputs[1] = Q_table[matrix_to_tuple(cur_matrix@ B)]
    outputs[2] = Q_table[matrix_to_tuple(cur_matrix@ C)]
    outputs[3] = Q_table[matrix_to_tuple(cur_matrix@ D)]
    return np.argmax(outputs)

filtered_df['first_move_by_Q_learning'] = filtered_df.apply(first_matrix_to_apply, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['first_move_by_Q_learning'] = filtered_df.apply(first_matrix_to_apply, axis=1)


In [445]:
filtered_df = filtered_df.drop('num_moves_Q_learning_needs', axis=1)

In [446]:
bound = int(filtered_df.shape[0] * 0.6)
plus_one = bound+1
train = filtered_df.iloc[1:bound]
test = filtered_df.iloc[plus_one:filtered_df.shape[0]]

In [447]:
train.to_csv("learned_Q_moves.csv", index=False)
test.to_csv("learned_Q_moves_test.csv", index=False)

In [448]:
row = dict()
row['val1'], row['val2'], row['val3'] = matrix_to_tuple(A@A@A@C@B@D@B@D@B)

In [449]:
first_matrix_to_apply(row)

0

In [450]:
A@A@A@C@B@D@B@D@B

array([[ 1,  0,  0],
       [ 0,  1, -1],
       [ 0,  0,  1]])

In [451]:
outputs = [0, 0, 0, 0]

cur_matrix = A@A@A@A@A@D@B
print(cur_matrix)

outputs[0] = Q_table[matrix_to_tuple(cur_matrix@ A)]
outputs[1] = Q_table[matrix_to_tuple(cur_matrix@ B)]
outputs[2] = Q_table[matrix_to_tuple(cur_matrix@ C)]
outputs[3] = Q_table[matrix_to_tuple(cur_matrix@ D)]
outputs

[[ 1  4 -5]
 [ 0  1 -1]
 [ 0  0  1]]


[14.114666666666668, 14.114666666666668, 14.114666666666668, 7.233386666666668]

In [452]:
results = []

results.append((matrix_to_num_steps(A@A@A@A), 3))
results.append((matrix_to_num_steps(A@A@A@A@A), 4))
results.append((matrix_to_num_steps(B@B@B@B), 3))
results.append((matrix_to_num_steps(B@B@B@B@B), 4))
results.append((matrix_to_num_steps(C@C@C@C), 3))
results.append((matrix_to_num_steps(C@C@C@C@C), 4))
results.append((matrix_to_num_steps(D@D@D@D), 3))
results.append((matrix_to_num_steps(D@D@D@D@D), 4))
results.append((matrix_to_num_steps(A@C), 1))
results.append((matrix_to_num_steps(A@C@C), 2))
results.append((matrix_to_num_steps(B@C), 1))
results.append((matrix_to_num_steps(B@C@C), 2))

results

[(3, 3),
 (4, 4),
 (3, 3),
 (4, 4),
 (3, 3),
 (4, 4),
 (3, 3),
 (4, 4),
 (1, 1),
 (2, 2),
 (1, 1),
 (2, 2)]

In [453]:
A@A@A@C@C@C@B@B@B@D@D@D

array([[1, 0, 9],
       [0, 1, 0],
       [0, 0, 1]])

In [454]:
B@C@C@B@B@D

array([[ 1, -3,  1],
       [ 0,  1,  1],
       [ 0,  0,  1]])

In [455]:
np.array([[1, 1, -7], [0, 1, -1], [0, 0, 1]])@C@A@C

array([[ 1,  2, -4],
       [ 0,  1,  1],
       [ 0,  0,  1]])

In [456]:
Q_table[matrix_to_tuple(A@C@C@C@C@B@D@B@D@C@A)]

print(Q_table[matrix_to_tuple(A@C@C@C@C@B@D@B@D@C@A)])
print(Q_table[matrix_to_tuple(A@C@C@C@C@B@D@B@D@C@B)])
print(Q_table[matrix_to_tuple(A@C@C@C@C@B@D@B@D@C@C)])
print(Q_table[matrix_to_tuple(A@C@C@C@C@B@D@B@D@C@D)])

10.291733333333335
4.786709333333334
10.291733333333335
4.786709333333334


In [457]:
problem_cases = test_df[test_df['num_moves_Q_learning_needs']==100]

In [488]:
first_row = problem_cases.iloc[1]

In [489]:
test_mat = np.array([[1, first_row['val1'], first_row['val2']], [0, 1, first_row['val3']], [0, 0, 1]])

In [468]:
def matrix_to_next_matrix(mat):
    outputs = [0, 0, 0, 0]

    outputs[0] = Q_table[matrix_to_tuple(mat@ A)]
    outputs[1] = Q_table[matrix_to_tuple(mat@ B)]
    outputs[2] = Q_table[matrix_to_tuple(mat@ C)]
    outputs[3] = Q_table[matrix_to_tuple(mat@ D)]

    index = np.argmax(outputs)
    if index==0:
        cur_matrix = mat @ A
    elif index==1:
        cur_matrix = mat @ B
    elif index==2:
        cur_matrix = mat @ C
    elif index==3:
        cur_matrix = mat @ D

    
    return cur_matrix

In [506]:
def do_we_loop(seen, matrix):
    for i in range(20):
        seen.add(matrix_to_tuple(matrix))
        matrix = matrix_to_next_matrix(matrix)
        if matrix_to_tuple(matrix) in seen: 
            return True
            # return (matrix, i)
    # return "We do not loop"
    return False


In [507]:
seen = set()
do_we_loop(seen, test_mat)

True

In [508]:
def add_if_loop_to_df(row): 
    test_mat = np.array([[1, row['val1'], row['val2']], [0, 1, row['val3']], [0, 0, 1]])
    seen = set()
    return do_we_loop(seen, test_mat)
problem_cases['do_we_loop'] = problem_cases.apply(add_if_loop_to_df, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  problem_cases['do_we_loop'] = problem_cases.apply(add_if_loop_to_df, axis=1)


In [509]:
sum(problem_cases['do_we_loop'])/problem_cases.shape[0]
# we almost always loop within 20 moves

0.9865689865689866