In [43]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict


In [44]:

def epsilon_greedy_search(Epsilon, qtable, state):
    if (random.random() < Epsilon):
        # 0 is 'apply matrix A', 1 is 'apply matrix B'
        # 2 is 'apply matrix C', 3 is 'apply matrix D'
        return random.choice([0, 1, 2, 3])
    else:
        # get the best move for the current state
        return best_move_for_a_state(Q_table=qtable, state=state)
    
# I would like to return the best move for a given state
def best_move_for_a_state(Q_table, state):
    vals = Q_table[(state[0][1], state[0][2], state[1][2])]

    # if we haven't visited this state before, return a random choice of 0, 1, 2, or 3
    if vals==[0, 0, 0, 0]:
        return random.choice([0, 1, 2, 3])
    
    # if we have visited this state before, return the current best choice
    return np.argmax(vals)

# over a given state, return the maximum value of the table for that state
def max_a_prime(Q_table, state):
    return max(Q_table[(state[0][1], state[0][2], state[1][2])])

In [45]:
# B is the inverse of A
A = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
B = np.array([[1, -1, 0], [0, 1, 0], [0, 0, 1]])

# C is the inverse of D
C = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]])
D = np.array([[1, 0, 0], [0, 1, -1], [0, 0, 1]])

# together, A, B, C, and D generate the heisenberg group

In [46]:
def getReward(matrix):
    if (matrix==A).all() or (matrix==B).all() or (matrix==C).all() or (matrix==D).all():
        return 20
    else:
        return -1 + 1/(2 + abs(matrix[0][1]) + abs(matrix[0][2]) + abs(matrix[1][2]))

In [47]:
# adapted from ChatGPT
class CustomDefaultDict(dict):
    def __init__(self, default_factory, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        # Compute the default value based on the missing key
        default_value = self.default_factory(key)
        self[key] = default_value  # Cache the default value for future lookups
        return default_value

# Example usage:
def default_value_for_key(key):
    # Define a function that computes the default value based on the key
    val1 = key[0]
    val2 = key[1]
    val3 = key[2]
    cur_matrix = np.array([[1, val1, val2], [0, 1, val3], [0, 0, 1]])
    output1 = getReward(cur_matrix @ A)
    output2 = getReward(cur_matrix @ B)
    output3 = getReward(cur_matrix @ C)
    output4 = getReward(cur_matrix @ D)
    return [output1, output2, output3, output4]

custom_dict = CustomDefaultDict(default_value_for_key)

cur_mat = A@A@B

custom_dict[(cur_mat[0][1], cur_mat[0][2], cur_mat[1][2])]

[-0.75, -0.5, -0.8, -0.8]

In [48]:
df = pd.read_csv("heisenberg_data.csv")

In [49]:
def get_next_step(oldObs, action):
    # action is always either 0, 1, 2, or 3
    next_state = []
    if action==0:
        next_state = oldObs @ A
    elif action==1:
        next_state = oldObs @ B
    elif action==2:
        next_state = oldObs @ C
    else:
        next_state = oldObs @ D
    curReward = getReward(next_state)
    done = curReward==20
    return (next_state, curReward, done)
    

In [50]:
# adapted from CS 540 Spring 2023 HW 10
EPISODES = 300000
LEARNING_RATE = .1
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .9999

random.seed(1)

# starts with an estimate of zero reward for each state.
# adapted from ChatGPT
# the outer dictionary has keys of the string version of the given array, and 
# values of a dictionary for each of the actions that we could take at that state
# Q_table = defaultdict(lambda: [0, 0, 0, 0])
Q_table = CustomDefaultDict(default_value_for_key)

episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [1, int(cur_row['val1']), int(cur_row['val2'])], 
        [0, 1, int(cur_row['val3'])], 
        [0, 0, 1]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs,reward,done = get_next_step(oldObs, action)
        Q_table[(oldObs[0][1], oldObs[0][2], oldObs[1][2])][action] = (1-LEARNING_RATE) * Q_table[(oldObs[0][1], oldObs[0][2], oldObs[1][2])][action] + (LEARNING_RATE) * (reward + DISCOUNT_FACTOR * (max_a_prime(Q_table, obs)))

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if i%100 ==0 and i>0:
        print("Average reward for the last 100 iterations: " + str(sum(list(episode_reward_record))/100))
        print("epsilon: " + str(EPSILON) )



Average reward for the last 100 iterations: -94.81752622162251
epsilon: 0.989950333757503
Average reward for the last 100 iterations: -94.88942223580263
epsilon: 0.9800996732739187
Average reward for the last 100 iterations: -93.45968949162248
epsilon: 0.9703470333764725
Average reward for the last 100 iterations: -93.9330556572862
epsilon: 0.9606914386955115
Average reward for the last 100 iterations: -94.22678797139571
epsilon: 0.9511319235669539
Average reward for the last 100 iterations: -93.00182446796866
epsilon: 0.9416675319357145
Average reward for the last 100 iterations: -91.42866251832325
epsilon: 0.9322973172600907
Average reward for the last 100 iterations: -92.0126754228778
epsilon: 0.9230203424170932
Average reward for the last 100 iterations: -92.70662087241739
epsilon: 0.9138356796087268
Average reward for the last 100 iterations: -93.57576582355937
epsilon: 0.9047424102692004
Average reward for the last 100 iterations: -93.8662410057146
epsilon: 0.89573962497306
Avera

In [51]:
def matrix_to_tuple(matrix):
    return (matrix[0][1], matrix[0][2], matrix[1][2])

In [52]:
# test with the other dataframe. 
test_df = pd.read_csv("heisenberg_data_test.csv")

In [53]:
cur_matrix = np.array([[1, int(cur_row['val1']), int(cur_row['val2'])], [0, 1, int(cur_row['val3'])], [0, 0, 1]])

In [54]:
def matrix_to_num_steps(cur_matrix):
    index = 1
    for i in range(50):
        if (cur_matrix==A).all() or (cur_matrix==B).all() or (cur_matrix==C).all() or (cur_matrix==D).all():
            return i
        outputs = Q_table[matrix_to_tuple(cur_matrix)]
        # print(outputs)
        # if outputs==[0, 0, 0, 0]:
        #     # this is a problem because we haven't seen this state before 
        #     # in training so we have no idea how to handle it
        #     print("Problem.")
        index = np.argmax(outputs)
        if index==0:
            cur_matrix = cur_matrix @ A
        elif index==1:
            cur_matrix = cur_matrix @ B
        elif index==2:
            cur_matrix = cur_matrix @ C
        elif index==3:
            cur_matrix = cur_matrix @ D
    return 100

In [55]:
def test_Q_learning(row):
    cur_matrix = np.array([[1, int(row['val1']), int(row['val2'])], [0, 1, int(row['val3'])], [0, 0, 1]])
    return matrix_to_num_steps(cur_matrix)

test_df['num_moves_Q_learning_needs'] = test_df.apply(test_Q_learning, axis=1)

In [56]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
sum(test_df['num_moves_Q_learning_needs']!=100)/test_df.shape[0]

The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: 


0.8544854485448545

In [57]:
print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin 
sum(test_df['num_moves_Q_learning_needs']<20)/sum(test_df['num_moves_Q_learning_needs']!=100)

Of these, the proportion of times where we learned a path that was < 20 moves: 


0.9968398876404494

In [58]:
filtered_df = test_df[test_df['num_moves_Q_learning_needs']!=100]

In [59]:
def first_matrix_to_apply(row):
    outputs = Q_table[(int(row['val1']), int(row['val2']), int(row['val3']))]
    return np.argmax(outputs)

filtered_df['num_moves_Q_learning_needs'] = filtered_df.apply(first_matrix_to_apply, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['num_moves_Q_learning_needs'] = filtered_df.apply(first_matrix_to_apply, axis=1)


In [60]:
bound = int(filtered_df.shape[0] * 0.6)
plus_one = bound+1
train = filtered_df.iloc[1:bound]
test = filtered_df.iloc[plus_one:filtered_df.shape[0]]

In [61]:
train.to_csv("learned_Q_moves.csv", index=False)
test.to_csv("learned_Q_moves_test.csv", index=False)