In [1]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import math


In [2]:
def matrix_to_tuple(matrix):
    """Flatten a 2x2 matrix into the 4-tuple (m00, m01, m10, m11).

    Entries are read with ``matrix[i][j]`` indexing, row by row, and are
    returned as-is (no type conversion). The resulting tuple is what the
    rest of the notebook uses as a dictionary key for a matrix.
    """
    top_row, bottom_row = matrix[0], matrix[1]
    return (top_row[0], top_row[1], bottom_row[0], bottom_row[1])

# Generators used throughout this notebook.
# The group has index 12 according to Alex's paper. Open question: is it the
# set of matrices congruent to the identity mod 2, or mod 4?
# Any coset can be generated by starting at a representative of that coset and
# seeing whether we find our way back to it.
A = np.array([[1, 2], [0, 1]])
B = np.array([[1, 0], [2, 1]])

# Elements on the diagonal are 1 mod 4.
# Elements off the diagonal are 0 mod 2.

# C is the inverse of A and D is the inverse of B.
# They are written out as exact integer matrices instead of being computed
# with np.linalg.inv: inv() always returns float64, which would turn every
# state that C or D touches into a float matrix and make both the identity
# test and the Q-table keys depend on floating-point arithmetic staying exact.
# NOTE: these arrays use NumPy's default fixed-width integer dtype, so the
# entries of very long products can still overflow.
C = np.array([[1, -2], [0, 1]])
D = np.array([[1, 0], [-2, 1]])

identity = np.array([[1, 0], [0, 1]])

In [3]:

def epsilon_greedy_search(Epsilon, qtable, state):
    """Choose an action index (0-3) with an epsilon-greedy rule.

    A uniform random number is drawn first; if it is below ``Epsilon`` a
    uniformly random action is returned, otherwise the decision is delegated
    to ``best_move_for_a_state``.

    Action encoding:
        0 -> apply matrix A, 1 -> apply matrix B,
        2 -> apply matrix C, 3 -> apply matrix D.
    """
    explore = random.random() < Epsilon
    if not explore:
        # Exploit: use the best move currently known for this state.
        return best_move_for_a_state(Q_table=qtable, state=state)
    return random.choice([0, 1, 2, 3])
    
def best_move_for_a_state(Q_table, state):
    """Return the index (0-3) of the move leading to the best-valued successor.

    ``state`` is right-multiplied by each of A, B, C and D, and the table entry
    of each resulting matrix is looked up (in that order). If all four entries
    equal zero, a uniformly random move is returned instead; otherwise the
    position of the largest entry is returned via ``np.argmax`` (the first
    one wins on ties).
    """
    vals = [Q_table[matrix_to_tuple(state @ generator)] for generator in (A, B, C, D)]

    # Nothing has been learned about any successor yet: pick at random.
    if vals == [0, 0, 0, 0]:
        return random.choice([0, 1, 2, 3])

    # Otherwise take the move with the highest stored value.
    return np.argmax(vals)

def max_a_prime(Q_table, state):
    """Return the largest table entry among the four successors of ``state``.

    The successors are ``state @ A``, ``state @ B``, ``state @ C`` and
    ``state @ D``; each is converted to a tuple key and looked up in
    ``Q_table``.
    """
    successors = (state @ A, state @ B, state @ C, state @ D)
    return max(Q_table[matrix_to_tuple(successor)] for successor in successors)

In [4]:
# Reward for landing on the identity matrix, and the reward for any other step.
max_reward = 100
step_penalty = -1

def getReward(matrix):
    """Return ``max_reward`` if ``matrix`` equals ``identity`` entrywise, else ``step_penalty``."""
    reached_identity = (matrix == identity).all()
    return max_reward if reached_identity else step_penalty

In [5]:
# Load the training matrices. Later cells read the columns val1..val4 of each
# row as the entries [[val1, val2], [val3, val4]] of a 2x2 starting matrix.
df = pd.read_csv("../Data_Generation/Data_files/sl2_Z.csv")

In [6]:
def get_next_step(oldObs, action):
    """Apply one move to ``oldObs`` and score the resulting matrix.

    Parameters
    ----------
    oldObs : the current 2x2 matrix.
    action : 0, 1 or 2 selects A, B or C respectively; any other value
        selects D.

    Returns
    -------
    (next_state, reward, done), where ``next_state`` is ``oldObs`` right-
    multiplied by the selected generator, ``reward`` comes from
    ``getReward(next_state)`` and ``done`` is True exactly when that reward
    equals ``max_reward``.
    """
    if action == 0:
        generator = A
    elif action == 1:
        generator = B
    elif action == 2:
        generator = C
    else:
        generator = D

    next_state = oldObs @ generator
    curReward = getReward(next_state)
    return (next_state, curReward, curReward == max_reward)
    

In [None]:
def are_we_done_yet(my_matrix):
    """Return whether every entry of ``my_matrix`` equals the matching entry of ``identity``."""
    entrywise_match = my_matrix == identity
    return entrywise_match.all()

In [None]:
def apply_mat(mat, index):
    """Right-multiply ``mat`` by the generator selected by ``index``.

    Parameters
    ----------
    mat : the 2x2 matrix to transform.
    index : 0 -> A, 1 -> B, 2 -> C, 3 -> D.

    Returns
    -------
    The product ``mat @ generator``.

    Raises
    ------
    AssertionError
        If ``index`` is not one of 0, 1, 2, 3. This is raised explicitly
        rather than through an ``assert`` statement, so the check survives
        ``python -O`` (which strips asserts and would otherwise let the
        function fall through and return None).
    """
    if index == 0:
        return mat @ A
    if index == 1:
        return mat @ B
    if index == 2:
        return mat @ C
    if index == 3:
        return mat @ D
    raise AssertionError(f"index must be 0, 1, 2 or 3, got {index!r}")


In [None]:
def tuple_to_matrix(tuple):
    """Rebuild a 2x2 NumPy array from the flat sequence (m00, m01, m10, m11)."""
    first_row = [tuple[0], tuple[1]]
    second_row = [tuple[2], tuple[3]]
    return np.array([first_row, second_row])

In [None]:
# Rows whose matrix is congruent to the identity mod 2:
# odd diagonal entries (val1, val4) and even off-diagonal entries (val2, val3).
df[df[['val1', 'val4']].mod(2).eq(1).all(axis=1) & df[['val2', 'val3']].mod(2).eq(0).all(axis=1)]

In [None]:
# Same mod-2 filter, applied one column at a time:
# val1 and val4 must be odd, val2 and val3 must be even.
filter_df = (
    df
    .loc[lambda frame: frame['val1'] % 2 == 1]
    .loc[lambda frame: frame['val2'] % 2 == 0]
    .loc[lambda frame: frame['val3'] % 2 == 0]
    .loc[lambda frame: frame['val4'] % 2 == 1]
)
filter_df

In [None]:
# Hyperparameters of the learning run.
EPISODES = 30000
LEARNING_RATE = .9
DISCOUNT_FACTOR = .99
EPSILON = 1             # initial exploration probability
EPSILON_DECAY = .9999   # EPSILON is multiplied by this after every episode

# Seed every source of randomness used below. `random` drives the
# epsilon-greedy choices, while DataFrame.sample draws from NumPy, which
# random.seed() does not affect, so sampling gets its own seeded generator.
random.seed(42)
start_state_rng = np.random.RandomState(42)

# starts with an estimate of zero reward for each state.
# adapted from ChatGPT
Q_table = defaultdict(lambda: 0)

# rolling window holding the total reward of the most recent 100 episodes
episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1, random_state=start_state_rng)
    # Pull the single sampled row out as a Series so each entry is a scalar;
    # calling int() on a one-element Series is deprecated in pandas.
    start_row = cur_row.iloc[0]
    obs = np.array([
        [int(start_row['val1']), int(start_row['val2'])],
        [int(start_row['val3']), int(start_row['val4'])]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs,reward,done = get_next_step(oldObs, action)

        # The table is keyed by state. The entry of the state just reached is
        # moved towards (reward + DISCOUNT_FACTOR * best entry among its four
        # successors), blended with its old value using LEARNING_RATE:
        #   V(s') = (1-LEARNING_RATE) V(s') + LEARNING_RATE (r + DISCOUNT_FACTOR max_a V(s' a))
        state_key = matrix_to_tuple(obs)
        Q_table[state_key] = (1-LEARNING_RATE) * Q_table[state_key] + (LEARNING_RATE) * (reward + DISCOUNT_FACTOR * (max_a_prime(Q_table, obs)))

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward)

    if i%100 ==0 and i>0:
        print("Average reward for the last 100 iterations: " + str(sum(list(episode_reward_record))/100))
        print("epsilon: " + str(EPSILON) )



In [None]:
def access_Q_table(mat):
    """Return the ``Q_table`` entry stored under the tuple key of matrix ``mat``."""
    key = matrix_to_tuple(mat)
    return Q_table[key]

In [None]:
# Print the stored value for a handful of three-letter words in the generators.
for probe in (
    A @ A @ A,
    B @ A @ B,
    B @ B @ A,
    C @ C @ C,
    C @ C @ B,
    D @ D @ C,
    D @ D @ D,
):
    print(access_Q_table(probe))

In [None]:
# Stored value for [[1, 1], [0, 1]]; its off-diagonal entry 1 is odd, unlike the generators A and B.
access_Q_table(np.array([[1, 1], [0, 1]]))

In [None]:
# Stored value for the generator A itself.
access_Q_table(A)

In [None]:
# Test with the other dataframe: load the separate CSV that the cells below
# use to evaluate the learned table (its val1..val4 columns are read as matrix entries).
test_df = pd.read_csv("../Data_Generation/Data_files/sl2_Z_test.csv")

In [None]:
def matrix_to_num_steps(cur_matrix):
    """Greedily follow the learned table from ``cur_matrix`` towards the identity.

    On each of at most 50 iterations the current matrix is first compared with
    ``identity``; on a match the number of moves applied so far is returned.
    Otherwise the matrix is right-multiplied by whichever of A, B, C, D leads
    to the successor with the largest ``Q_table`` entry (first one on ties).

    Returns 100 as a "not solved" sentinel when no iteration started at the
    identity. The matrix produced by the 50th move is not checked.
    """
    for moves_applied in range(50):
        if (cur_matrix == identity).all():
            return moves_applied
        successors = [cur_matrix @ generator for generator in (A, B, C, D)]
        scores = [Q_table[matrix_to_tuple(successor)] for successor in successors]
        cur_matrix = successors[np.argmax(scores)]
    return 100

In [None]:
def test_Q_learning(cur_row):
    """Build the 2x2 matrix [[val1, val2], [val3, val4]] from a row and return
    ``matrix_to_num_steps`` for it."""
    entries = [int(cur_row[name]) for name in ('val1', 'val2', 'val3', 'val4')]
    start_matrix = np.array([entries[0:2], entries[2:4]])
    return matrix_to_num_steps(start_matrix)

# Run the greedy rollout on every test row and store its result (a step count,
# or the 100 sentinel returned by matrix_to_num_steps) in a new column.
test_df['num_moves_Q_learning_needs'] = test_df.apply(test_Q_learning, axis=1)

In [None]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
# Share of rows whose rollout did not end with the 100 "not solved" sentinel.
(test_df['num_moves_Q_learning_needs'] != 100).sum() / len(test_df)

In [None]:
print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin
# Numerator: rows solved in fewer than 20 moves. Denominator: rows solved at all (not the 100 sentinel).
test_df['num_moves_Q_learning_needs'].lt(20).sum() / test_df['num_moves_Q_learning_needs'].ne(100).sum()

In [None]:
# Keep only the rows whose rollout reached the identity (value is not the 100 sentinel).
# .copy() makes this an independent frame: a later cell adds a column to
# filtered_df, and doing that on a boolean-mask slice of test_df triggers
# pandas' SettingWithCopyWarning.
filtered_df = test_df[test_df['num_moves_Q_learning_needs']!=100].copy()

In [None]:
def first_matrix_to_apply(cur_row):
    """Return the index (0-3) of the first move the greedy policy takes from a row.

    The row's val1..val4 are read as the matrix [[val1, val2], [val3, val4]];
    the ``Q_table`` entries of its four successors (via A, B, C, D) are looked
    up and the position of the largest one is returned by ``np.argmax``.
    """
    entries = [int(cur_row[name]) for name in ('val1', 'val2', 'val3', 'val4')]
    cur_matrix = np.array([entries[0:2], entries[2:4]])
    outputs = [Q_table[matrix_to_tuple(cur_matrix @ generator)] for generator in (A, B, C, D)]
    return np.argmax(outputs)

# For every remaining row, record the index (0-3) of the first move the greedy policy picks.
filtered_df['first_move_by_Q_learning'] = filtered_df.apply(first_matrix_to_apply, axis=1)

In [None]:
# Display the solved rows together with their first_move_by_Q_learning column.
filtered_df

In [None]:
# filtered_df = filtered_df.drop('num_moves_Q_learning_needs', axis=1)

In [None]:
# Positional 60/40 split of filtered_df into train and test.
# iloc is 0-based with an exclusive end, so [:bound] and [bound:] partition the
# frame exactly. The earlier slices [1:bound] and [bound+1:] silently dropped
# row 0 and row `bound`.
bound = int(filtered_df.shape[0] * 0.6)
plus_one = bound+1  # no longer used for slicing; kept in case other cells read it
train = filtered_df.iloc[:bound]
test = filtered_df.iloc[bound:]

In [None]:
def get_Q_value(row):
    """Return the ``Q_table`` entry keyed by the row's (val1, val2, val3, val4), each cast to int."""
    key = tuple(int(row[name]) for name in ('val1', 'val2', 'val3', 'val4'))
    return Q_table[key]

In [None]:
# Write the two splits to CSV files, leaving out the index column.
train.to_csv("../Data_Generation/Data_files/train_rows_SL2Z_Q_learn.csv", index=False)
test.to_csv("../Data_Generation/Data_files/test_rows_SL2Z_Q_learn.csv", index=False)

In [None]:
def mod_2_is_identity(test_tuple):
    """Return True when a flat 2x2 matrix (m00, m01, m10, m11) is the identity mod 2.

    That is: the first and last entries are odd and the two middle entries
    are even. The input must have exactly four entries (checked by assert).
    """
    assert len(test_tuple)==4
    expected_parities = (1, 0, 0, 1)
    return all(
        entry % 2 == parity
        for entry, parity in zip(test_tuple, expected_parities)
    )

In [None]:
# Sanity check: the third entry (1) is odd, so this evaluates to False.
mod_2_is_identity([1, 2, 1, 1])