In [3]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt

import sys
sys.path.append('../')

from util import *

## Group Definition:
Let $A = \begin{bmatrix}1 & 2 \\ 0 & 1\end{bmatrix}$, $B = \begin{bmatrix}1 & 0 \\ 2 & 1\end{bmatrix}$.
The group $\Gamma \coloneqq \langle A,B\rangle \subseteq SL_2(\mathbb{Z})$ is an index $12$ subgroup. The diagonal entries are congruent to $1\pmod{4}$ and the non-diagonal entries and divisible by $2$. 

We define $C = A^{-1}$ and $D = B^{-1}$.

We can generate with any coset by starting at a representative from each coset and see if we get our way back to it?

In [8]:
MAX_REWARD = 100
STEP_PENALTY = -1

def getReward(matrix: np.array) -> int:
    if is_done(matrix, matrix.shape[0]):
        return MAX_REWARD
    else:
        return STEP_PENALTY

In [9]:
df = pd.read_csv("../Data_Generation/Data_files/subset_sl2_Z_3s.csv")

Verify that the diagonal entries are congruent to $1\pmod{4}$ and the non-diagonal entries and divisible by $2$. 

In [10]:
filter_df = df[df['val1'] % 4 == 1]
filter_df = filter_df[filter_df['val2'] % 2 == 0]
filter_df = filter_df[filter_df['val3'] % 2 == 0]
filter_df = filter_df[filter_df['val4'] % 4 == 1]
filter_df

Unnamed: 0,val1,val2,val3,val4
9,1.0,0.0,6.0,1.0
21,213049.0,673746.0,601302.0,1901557.0
22,93637.0,16578.0,16476.0,2917.0
24,37.0,-6.0,-6.0,1.0
27,430266781.0,-130322256.0,-151430964.0,45866485.0
...,...,...,...,...
9880,37.0,-6.0,-6.0,1.0
9907,-17783.0,153750.0,-46560.0,402553.0
9942,5221.0,-31926.0,14088.0,-86147.0
9962,-647.0,-246.0,192.0,73.0


In [16]:
env = TabularQEnv(k_sl2z_2s_gen, defaultdict(lambda: 0), getReward, MAX_REWARD)

EPISODES = 30000
LEARNING_RATE = .9
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .9999

random.seed(42)

# adapted from ChatGPT
episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [int(cur_row['val1']), int(cur_row['val2'])], 
        [int(cur_row['val3']), int(cur_row['val4'])]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        obs, reward, done = env.step(LEARNING_RATE, DISCOUNT_FACTOR, EPSILON, obs)

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if (i+1)%500 ==0 and i>0:
        print("Episode {i}\teps: {eps:.3f}\tAvg rwd: {rwd:.3f}".format(i=i+1, eps=EPSILON, rwd=sum(list(episode_reward_record))/500))



  [int(cur_row['val1']), int(cur_row['val2'])],
  [int(cur_row['val3']), int(cur_row['val4'])]


Average reward for the last 100 iterations: -100.0
epsilon: 0.989950333757503
Average reward for the last 100 iterations: -100.0
epsilon: 0.9800996732739187
Average reward for the last 100 iterations: -100.0
epsilon: 0.9703470333764725
Average reward for the last 100 iterations: -100.0
epsilon: 0.9606914386955115
Average reward for the last 100 iterations: -100.0
epsilon: 0.9511319235669539
Average reward for the last 100 iterations: -100.0
epsilon: 0.9416675319357145
Average reward for the last 100 iterations: -100.0
epsilon: 0.9322973172600907
Average reward for the last 100 iterations: -100.0
epsilon: 0.9230203424170932
Average reward for the last 100 iterations: -100.0
epsilon: 0.9138356796087268
Average reward for the last 100 iterations: -100.0
epsilon: 0.9047424102692004
Average reward for the last 100 iterations: -100.0
epsilon: 0.89573962497306
Average reward for the last 100 iterations: -100.0
epsilon: 0.8868264233442354
Average reward for the last 100 iterations: -100.0
epsi

Traceback (most recent call last):
  File "/Users/ayun/Library/Python/3.11/lib/python/site-packages/IPython/core/interactiveshell.py", line 3548, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/t9/gxb6q8zj21dff090q066td740000gn/T/ipykernel_32638/1224767720.py", line 30, in <module>
    obs, reward, done = env.step(LEARNING_RATE, DISCOUNT_FACTOR, EPSILON, obs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ayun/CODE/MXM/MXM_AI_Ellenberg/SL2_Z/Q_learning/../util.py", line 78, in step
  File "/Users/ayun/CODE/MXM/MXM_AI_Ellenberg/SL2_Z/Q_learning/../util.py", line 50, in epsilon_greedy_search
    return self.best_move_for_a_state(state=state)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ayun/CODE/MXM/MXM_AI_Ellenberg/SL2_Z/Q_learning/../util.py", line 56, in best_move_for_a_state
  File "/Users/ayun/CODE/MXM/MXM_AI_Ellenberg/SL2_Z/Q_learning/../util.py", line 40, in get_next_possible

In [12]:
def access_Q_table(mat):
    return env.Q_table[matrix_to_tuple(mat)]

In [15]:
access_Q_table(k_sl2z_2s_gen[0])

0

In [None]:
# test with the other dataframe. 
test_df = pd.read_csv("../Data_Generation/Data_files/subset_sl2_Z_3s_test.csv")

In [18]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
sum(test_df['num_moves_Q_learning_needs']!=100)/test_df.shape[0]

The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: 


0.2668

All paths from $A$ to $I$ for $A \in \Gamma$ that take less than 20 matrix multiplications

In [None]:
print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin 
sum(test_df['num_moves_Q_learning_needs']<20)/sum(test_df['num_moves_Q_learning_needs']!=100)

In [None]:
append_info_states_csv("../Data_Generation/Data_files/sl2_Z_test.csv", 
                       "../Data_Generation/Data_files/train_rows_SL2Z_Q_learn.csv", 
                       "../Data_Generation/Data_files/test_rows_SL2Z_Q_learn.csv")