In [1]:
import random
import numpy as np
import torch
from collections import deque
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt

import sys
sys.path.append('../')

from util import *

## Group Definition:
Let $A = \begin{bmatrix}1 & 2 \\ 0 & 1\end{bmatrix}$, $B = \begin{bmatrix}1 & 0 \\ 2 & 1\end{bmatrix}$.
The group $\Gamma \coloneqq \langle A,B\rangle \subseteq SL_2(\mathbb{Z})$ is an index $12$ subgroup. The diagonal entries are congruent to $1\pmod{4}$ and the non-diagonal entries and divisible by $2$. 

We define $C = A^{-1}$ and $D = B^{-1}$.

We can generate with any coset by starting at a representative from each coset and see if we get our way back to it?

In [2]:
MAX_REWARD = 100
STEP_PENALTY = -1


def getReward(matrix: torch.tensor) -> int:
    if (matrix == torch.eye(2)).all():
        return MAX_REWARD
    else:
        return STEP_PENALTY

Load data from ../Data_Generation/Data_files

In [3]:
generators = k_sl2z_2s_gen
subset = '_2s'

base_dir = '../Data_Generation/Data_files/'
base_fp = base_dir + 'points/sl2_Z' + subset + '.csv'
test_fp = base_dir + 'labeled_points/sl2_Z' + subset + '_test.csv'
train_fp = base_dir + 'labeled_points/sl2_Z' + subset + '_train.csv'

df = pd.read_csv(base_fp)



Verify that the diagonal entries are congruent to $1\pmod{4}$ and the non-diagonal entries and divisible by $2$. 

In [4]:
filter_df = df[df['val1'] % 4 == 1]
filter_df = filter_df[filter_df['val2'] % 2 == 0]
filter_df = filter_df[filter_df['val3'] % 2 == 0]
filter_df = filter_df[filter_df['val4'] % 4 == 1]
filter_df

Unnamed: 0,val1,val2,val3,val4
9,1.0,0.0,-2.0,1.0
22,5.0,-2.0,8.0,-3.0
28,5.0,-12.0,-2.0,5.0
34,1.0,2.0,2.0,5.0
37,1.0,-2.0,2.0,-3.0
...,...,...,...,...
9949,1.0,-8.0,0.0,1.0
9971,-27.0,-20.0,-4.0,-3.0
9974,-7.0,30.0,-4.0,17.0
9988,-7.0,24.0,2.0,-7.0


In [5]:
env = TabularQEnv(generators, defaultdict(lambda: 0), getReward, MAX_REWARD)

EPISODES = 30000
LEARNING_RATE = .9
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .9999

random.seed(42)

# adapted from ChatGPT
episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = torch.tensor([
        [int(cur_row['val1']), int(cur_row['val2'])], 
        [int(cur_row['val3']), int(cur_row['val4'])]
        ], dtype=torch.long)

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        obs, reward, done = env.step(LEARNING_RATE, DISCOUNT_FACTOR, EPSILON, obs)

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if (i+1)%500 ==0 and i>0:
        print("Episode {i}\teps: {eps:.3f}\tAvg rwd: {rwd:.3f}".format(i=i+1, eps=EPSILON, rwd=sum(list(episode_reward_record))/500))



  [int(cur_row['val1']), int(cur_row['val2'])],
  [int(cur_row['val3']), int(cur_row['val4'])]


Episode 500	eps: 0.951	Avg rwd: -20.000
Episode 1000	eps: 0.905	Avg rwd: -19.274
Episode 1500	eps: 0.861	Avg rwd: -20.000
Episode 2000	eps: 0.819	Avg rwd: -18.872
Episode 2500	eps: 0.779	Avg rwd: -18.448
Episode 3000	eps: 0.741	Avg rwd: -18.872
Episode 3500	eps: 0.705	Avg rwd: -18.544
Episode 4000	eps: 0.670	Avg rwd: -19.636
Episode 4500	eps: 0.638	Avg rwd: -17.052
Episode 5000	eps: 0.607	Avg rwd: -18.294
Episode 5500	eps: 0.577	Avg rwd: -16.148
Episode 6000	eps: 0.549	Avg rwd: -17.178
Episode 6500	eps: 0.522	Avg rwd: -18.168
Episode 7000	eps: 0.497	Avg rwd: -15.420
Episode 7500	eps: 0.472	Avg rwd: -18.410
Episode 8000	eps: 0.449	Avg rwd: -18.188
Episode 8500	eps: 0.427	Avg rwd: -16.526
Episode 9000	eps: 0.407	Avg rwd: -17.724
Episode 9500	eps: 0.387	Avg rwd: -16.908
Episode 10000	eps: 0.368	Avg rwd: -14.976
Episode 10500	eps: 0.350	Avg rwd: -17.316
Episode 11000	eps: 0.333	Avg rwd: -17.718
Episode 11500	eps: 0.317	Avg rwd: -17.302
Episode 12000	eps: 0.301	Avg rwd: -16.528
Episode 1250

In [6]:
def access_Q_table(mat):
    return env.Q_table[matrix_to_tuple(mat)]

In [8]:
# test with the other dataframe. 
test_df = pd.read_csv(base_fp)
test_df['num_moves_Q_learning_needs'] = test_df.apply(lambda row: env.play(df_row_to_mat(row)), axis=1)

In [10]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
print(sum(test_df['num_moves_Q_learning_needs']<=50)/test_df.shape[0])

print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin 
print(sum(test_df['num_moves_Q_learning_needs']<20)/sum(test_df['num_moves_Q_learning_needs']<=50))

The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: 
1.0
Of these, the proportion of times where we learned a path that was < 20 moves: 
1.0


In [None]:
def append_info_states_csv(fname_i, of_train, of_test, Q_env, prop_train=0.7):
    """
    Given CSV with various states and tabular-Q environment trained on a set containing those states, 
    estimate next best move + number of moves to identity, and append them to the state information.
    Then, split that dataset into train/test and write to corresponding CSVs
    Args:
        fname_i: csv to append to
        of_train: where to write final train csv
        of_test: where to write final test csv
        Q_env: TabularQEnv used to make predictons
        prop_train: proportion of data to be used for training
    """
    test_df = pd.read_csv(fname_i)
    test_df['num_moves_Q_learning_needs'] = test_df.apply(
        lambda row: Q_env.play(df_row_to_mat(row)), axis=1)
    filtered_df = test_df[test_df['num_moves_Q_learning_needs'] != 100]
    filtered_df['first_move_by_Q_learning'] = filtered_df.apply(
        lambda row: Q_env.best_move(df_row_to_mat(row)), axis=1)

    print(filtered_df.shape)

    bound = int(filtered_df.shape[0] * prop_train)
    train = filtered_df.iloc[1:bound]
    test = filtered_df.iloc[bound:filtered_df.shape[0]]

    train.to_csv(of_train, index=False)
    test.to_csv(of_test, index=False)

In [None]:
append_info_states_csv(base_fp,
                       train_fp,
                       test_fp,
                       env)