In [53]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import math

import sys
sys.path.append('../')

from util import *


In [64]:
max_reward = 100
step_penalty = -1

def getReward(matrix):
    if (matrix==np.identity(2)).all():
        return max_reward
    else:
        return step_penalty

In [65]:
base_dir = '../Data_Generation/Data_files/'
base_fp = base_dir + 'sl2_Z.csv'
generators = k_sl2z_gen

df = pd.read_csv(base_fp)

In [66]:
env = TabularQEnv(generators, defaultdict(lambda: 0), getReward, max_reward)

EPISODES = 30000
LEARNING_RATE = .9
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .9999

random.seed(42)

# starts with an estimate of zero reward for each state.
# adapted from ChatGPT
episode_reward_record = deque(maxlen=500)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [int(cur_row['val1']), int(cur_row['val2'])], 
        [int(cur_row['val3']), int(cur_row['val4'])]
        ])

    index = 1

    while (not done):
        # perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        obs, reward, done = env.step(LEARNING_RATE, DISCOUNT_FACTOR, EPSILON, obs)

        episode_reward += reward # update episode reward

        index += 1
        # if we take more than 100 steps, end this iteration early (we are probably not making progress)
        if index > 100:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if (i+1)%500 ==0 and i>0:
        print("Episode {i}\teps: {eps:.3f}\tAvg rwd: {rwd:.3f}".format(i=i+1, eps=EPSILON, rwd=sum(list(episode_reward_record))/500))



  [int(cur_row['val1']), int(cur_row['val2'])],
  [int(cur_row['val3']), int(cur_row['val4'])]


Episode 500	eps: 0.951	Avg rwd: -93.714
Episode 1000	eps: 0.905	Avg rwd: -92.858
Episode 1500	eps: 0.861	Avg rwd: -84.416
Episode 2000	eps: 0.819	Avg rwd: -66.762
Episode 2500	eps: 0.779	Avg rwd: -52.202
Episode 3000	eps: 0.741	Avg rwd: -37.990
Episode 3500	eps: 0.705	Avg rwd: -10.186
Episode 4000	eps: 0.670	Avg rwd: 11.762
Episode 4500	eps: 0.638	Avg rwd: 22.352
Episode 5000	eps: 0.607	Avg rwd: 44.602
Episode 5500	eps: 0.577	Avg rwd: 58.050
Episode 6000	eps: 0.549	Avg rwd: 61.030
Episode 6500	eps: 0.522	Avg rwd: 64.400
Episode 7000	eps: 0.497	Avg rwd: 70.676
Episode 7500	eps: 0.472	Avg rwd: 70.752
Episode 8000	eps: 0.449	Avg rwd: 72.726
Episode 8500	eps: 0.427	Avg rwd: 75.504
Episode 9000	eps: 0.407	Avg rwd: 75.994
Episode 9500	eps: 0.387	Avg rwd: 77.792
Episode 10000	eps: 0.368	Avg rwd: 76.474
Episode 10500	eps: 0.350	Avg rwd: 78.032
Episode 11000	eps: 0.333	Avg rwd: 81.162
Episode 11500	eps: 0.317	Avg rwd: 82.904
Episode 12000	eps: 0.301	Avg rwd: 81.620
Episode 12500	eps: 0.286	Avg 

In [67]:
def access_Q_table(mat):
    return env.Q_table[matrix_to_tuple(mat)]

In [68]:
test_fp = base_dir + 'test_rows_SL2Z_Q_learn.csv'
train_fp = base_dir + 'train_rows_SL2Z_Q_learn.csv'

In [69]:
# test with the other dataframe. 
test_df = pd.read_csv(base_fp)
test_df['num_moves_Q_learning_needs'] = test_df.apply(lambda row: env.play(df_row_to_mat(row)), axis=1)

In [70]:
print("The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: ")
print(sum(test_df['num_moves_Q_learning_needs']<=50)/test_df.shape[0])

print("Of these, the proportion of times where we learned a path that was < 20 moves: ")
# encouraging because all of these were generated as sequences of 30 moves
# so we've found significantly faster paths back to the origin for almost all moves that we find a path to the origin 
print(sum(test_df['num_moves_Q_learning_needs']<20)/sum(test_df['num_moves_Q_learning_needs']<=50))

The proportion of starting positions in the test dataset that we can find a route to the origin that's <50 steps: 
1.0
Of these, the proportion of times where we learned a path that was < 20 moves: 
0.996


In [71]:
def append_info_states_csv(fname_i, of_train, of_test, Q_env, prop_train=0.7):
    """
    Given CSV with various states and tabular-Q environment trained on a set containing those states, 
    estimate next best move + number of moves to identity, and append them to the state information.
    Then, split that dataset into train/test and write to corresponding CSVs
    Args:
        fname_i: csv to append to
        of_train: where to write final train csv
        of_test: where to write final test csv
        Q_env: TabularQEnv used to make predictons
        prop_train: proportion of data to be used for training
    """
    test_df = pd.read_csv(fname_i)
    test_df['num_moves_Q_learning_needs'] = test_df.apply(lambda row: Q_env.play(df_row_to_mat(row)), axis=1)
    filtered_df = test_df[test_df['num_moves_Q_learning_needs']!=100]
    filtered_df['first_move_by_Q_learning'] = filtered_df.apply(lambda row: Q_env.best_move(df_row_to_mat(row)), axis=1)

    print(filtered_df.shape)

    bound = int(filtered_df.shape[0] * prop_train)
    train = filtered_df.iloc[1:bound]
    test = filtered_df.iloc[bound:filtered_df.shape[0]]

    train.to_csv(of_train, index=False)
    test.to_csv(of_test, index=False)

In [72]:
append_info_states_csv(base_fp,
                       train_fp, 
                       test_fp,
                       env)

(10000, 6)
