In [2]:
import random
import numpy as np
from collections import deque
import numpy as np
import pandas as pd
from collections import defaultdict


In [3]:
def default_Q_value():
    return 0

def epsilon_greedy_search(Epsilon, qtable, state):
    if (random.random() < Epsilon):
        # 0 is 'apply matrix A', 1 is 'apply matrix B'
        # 2 is 'apply matrix C', 3 is 'apply matrix D'
        return random.choice([0, 1, 2, 3])
    else:
        # get the best move for the current state
        return best_move_for_a_state(Q_table=qtable, state=state)

def best_move_for_a_state(Q_table, state):
    candidates = []
    vals = []

    # I would like to return the best move for a given state
    for i in Q_table:
        if i[0] == str(state):
            candidates.append(i)
            vals.append(Q_table[i])
    if vals:
        return candidates[np.argmax(vals)][1]
    # if we don't have any values in our q-table for this state, just return a random action
    return random.choice([0, 1, 2, 3])

def max_a_prime(Q_table, state):
    candidates = []
    vals = []

    # I would like to return the maximum value of Q(s', a') over all possible a' values
    for i in Q_table:
        if i[0] == str(state):
            candidates.append(i)
            vals.append(Q_table[i])
    if vals:
        return Q_table[candidates[np.argmax(vals)]]
    return 0

In [4]:
# B is the inverse of A
A = np.array([[1, 1, 0], [0, 1, 0], [0, 0, 1]])
B = np.array([[1, -1, 0], [0, 1, 0], [0, 0, 1]])

# C is the inverse of D
C = np.array([[1, 0, 0], [0, 1, 1], [0, 0, 1]])
D = np.array([[1, 0, 0], [0, 1, -1], [0, 0, 1]])

# together, A, B, C, and D generate the heisenberg group

In [5]:
def getReward(matrix):
    if matrix[0][1]==1 and matrix[0][2]==0 and matrix[1][2]==0:
        return 20
    elif matrix[0][1]==0 and matrix[0][2]==0 and matrix[1][2]==1:
        return 20
    elif matrix[0][1]==-1 and matrix[0][2]==0 and matrix[1][2]==0:
        return 20
    elif matrix[0][1]==0 and matrix[0][2]==0 and matrix[1][2]==-1:
        return 20
    else:
        return 1/(1 + 8*(matrix[0][1] + matrix[0][2] + matrix[1][2]))

In [6]:
df = pd.read_csv("heisenberg_data.csv")

In [7]:
def get_next_step(oldObs, action):
    # action is always either 0, 1, 2, or 3
    next_state = []
    if action==0:
        next_state = oldObs @ A
    elif action==1:
        next_state = oldObs @ B
    elif action==2:
        next_state = oldObs @ C
    else:
        next_state = oldObs @ D
    curReward = getReward(next_state)
    done = curReward==20
    return (next_state, curReward, done)
    

In [8]:
# adapted from CS 540 HW 10
EPISODES = 200
LEARNING_RATE = .1
DISCOUNT_FACTOR = .99
EPSILON = 1
EPSILON_DECAY = .999

random.seed(1)

# starts with an estimate of zero reward for each state.
Q_table = defaultdict(default_Q_value) 

episode_reward_record = deque(maxlen=100)

for i in range(EPISODES):
    episode_reward = 0
    done = False
    # choose a random starting row
    # adapted from https://stackoverflow.com/questions/15923826/random-row-selection-in-pandas-dataframe
    cur_row = df.sample(1)
    obs = np.array([
        [1, int(cur_row['val1']), int(cur_row['val2'])], 
        [0, 1, int(cur_row['val3'])], 
        [0, 0, 1]
        ])

    index = 1

    # perform a=epsilon-greedy(Q, s), receive r, s'
    # r is the reward, s' is the next state
    while (not done):
        # want this to perform an epsilon greedy action 
        # Q(s, a) = (1-LEARNING_RATE)Q(s, a) + (LEARNING_RATE)(r + DISCOUNT_FACTOR(max a'(Q(s', a'))))
        action = epsilon_greedy_search(Epsilon=EPSILON, qtable=Q_table, state=obs)

        oldObs = obs
        obs,reward,done = get_next_step(oldObs, action)
        Q_table[(str(oldObs), action)] = (1-LEARNING_RATE) * Q_table[(str(oldObs), action)] + (LEARNING_RATE) * (reward + DISCOUNT_FACTOR * (max_a_prime(Q_table, obs)))

        episode_reward += reward # update episode reward

        index += 1
        if index > 40:
            done=True

    # decay the epsilon
    EPSILON *= EPSILON_DECAY

    # record the reward for this episode
    episode_reward_record.append(episode_reward) 

    if i%10 ==0 and i>0:
        print("LAST 10 EPISODE AVERAGE REWARD: " + str(sum(list(episode_reward_record))/100))
        print("EPSILON: " + str(EPSILON) )



Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
LAST 10 EPISODE AVERAGE REWARD: 0.02577302152238564
EPSILON: 0.9890548353295385
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
LAST 10 EPISODE AVERAGE REWARD: 0.1551848755564767
EPSILON: 0.9792086759647052
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
LAST 10 EPISODE AVERAGE REWARD: 0.2921262567131435
EPSILON: 0.9694605362958227
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
LAST 10 EPISODE AVERAGE REWARD: 0.4389392626688959
EPSILON: 0.959809440525076
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
LAST 10 EPISODE AVERAGE REWARD: 0.4834093853774489
EPSILON: 0.9502544225688344
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decaying?
Decay

In [21]:
Q_table['[[ 1  1 19]\n [ 0  1 -6]\n [ 0  0  1]]', 2]

0.0007751937984496124

In [26]:
Q_table['[[1 2 0]\n [0 1 0]\n [0 0 1]]', 2]

0

In [28]:
Q_table['[[ 1  1 10]\n [ 0  1  2]\n [ 0  0  1]]', 2]

0.001844352057595638

In [29]:
myArr = np.array([[1, 1, 10], [0, 1, 2], [0, 0, 1]])

In [30]:
Q_table[str(myArr), 2]

0.001844352057595638