# This is a notebook for testing my implementation of LSPI

In [None]:
# Messing around with OpenAI Gym

In [2]:
import gym
import numpy as np
# Create the CartPole-v0 environment and reset it; the recorded output of
# reset() is a 4-element array.
env = gym.make('CartPole-v0')
env.reset()


array([-0.00710112,  0.02362306, -0.02358237,  0.01782232])

In [3]:
# Render the environment and apply action 0 for 100 consecutive steps.
# NOTE(review): the value returned by env.step() is discarded, so the loop
# keeps stepping even if the episode has ended -- confirm this is intended.
for _ in range(100):
    env.render()
    #env.step(env.action_space.sample()) # take a random action
    env.step(0)

No handlers could be found for logger "gym.envs.classic_control.cartpole"


In [2]:
# Reset the environment again; the recorded output is a new 4-element array.
env.reset()

array([ 0.04029155,  0.006834  , -0.01659412,  0.00514427])

In [29]:
# Draw one random action from the action space (0 in the recorded output).
env.action_space.sample()

0

In [7]:
# Inspect the observation space; the recorded output is Box(4,).
env.observation_space

Box(4,)

In [24]:
# Take a single random step. The recorded output is a 4-tuple
# (array, 1.0, False, {}) -- presumably (observation, reward, done, info);
# confirm against the Gym version in use.
env.step(env.action_space.sample())

(array([ 0.15534137,  0.40782107, -0.19749603, -0.83199806]), 1.0, False, {})

# Implementation

In [45]:
def LSPI(samples, basis_functions, gamma, epsilon, w, max_iterations=None):
    '''
    Compute the parameters of the policy, w, using the LSPI algorithm.

    Repeatedly re-estimates the weights with LSTDQ_OPT, feeding every result
    back in as the policy to evaluate next, until two consecutive weight
    vectors are closer than epsilon (as decided by converged).

    Inputs:
    samples: list of tuples of the form (s,a,r,s')
    basis_functions: list of basis functions
    gamma: float, discount factor
    epsilon: float, convergence threshold
    w: initial policy parameter vector
    max_iterations: optional int, upper bound on the number of LSTDQ_OPT
        calls. None (the default) iterates until convergence, however long
        that takes. At least one iteration is always performed.

    Outputs:
    w: the policy parameters of the last iteration; these are the converged
       parameters unless the max_iterations cap stopped the loop first.
    '''
    iterations = 0
    w_prev = w
    while True:
        w = LSTDQ_OPT(samples, basis_functions, gamma, w)
        iterations += 1

        if converged(w, w_prev, epsilon):
            break
        # Policy iteration is not guaranteed to settle (the weights can
        # oscillate), so let the caller bound the amount of work.
        if max_iterations is not None and iterations >= max_iterations:
            break

        w_prev = w

    return w

def converged(w, w_prev, epsilon, sigma = 0.1):
    '''
    Report whether two successive policy parameter vectors are close enough
    to stop iterating, i.e. whether the norm of their difference is strictly
    below the threshold epsilon.

    Inputs:
    w: a policy parameter vector
    w_prev: the policy parameter vector from the previous iteration
    epsilon: float, convergence threshold
    sigma: accepted for call compatibility; not used by this function

    Outputs:
    True when norm(w - w_prev) < epsilon, False otherwise
    '''
    change = np.linalg.norm(w - w_prev)
    return change < epsilon

def LSTDQ(samples, basis_functions, gamma, w, action_space=None):
    '''
    Simple version of LSTDQ.

    Evaluates the greedy policy induced by w. It accumulates

        A = 0.1*I + sum_i phi_i (phi_i - gamma * phi'_i)^T
        b = sum_i phi_i * r_i

    over all samples, where phi_i = phi(s_i, a_i) and phi'_i is the feature
    vector of the successor state paired with the action that maximises
    phi(s'_i, a).w, and returns the solution of A x = b.

    Inputs:
    samples: list of tuples of the form (s,a,r,s')
    basis_functions: list of basis functions
    gamma: float, discount factor
    w: parameter vector of the policy being evaluated
    action_space: optional iterable of all possible actions. When omitted,
        the distinct actions that occur in samples are used (actions are
        compared with ==, first-seen order is kept).

    Outputs:
    1-D array of length len(basis_functions) with the new parameters

    Raises:
    ValueError if no action could be selected for a successor state
    (for example because action_space is empty).
    numpy.linalg.LinAlgError if A is singular.
    '''
    k = len(basis_functions)
    # An all-zero start might not have an inverse, so begin from a small
    # multiple of the identity instead.
    A = np.identity(k) * 0.1
    b = np.zeros(k)

    if action_space is None:
        action_space = []
        for _, seen_action, _, _ in samples:
            if seen_action not in action_space:
                action_space.append(seen_action)
    else:
        # Materialise once: the actions are scanned for every sample.
        action_space = list(action_space)

    for s, a, r, sp in samples:
        phi = compute_phi(s, a, basis_functions)

        # Features of the successor state under the greedy action; a strict
        # comparison means the first best-scoring action wins ties.
        phi_p = None
        best_score = float("-inf")
        for candidate in action_space:
            candidate_phi = compute_phi(sp, candidate, basis_functions)
            score = np.dot(candidate_phi, w)
            if score > best_score:
                best_score = score
                phi_p = candidate_phi
        if phi_p is None:
            raise ValueError(
                "no action could be selected for successor state %r" % (sp,))

        # Rank-one update: this must be the outer product. With 1-D arrays
        # `phi * (...).T` is only an elementwise product.
        A = A + np.outer(phi, phi - gamma * phi_p)
        b = b + phi * r

    # Solve A x = b directly; `inv(A) * b` would broadcast elementwise and
    # produce a k-by-k matrix instead of a length-k vector.
    return np.linalg.solve(A, b)
    
    

    
    
def LSTDQ_OPT(samples, basis_functions, gamma, w, sigma=0.1, action_space=None):
    '''
    Computes an approximation of the policy parameters based
    on the LSTDQ-OPT algorithm presented in the paper.

    Instead of accumulating A and inverting it at the end, the inverse
    B = A^-1 is maintained directly. It starts at (1/sigma)*I and every
    sample applies the Sherman-Morrison rank-one update

        B <- B - (B phi)((phi - gamma*phi')^T B) / (1 + (phi - gamma*phi')^T B phi)
        b <- b + phi * r

    where phi = phi(s,a) and phi' is the feature vector of s' paired with
    the action that maximises phi(s',a).w. The result is B b.

    Inputs:
    samples: list of tuples of the form (s,a,r,s')
    basis_functions: list of basis functions
    gamma: float, discount factor
    w: parameter vector of the policy being evaluated
    sigma: small positive float; B is initialised to (1/sigma)*I
    action_space: optional iterable of all possible actions. When omitted,
        the distinct actions that occur in samples are used (actions are
        compared with ==, first-seen order is kept).

    Outputs:
    1-D array of length len(basis_functions) with the new parameters

    Raises:
    ValueError if sigma is not positive, or if no action could be selected
    for a successor state (for example because action_space is empty).
    numpy.linalg.LinAlgError if a rank-one update is singular.
    '''
    if sigma <= 0:
        raise ValueError("sigma must be a positive number")

    k = len(basis_functions)
    B = np.identity(k) / float(sigma)
    b = np.zeros(k)

    if action_space is None:
        action_space = []
        for _, seen_action, _, _ in samples:
            if seen_action not in action_space:
                action_space.append(seen_action)
    else:
        # Materialise once: the actions are scanned for every sample.
        action_space = list(action_space)

    for s, a, r, sp in samples:
        phi = compute_phi(s, a, basis_functions)

        # Features of the successor state under the greedy action; a strict
        # comparison means the first best-scoring action wins ties.
        phi_p = None
        best_score = float("-inf")
        for candidate in action_space:
            candidate_phi = compute_phi(sp, candidate, basis_functions)
            score = np.dot(candidate_phi, w)
            if score > best_score:
                best_score = score
                phi_p = candidate_phi
        if phi_p is None:
            raise ValueError(
                "no action could be selected for successor state %r" % (sp,))

        diff = phi - gamma * phi_p
        B_phi = B.dot(phi)        # B phi
        diff_B = diff.dot(B)      # (phi - gamma*phi')^T B
        denom = 1.0 + diff_B.dot(phi)
        if denom == 0.0:
            # The implicitly updated A has no inverse.
            raise np.linalg.LinAlgError(
                "singular rank-one update in LSTDQ_OPT")

        B = B - np.outer(B_phi, diff_B) / denom
        b = b + phi * r

    return B.dot(b)
       

def compute_phi(s, a, basis_functions):
    '''
    Evaluate every basis function on the pair (s, a) and stack the results
    into the feature vector phi(s,a).

    Inputs:
    s: state
    a: action
    basis_functions: list of basis functions, each called as bf(s, a)

    Outputs:
    numpy array whose i-th entry is the value of the i-th basis function
    '''
    features = []
    for basis in basis_functions:
        features.append(basis(s, a))
    return np.array(features)
    
def get_policy_action(s, w, action_space, basis_functions=None):
    '''
    Given a parameterization for the policy,
    reconstruct the policy and query it to get
    the optimal action for state s. That is,
    the argmax over actions of phi(s,a).w

    Inputs:
    s: state
    w: policy parameters
    action_space: iterable of all possible actions
    basis_functions: list of basis functions used to build phi(s,a).
        It defaults to None only so that the original three-argument call
        form is still accepted; it must be supplied whenever action_space
        is non-empty.

    Outputs:
    action a that the policy says is best (the first one on ties), or None
    when action_space is empty

    Raises:
    TypeError if an action has to be scored but basis_functions is None
    '''
    a_max = None
    max_score = float("-inf")

    # Search action space for most valuable action
    for a in action_space:
        if basis_functions is None:
            raise TypeError(
                "get_policy_action() needs basis_functions to score actions")
        # compute_phi requires the basis functions as its third argument.
        score = np.dot(compute_phi(s, a, basis_functions), w)
        # update if we found something better
        if score > max_score:
            max_score = score
            a_max = a

    return a_max
    


In [52]:
# Sanity check of np.dot on two 1-D vectors: 1*4 + 2*5 + 3*6 = 32,
# which matches the recorded output.
x = np.array([1,2,3])
y = np.array([4,5,6])
np.dot(x.T,y)  # .T leaves a 1-D array unchanged, so this is the plain inner product

32

In [23]:
# Check that a list of 3-tuples can be unpacked directly in the for-loop
# header.
x = [(1,2,3), (4,5,6)]
for a,b,c in x:
    # A single %-formatted string prints "1 2 3" both with the Python 2
    # print statement and with the Python 3 print() function.
    print("%s %s %s" % (a, b, c))

1 2 3
4 5 6


In [43]:
# Three toy basis functions used to try out compute_phi.
def basis_1(s, a):
    '''Toy feature: state plus action.'''
    return s + a


def basis_2(s, a):
    '''Toy feature: state times action.'''
    return s * a


def basis_3(s, a):
    '''Toy feature: state raised to the power of the action.'''
    return s ** a


bfs = [basis_1, basis_2, basis_3]

In [44]:
# Smoke test: phi(4, 5) should be [4+5, 4*5, 4**5] = [9, 20, 1024],
# which matches the recorded output.
compute_phi(4,5, bfs)

array([   9,   20, 1024])