# This is a notebook for testing my implementation of LSPI

In [None]:
# Messing around with OpenAI Gym

In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [49]:
# Create the environment used by the rest of the notebook; generate_samples
# reads this module-level `env` directly.
env = gym.make('CartPole-v0')
#env = gym.make("NChain-v0")
# Reset once so the environment has an initial state.
env.reset()

array([ 0.00663644,  0.00141114, -0.04101978,  0.02104077])

# Implementation

In [160]:
def LSPI(basis_functions, gamma, epsilon, w, max_iterations=None):
    '''
    Compute the parameters of the policy, w, using the LSPI algorithm.

    Repeatedly calls LSTDQ with the current weights and stops as soon as
    two consecutive weight vectors are closer than epsilon (see converged).
    LSTDQ draws a new batch of samples on every call, so the iterates are
    noisy and convergence is not guaranteed; use max_iterations to bound
    the run.

    Inputs:
    basis_functions: list of basis functions
    gamma: float, discount factor
    epsilon: float, convergence threshold
    w: initial policy parameter vector
    max_iterations: optional int, maximum number of LSTDQ calls. None
        (the default) keeps iterating until convergence, as before.

    Outputs:
    w: the policy parameters produced by the last iteration
    '''
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        w_prev = w
        w = LSTDQ(basis_functions, gamma, w)
        iteration += 1

        if converged(w, w_prev, epsilon):
            break

        # Progress trace: only intermediate (not yet converged) weights
        # are printed. Single-argument form works on Python 2 and 3.
        print(w)
    return w

def converged(w, w_prev, epsilon):
    '''
    Determines if the policy parameters have converged based
    on whether or not the norm of the difference of w
    is less than the threshold epsilon.

    Inputs:
    w: a policy parameter vector (array or plain sequence of numbers)
    w_prev: the policy parameter vector from a previous iteration.
    epsilon: float, convergence threshold

    Outputs:
    True when norm(w - w_prev) < epsilon, False otherwise
    '''
    # Coerce both operands so plain lists/tuples are accepted as well;
    # subtracting two Python lists would raise a TypeError.
    difference = np.asarray(w) - np.asarray(w_prev)
    return np.linalg.norm(difference) < epsilon

def LSTDQ(basis_functions, gamma, w, n_episodes=10):
    '''
    Simple version of LSTDQ.

    Draws a fresh batch of transitions with generate_samples, accumulates
    the matrix A and vector b over that batch, and solves A w = b for the
    new weights.

    Inputs:
    basis_functions: list of basis functions
    gamma: float, discount factor
    w: current policy parameter vector; used to pick the action in each
       next state via get_policy_action
    n_episodes: int, number of episodes passed to generate_samples
        (defaults to 10, the value that used to be hard-coded)

    Outputs:
    w: the new policy parameter vector
    '''
    k = len(basis_functions)
    # An all-zero A might not have an inverse, so start from a small
    # multiple of the identity instead.
    A = np.identity(k) * 0.01
    b = np.zeros(k)

    sub_samples = generate_samples(n_episodes)

    for s, a, r, sp in sub_samples:
        phi = compute_phi(s, a, basis_functions)
        next_action = get_policy_action(sp, w, basis_functions)
        phi_p = compute_phi(sp, next_action, basis_functions)

        A += np.outer(phi, phi - gamma * phi_p)
        b += phi * r

    # Solve the linear system directly rather than forming inv(A) and
    # multiplying; same solution, fewer operations and better conditioned.
    return np.linalg.solve(A, b)
    
    

    
    
def LSTDQ_OPT(samples, basis_functions, gamma, w, sigma=0.1):
    '''
    Computes an approximation of the policy parameters based
    on the LSTDQ-OPT algorithm presented in the paper
    (presumably Lagoudakis & Parr, "Least-Squares Policy Iteration" --
    confirm).

    Instead of accumulating A and inverting it at the end, the inverse
    B = A^-1 is maintained directly and updated once per sample with the
    Sherman-Morrison formula for a rank-one change.

    Inputs:
    samples: iterable of tuples of the form (s,a,r,s')
    basis_functions: list of basis functions
    gamma: float, discount factor
    w: current policy parameter vector; used to pick the action in each
       next state via get_policy_action
    sigma: small positive float. A starts as sigma * I, so B starts as
       I / sigma.

    Outputs:
    w: the new policy parameter vector (B . b)

    Raises:
    ValueError: if sigma is not strictly positive
    '''
    if sigma <= 0:
        raise ValueError("sigma must be a small positive float")

    k = len(basis_functions)
    B = np.identity(k) / float(sigma)
    b = np.zeros(k)

    for s, a, r, sp in samples:
        phi = compute_phi(s, a, basis_functions)
        next_action = get_policy_action(sp, w, basis_functions)
        phi_p = compute_phi(sp, next_action, basis_functions)

        # The equivalent update on A would be A += outer(phi, diff).
        diff = phi - gamma * phi_p
        B_phi = np.dot(B, phi)
        # NOTE(review): a denominator of exactly zero means the updated A
        # would be singular; that case is not handled specially here.
        denominator = 1.0 + np.dot(diff, B_phi)
        B -= np.outer(B_phi, np.dot(diff, B)) / denominator
        b += phi * r

    return np.dot(B, b)
       

def compute_phi(s,a, basis_functions):
    '''
    Evaluate every basis function on the pair (s, a) and stack the
    results into the feature vector phi(s,a).

    Inputs:
    s: state
    a: action
    basis_functions: list of callables, each taking (s, a)

    Outputs:
    numpy array whose i-th entry is basis_functions[i](s, a)
    '''
    features = [feature(s, a) for feature in basis_functions]
    return np.array(features)
    
def get_policy_action(s, w, basis_functions, action_space=None):
    '''
    Given a parameterization for the policy,
    reconstruct the policy and query it to get
    the optimal action for state s. That is,
    the argmax over actions of phi(s,a).w

    Inputs:
    s: state
    w: policy parameters
    basis_functions: list of basis functions used to build phi(s,a)
    action_space: optional iterable of all possible actions; defaults
        to the two actions (0, 1) that used to be hard-coded

    Outputs:
    action a that the policy says is best. Ties go to the action that
    comes first in action_space. None is returned when no action scores
    above -inf (e.g. an empty action_space or only NaN scores).
    '''
    if action_space is None:
        action_space = (0, 1)

    a_max = None
    max_score = float("-inf")

    # Search action space for most valuable action
    for a in action_space:
        score = np.dot(compute_phi(s, a, basis_functions), w)
        # Strict comparison keeps the earliest action on ties.
        if score > max_score:
            max_score = score
            a_max = a

    return a_max
    

def get_basis_functions(env, k):
    '''
    Define some basis functions and return them in a list.

    The list holds a constant feature, radial features
    exp(-||s - c|| / 2) around three fixed centres, and two
    action-gated radial features.

    Inputs:
    env: unused. Earlier code sampled centres from env.observation_space
         and then immediately overwrote them with the fixed centres below,
         so the sampling has been dropped. The parameter is kept so
         existing calls still work.
    k: unused; kept so existing calls still work. Seven basis functions
       are always returned.

    Outputs:
    list of seven callables taking (s, a)
    '''
    # Fixed centres for the radial features.
    s1 = np.array([1, 1, 1, 1])
    s2 = np.array([0, 0, 0, 0])
    s3 = np.array([-1, 1, 0, -1])

    # %-formatting prints the same text on Python 2 and Python 3.
    print("s1: %s" % (s1,))
    print("s2: %s" % (s2,))

    def radial(s, centre):
        # NOTE(review): the norm is not squared, so this decays
        # exponentially in the distance rather than like a Gaussian --
        # confirm that is intended.
        return np.exp(-np.linalg.norm(s - centre) / 2.0)

    bfs = [
        lambda s, a: 1,
        lambda s, a: radial(s, s1),
        lambda s, a: radial(s, s2),
        lambda s, a: radial(s, s3),
        # NOTE(review): exact duplicate of the previous feature (same
        # centre s3). Kept so the feature count stays at seven; it adds
        # no information -- possibly a fourth centre was intended.
        lambda s, a: radial(s, s3),
        lambda s, a: int(a == 0) * radial(s, s3),
        lambda s, a: int(a == 1) * radial(s, s1),
    ]

    return bfs


def generate_samples(n_samples, n_steps=100, environment=None):
    '''
    Collect (s, a, r, s') transitions by running random-action episodes.

    Inputs:
    n_samples: int, number of episodes to run (not the number of
        transitions returned)
    n_steps: int, maximum number of steps per episode
    environment: optional environment object providing reset(),
        step(action) and action_space.sample(); defaults to the
        notebook-level `env`. step is assumed to return the 4-tuple
        (observation, reward, done, info).

    Outputs:
    object array of shape (number_of_transitions, 4); each row holds
    (s, a, r, s'). An episode stops contributing rows once the
    environment reports done, so there can be fewer than
    n_samples * n_steps rows.
    '''
    if environment is None:
        environment = env

    transitions = []
    for _ in range(n_samples):
        # Use the observation returned by reset/step as the state, so s
        # and s' come from the same source instead of peeking at the
        # wrapped environment's internal state attribute.
        s = environment.reset()
        for _ in range(n_steps):
            a = environment.action_space.sample()
            sp, r, done, info = environment.step(a)
            transitions.append((s, a, r, sp))
            if done:
                # Stepping a finished episode yields meaningless samples.
                break
            s = sp

    # Fill an object array cell by cell: np.array() on rows that mix
    # arrays and scalars is ragged and is rejected by recent NumPy.
    samples = np.empty((len(transitions), 4), dtype=object)
    for row, transition in enumerate(transitions):
        for col, item in enumerate(transition):
            samples[row, col] = item
    return samples
    

In [159]:
# Evaluate the third basis function on an arbitrary state/action pair.
# NOTE(review): `bfs` is only assigned in a later cell of this file, so
# that cell has to be run before this one.
bfs[2](np.array([10,2,3,4]),10)

0.0

In [161]:
# Build the feature set used by LSPI.
bfs = get_basis_functions(env, 10)

# Hyper-parameters: discount factor, convergence threshold and the number
# of features (one weight per basis function).
gamma = 0.1
epsilon = 0.00001
k = len(bfs)

# Start from the all-zero weight vector.
w = np.zeros(k)
#w_est = LSPI(bfs, gamma, epsilon, w)
#print w_est



s1: [ -1.79830083e+00  -3.15795228e+38   3.27502208e-03  -2.88198435e+38]
s2: [ -2.91864568e+00  -3.14682696e+38   1.34608990e-01  -1.60483486e+38]


In [162]:
# Inspect the upper bounds of the observation space; the output below
# shows two of the four components bounded only by ~3.4e38.
env.observation_space.high

array([  4.80000000e+00,   3.40282347e+38,   4.18879020e-01,
         3.40282347e+38])

In [176]:
# Reset the environment again; the value displayed below is what reset() returned.
env.reset()

array([ 0.02831727,  0.03855601, -0.02611418, -0.04275159])