In [15]:
import numpy as np
import window
from IPython.display import clear_output

# Side length of the square playfield: passed as both width and height of the
# window, used to normalise puck coordinates before they enter the network,
# and used to clamp the puck to [-SIZE/2, SIZE/2] on each axis.
SIZE = 400
# Step budget per episode: the main loop forces a reset once its step counter
# exceeds this value.
TIMESTEPS = 256

class Puck(object):
    """Point agent whose move is produced by a small two-layer tanh network.

    The network maps the normalised position (x / SIZE, y / SIZE, bias) to a
    2-vector that is used as the mean of the sampled displacement.  Episodes
    pushed into ``memory`` are replayed by ``train`` to update the weights.
    """

    def __init__(self):
        # Position and (currently unused by move()) velocity.
        self.x, self.y = 0.0, 0.0
        self.xv, self.yv = 0.0, 0.0

        self.init_network()

    def init_network(self):
        """(Re)initialise activations, weights and the replay memory.

        Calling this on an existing puck discards both the learned weights
        and every stored episode.
        """
        hdim = 16

        # Activation and its derivative (expressed in terms of the
        # pre-activation value).
        self.act = lambda x: np.tanh(x)
        self.act_p = lambda x: 1 - np.tanh(x)**2

        # The extra input row / hidden row holds the bias weights.
        self.W1 = np.random.randn(2 + 1, hdim) * 0.1
        self.W2 = np.random.randn(hdim + 1, 2) * 0.1

        # Replay memory: a list of episodes, each a list of step dicts.
        self.memory = []

    def ff(self, x, y):
        """Run a forward pass for position (x, y).

        Returns:
            Tuple ``(fet, z1, h1, z2, h2)``: the (1, 3) feature row, hidden
            pre-activation, hidden activation with a bias column appended,
            output pre-activation, and the (1, 2) tanh output.
        """
        fet = np.array([[x / SIZE, y / SIZE, 1]])

        z1 = fet.dot(self.W1)
        h1 = self.act(z1)
        # Append a constant 1 so W2's last row acts as the output bias.
        h1 = np.concatenate((h1, np.ones([1, 1])), axis=1)

        z2 = h1.dot(self.W2)
        h2 = self.act(z2)

        return (fet, z1, h1, z2, h2)

    def fit(self, state, action, rtrace):
        """Apply one gradient step for a single (state, action) sample.

        Args:
            state: indexable pair (x, y); also used arithmetically below, so
                it must broadcast against ``action``.
            action: the action taken in ``state`` (e.g. the (1, 2) array
                returned by ``sample_action``).
            rtrace: scalar weight multiplied into the step size.
        """
        fet, z1, h1, z2, h2 = self.ff(state[0], state[1])

        # NOTE(review): the output error is built from the raw state, and the
        # network output h2 is never used.  An update that moves the mean
        # toward the taken action would use (h2 - action) -- confirm which
        # one is intended before changing it.
        dh2 = (state - action)
        dz2 = np.multiply(dh2, self.act_p(z2))

        # Column (h1.T) times row (dz2) broadcasts to the outer product,
        # i.e. the gradient with the same shape as W2.
        g2 = np.multiply(h1.T, dz2)

        # Back-propagate through W2 and drop the bias column, which has no
        # upstream pre-activation.
        dh1 = self.W2.dot(dz2.T).T[:, :-1]
        dz1 = np.multiply(dh1, self.act_p(z1))

        g1 = np.multiply(fet.T, dz1)

        self.W2 -= g2 * 0.005 * rtrace
        self.W1 -= g1 * 0.005 * rtrace

    def train(self, iters=10):
        """Replay ``iters`` episodes drawn uniformly at random from memory.

        Does nothing when the memory is empty (previously this raised
        ValueError from ``np.random.randint(0, 0)``).
        """
        if not self.memory:
            return

        for _ in range(iters):
            ep = self.memory[np.random.randint(0, len(self.memory))]
            self.train_ep(ep)

    def train_ep(self, ep):
        """Fit every step of one episode, visiting the steps in random order.

        Each step is weighted by a linear ramp from 0 at the first step to
        the final step's reward at the last step.  An empty episode is
        ignored.

        Args:
            ep: sequence of step dicts with keys 's0', 'a0' and 'r'.
        """
        if len(ep) == 0:
            return

        rtrace = np.linspace(0.0, ep[-1]['r'], len(ep))
        indices = np.arange(0, len(ep))
        np.random.shuffle(indices)
        for i in indices:
            xp = ep[i]
            self.fit(xp['s0'], xp['a0'], rtrace[i])

    def sample_action(self, x, y, stddev=1):
        """Sample a (1, 2) action for position (x, y).

        The network output is the mean; independent Gaussian noise scaled by
        ``stddev`` is added.  The noise is drawn even when ``stddev`` is 0.
        """
        _, _, _, _, mean = self.ff(x, y)

        return mean + np.random.randn(1, 2) * stddev

    def move(self, a):
        """Displace the puck by action ``a`` and clamp it to the playfield."""
        self.x += a[0, 0]
        self.y += a[0, 1]
        self.x = np.clip(self.x, -SIZE/2, SIZE/2)
        self.y = np.clip(self.y, -SIZE/2, SIZE/2)

    def render(self, window, cam):
        """Draw the puck as an 8x8 white rectangle offset by ``cam``."""
        window.draw_rect(self.x + cam[0], self.y + cam[1], 8, 8, (255, 255, 255), thickness=0)
        
class Rect(object):
    """Axis-aligned rectangle stored as a centre point plus full extents."""

    def __init__(self, x, y, w, h):
        # (x, y) is the centre; w and h are the full width and height.
        self.x, self.y = x, y
        self.w, self.h = w, h

    def point_inside(self, px, py):
        """Return whether (px, py) lies strictly inside the rectangle."""
        x_ok = abs(px - self.x) < self.w/2
        return x_ok and abs(py - self.y) < self.h/2

    def collide(self, puck):
        """Nudge ``puck`` out horizontally if it is inside the rectangle.

        Returns True when the puck was inside, False otherwise.  Only the
        x axis is resolved; a puck exactly on the vertical centre line is
        reported as colliding but is left untouched.
        """
        if not self.point_inside(puck.x, puck.y):
            return False

        if puck.x < self.x:
            step = -.2
        elif puck.x > self.x:
            step = .2
        else:
            return True

        puck.x += step
        puck.xv = 0.0
        return True

    def render(self, window, cam, col=(255, 255, 255)):
        """Draw the rectangle in colour ``col``, offset by ``cam``."""
        # draw_rect is given the corner derived from the centre, not the
        # centre itself.
        corner_x = (self.x-self.w/2) + cam[0]
        corner_y = (self.y-self.h/2) + cam[1]
        window.draw_rect(corner_x, corner_y, self.w, self.h, col, thickness=0)

# --- Simulation set-up -------------------------------------------------------
w = window.Window(caption='Puck', width=SIZE, height=SIZE)

# Offset added to world coordinates when drawing, so that world (0, 0) is
# drawn at (SIZE/2, SIZE/2).
camera = [SIZE/2, SIZE/2]

puck = Puck()

# reward square
reward = Rect(0, 0, 80, 80)

# walls (none by default); entering one ends the episode with reward -1,
# e.g. [Rect(.2, 0, .3, .3)]
walls = []

# Exploration noise applied to sampled actions; adjustable with UP / DOWN.
stddev = 32.0
ep = 0

speedup = False
running = True
reset = False

# --- Main loop: one iteration per episode ------------------------------------
while running:

    # Replay stored episodes once a few rewarded ones have been collected.
    if len(puck.memory) > 2:
        clear_output(wait=True)
        print('episode: {} training...'.format(ep))
        puck.train(10)

    # Start each episode from a uniformly random position in the playfield.
    puck.x = np.random.random_sample() * SIZE - SIZE/2
    puck.y = np.random.random_sample() * SIZE - SIZE/2

    s0 = np.array([puck.x, puck.y])
    a0 = puck.sample_action(puck.x, puck.y, stddev)

    episode = []

    ep += 1
    t = 0
    reset = False
    # Also stop stepping as soon as w.update() reports that the window is no
    # longer running; otherwise the episode would keep clearing and drawing
    # until it ended on its own (up to TIMESTEPS frames).
    while running and not reset:
        w.clear()

        # R: throw away the network (and replay memory) and end the episode.
        if w.pressed(window.key.R):
            puck.init_network()
            reset = True

        # S: toggle fast-forward, which skips drawing on most episodes.
        if w.pressed(window.key.S):
            speedup = not speedup

        if w.pressed(window.key.UP):
            stddev += 1
        if w.pressed(window.key.DOWN):
            stddev -= 1

        stddev = np.clip(stddev, 0, 32)

        w.reset_keys()

        # act
        puck.move(a0)

        # reward: +1 inside the reward square, -1 inside any wall, else 0;
        # any non-zero reward ends the episode.
        r = 0.0

        if reward.point_inside(puck.x, puck.y):
            r = 1.0
            reset = True
        else:
            for wall in walls:
                if wall.point_inside(puck.x, puck.y):
                    r = -1.0
                    reset = True

        s1 = np.array([puck.x, puck.y])
        a1 = puck.sample_action(puck.x, puck.y, stddev)

        episode.append({'s0': s0, 'a0': a0, 'r': r, 's1': s1, 'a1': a1})

        # drawing (every frame normally, only every 20th episode in speedup)
        if not speedup or (ep)%20 == 0:
            reward.render(w, camera, (0, 255, 0) if r > 0 else (0, 0, 255))

            wall_col = (255, 255, 255) if r < 0 else (255, 0, 0)
            for wall in walls:
                wall.render(w, camera, wall_col)

            puck.render(w, camera)
            w.draw_matrices([puck.W1, puck.W2], x=10, y=SIZE - 10)

            # draw policy as a grid of arrows (noise-free mean action, x2)
            for xx in np.linspace(-SIZE/2, SIZE/2, 8):
                for yy in np.linspace(-SIZE/2, SIZE/2, 8):
                    a = puck.sample_action(xx, yy, stddev=0.0)
                    verts = ((xx + camera[0], yy + camera[1]),
                             (xx + camera[0] + a[0, 0]*2, yy + camera[1] + a[0, 1]*2))
                    w.draw_poly(verts, (255, 255, 255))

            w.draw_text('episode: {} memories: {} stddev: {}'.format(
                        ep, len(puck.memory), np.round(stddev, 2)), size=12, p=(10, 10))

        if speedup:
            # NOTE(review): presumably drops the window's text label while
            # fast-forwarding -- confirm against the window module.
            w.label = None

        running = w.update()
        t += 1
        if t > TIMESTEPS:
            reset = True

        # Only episodes that ended with a non-zero reward are kept for replay.
        if reset:
            if len(episode) > 1 and episode[-1]['r'] != 0:
                puck.memory.append(episode)

        s0 = s1.copy()
        a0 = a1

print('done')

episode: 13 training...
done
