In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np

import pygame
from pygame.locals import (QUIT, KEYDOWN, K_ESCAPE, MOUSEBUTTONDOWN, K_RIGHT, K_LEFT, K_r, K_s)

import Box2D
from Box2D.b2 import (world, polygonShape, staticBody, dynamicBody, circleShape, fixtureDef)
from Box2D import b2Filter

from IPython.display import clear_output


# RL Constants
GAMMA = 0.98 # discount rate applied to the next-state estimate in the TD targets

# Physics and rendering constants
PPM = 10.0  # pixels per meter; scale factor between screen and physics coordinates
TARGET_FPS = 60  # frame-rate cap passed to clock.tick() when rendering
TIME_STEP = 1.0 / TARGET_FPS  # duration handed to world.Step() for each physics step
SCREEN_WIDTH, SCREEN_HEIGHT = 600, 280  # window size in pixels
# RGBA fill colour used by render(), keyed by Box2D body type.
colors = {
    staticBody: (128, 128, 128, 255),
    dynamicBody: (255, 255, 255, 255),
}

# Module-level side effects: create the pygame window and the frame-rate clock.
screen = pygame.display.set_mode((SCREEN_WIDTH, SCREEN_HEIGHT), 0, 32)
pygame.display.set_caption('Stumbly')
clock = pygame.time.Clock()

# NOTE: this rebinds the imported class name `world` to the world *instance*;
# after this line `world` can no longer be used to construct another world.
world = world(gravity=(0, -20), doSleep=False)

# Add a floor
# Static body at the world origin; the box dimensions are given in meters
# (pixels divided by PPM) and are 1.5x the screen width.
# NOTE(review): pybox2d's `box=` is presumably (half-width, half-height) - confirm.
floor = world.CreateStaticBody(
    position=(0, 0),
    shapes=polygonShape(box=(SCREEN_WIDTH/PPM * 1.5, 1)),
)

def to_pybox2d(p):
    """Convert a screen point (pixels) to physics-world coordinates (meters).

    The y axis is flipped about SCREEN_HEIGHT and both components are
    divided by PPM. Returns a two-element list ``[x, y]``.
    """
    x_px, y_px = p[0], p[1]
    x_m = x_px / PPM
    y_m = (SCREEN_HEIGHT - y_px) / PPM
    return [x_m, y_m]

def to_screen(p):
    """Convert a physics-world point (meters) to screen coordinates (pixels).

    Both components are scaled by PPM and the y axis is flipped about
    SCREEN_HEIGHT. Returns a two-element list ``[x, y]``.
    """
    x_m, y_m = p[0], p[1]
    x_px = x_m * PPM
    y_px = SCREEN_HEIGHT - y_m * PPM
    return [x_px, y_px]

def render():
    """Clear the screen and draw every fixture of every body in ``world``.

    Polygon fixtures are drawn as filled polygons and circle fixtures as
    filled circles, using the colour registered in ``colors`` for the body's
    type. Fixtures of any other shape type are skipped. The caller is
    responsible for flipping the display afterwards.
    """
    screen.fill((0, 0, 0))
    for body in world.bodies:
        for fixture in body.fixtures:
            shape = fixture.shape
            if isinstance(shape, polygonShape):
                # Vertices are stored in body-local coordinates: move them to
                # world space, scale to pixels, then flip y for the screen.
                vertices = [(body.transform * v) * PPM for v in shape.vertices]
                vertices = [(v[0], SCREEN_HEIGHT - v[1]) for v in vertices]
                pygame.draw.polygon(screen, colors[body.type], vertices)
            elif isinstance(shape, circleShape):
                # The circle's centre is shape.pos in body-local coordinates,
                # so it must go through the body transform; using
                # body.position alone is only right for circles centred on
                # the body origin.
                pos = to_screen(body.transform * shape.pos)
                radius = int(shape.radius * PPM)
                pygame.draw.circle(screen, colors[body.type], (int(pos[0]), int(pos[1])), radius)

    
# Neural Network
def bias_add(x):
    """Append a constant-1 bias column to a 2-D array.

    Args:
        x: array-like of shape (rows, features).

    Returns:
        Array of shape (rows, features + 1) whose last column is all ones.
        Works for any number of rows, not just a single one.
    """
    x = np.asarray(x)
    # Integer ones keep the same dtype promotion as concatenating with [[1]].
    ones = np.ones((x.shape[0], 1), dtype=int)
    return np.concatenate([x, ones], axis=1)
def act(x):
    """Activation function: element-wise hyperbolic tangent of ``x``."""
    activated = np.tanh(x)
    return activated
def actp(x):
    """Derivative of the tanh activation at ``x``: ``1 - tanh(x)**2``."""
    t = np.tanh(x)
    return 1.0 - t ** 2

class Layer(object):
    """One fully connected layer with a bias input and an optional activation.

    The forward pass caches its input, pre-activation and output on the
    instance (``x``, ``z``, ``h``) so that a following ``bp`` call can use
    them; ``bp`` therefore always refers to the most recent ``ff`` call.
    """

    def __init__(self, xdim, ydim, a=act, ap=actp):
        """Create the layer.

        Args:
            xdim: number of input features (the bias is added internally).
            ydim: number of output units.
            a: activation function, or None for a linear layer.
            ap: derivative of the activation, evaluated at the
                pre-activation; None for a linear layer.
        """
        self.a = a
        self.ap = ap
        self.xdim = xdim
        self.ydim = ydim
        # One extra row holds the bias weights; entries are standard normal.
        self.W = np.random.randn(xdim + 1, ydim)

    def ff(self, x):
        """Run the forward pass on ``x`` and return ``self``.

        The output is available as ``self.h`` (so calls can be chained as
        ``layer.ff(x).h``).
        """
        self.x = bias_add(x)
        self.z = self.x.dot(self.W)
        self.h = self.a(self.z) if self.a else self.z
        return self

    def bp(self, deltas, learning_rate=0.1):
        """Back-propagate ``deltas`` and apply one gradient-descent step.

        Args:
            deltas: gradient of the loss with respect to this layer's
                output ``h``, same shape as ``h``.
            learning_rate: step size for the weight update.

        After the call ``self.dx`` holds the gradient with respect to the
        layer's input (bias column removed), ready to be passed to the
        previous layer.
        """
        self.dz = np.multiply(deltas, self.ap(self.z)) if self.ap else deltas
        # x.T.dot(dz) is the outer product for a single row and the sum of
        # per-row outer products otherwise; the former broadcasting multiply
        # was only valid for exactly one row.
        self.grad = self.x.T.dot(self.dz)
        # dx must be computed with the weights used in the forward pass,
        # i.e. before W is updated below.
        self.dx = self.W.dot(self.dz.T).T[:, :-1] # remove bias add

        self.W -= self.grad * learning_rate
    
class NN(object):
    """Feed-forward network made of three ``Layer`` objects.

    The first two layers use Layer's default activation; the output layer
    is linear (no activation and no activation derivative).
    """

    def __init__(self, xdim, hdim, ydim):
        """Build layers mapping xdim -> hdim -> hdim -> ydim."""
        self.l1 = Layer(xdim, hdim)
        self.l2 = Layer(hdim, hdim)
        self.l3 = Layer(hdim, ydim, a=None, ap=None)

    def ff(self, x):
        """Forward ``x`` through the three layers and return the output."""
        signal = x
        for layer in (self.l1, self.l2, self.l3):
            signal = layer.ff(signal).h
        return signal

    def bp(self, deltas, learning_rate=0.1):
        """Back-propagate output ``deltas`` from the last layer to the first.

        Each layer updates its own weights and exposes the gradient for the
        layer before it as ``dx``.
        """
        upstream = deltas
        for layer in (self.l3, self.l2, self.l1):
            layer.bp(upstream, learning_rate)
            upstream = layer.dx


class Agent(object):
    """Epsilon-greedy agent with an action-value network and a value network.

    Experiences are dicts with keys 's1', 'a1', 'r', 's2' and 'a2'. They are
    kept in a bounded replay memory and sampled uniformly for training.
    """

    def __init__(self, features=2, actions=2, eps=.1, learning_rate=0.01, max_memory=5000):
        """Create the agent and its networks.

        Args:
            features: length of a state vector.
            actions: number of discrete actions.
            eps: exploration rate, clamped to [0, 1] by the ``eps`` setter.
            learning_rate: step size used for both networks.
            max_memory: maximum number of stored experiences.
        """
        self.actions = actions
        self.features = features
        self.learning_rate = learning_rate
        self.max_memory = max_memory
        # Store the exploration rate; without this the `eps` argument was
        # ignored and sample_action failed until a caller assigned agent.eps.
        self.eps = eps

        self.avg_r = 0.0 # average reward
        self.A = range(self.actions)
        self.M = [] # memory (s1, a1, r, s2, a2) tuples
        self.init_networks()

    def init_networks(self):
        """(Re)create both networks with freshly initialised weights."""
        self.qnet = NN(xdim=self.features, hdim=20, ydim=self.actions) # action-value prediction network
        self.vnet = NN(xdim=self.features, hdim=20, ydim=1) # value prediction network

    def q_approx(self, state):
        """Return the action-value network's output for ``state``."""
        return self.qnet.ff(state)

    def v_approx(self, state):
        """Return the value network's output for ``state``."""
        return self.vnet.ff(state)

    def advantage(self, state):
        """Return the action values for ``state`` minus its state value."""
        q = self.q_approx(state)
        v = self.v_approx(state)
        return q - v[0, 0]

    def sample_action(self, qs):
        """Pick an action epsilon-greedily from the action values ``qs``.

        ``qs`` is indexed as ``qs[0, action]``. If every value is identical
        the action is drawn uniformly; otherwise the greedy action gets
        probability ``1 - eps + eps/actions`` and every other action
        ``eps/actions``.
        """
        if all(qs[0, 0] == qs[0, :]): # if they're all the same
            return np.random.choice(self.A)

        dist = [self.eps/self.actions] * self.actions
        dist[np.argmax(qs[0])] += 1.0 - self.eps
        return np.random.choice(self.A, p=dist)

    def train(self, iters=1):
        """Train both networks on ``iters`` uniformly sampled experiences.

        Does nothing when the memory is empty, since there is nothing to
        sample from.
        """
        if not self.M:
            return
        for _ in range(iters):
            xp = self.M[np.random.randint(len(self.M))]
            self.train_qnet(xp)
            self.train_vnet(xp)

    def train_qnet(self, xp):
        """Take one gradient step on the action-value network.

        The target for the taken action is ``r + GAMMA * max_a Q(s2, a)``,
        i.e. a Q-learning target (the stored 'a2' is not used); the other
        actions keep their current prediction so their error is zero.
        """
        s2_q = self.q_approx(xp['s2'])
        # s1 must be the last forward pass: back-propagation uses the
        # activations cached by the most recent forward call.
        s1_q = self.q_approx(xp['s1'])

        target = s1_q.copy()
        target[0, xp['a1']] = xp['r'] + GAMMA * np.max(s2_q[0])

        deltas = s1_q - target
        self.qnet.bp(deltas, self.learning_rate)

    def train_vnet(self, xp):
        """Take one gradient step on the value network.

        The target is the one-step estimate ``r + GAMMA * V(s2)``.
        """
        v2 = self.v_approx(xp['s2'])
        # As in train_qnet, s1 is forwarded last so bp sees its activations.
        v1 = self.v_approx(xp['s1'])

        target = xp['r'] + GAMMA * v2

        delta = v1 - target
        self.vnet.bp(delta, self.learning_rate)

    def memorize(self, xp):
        """Store experience ``xp`` and update the running average reward.

        Once the memory holds ``max_memory`` entries, a randomly chosen
        existing entry is overwritten instead of growing the list.
        """
        self.avg_r += 0.001 * (xp['r'] - self.avg_r)
        if len(self.M) >= self.max_memory:
            self.M[np.random.randint(self.max_memory)] = xp
        else:
            self.M.append(xp)

    def visualize_q(self):
        """Plot the action values along the unit circle.

        States are ``[sin(t), cos(t)]`` for 100 values of t in [0, 2*pi], so
        this only works for an agent created with ``features=2``.
        """
        T = np.linspace(0, 2.0 * np.pi, 100)
        q2 = np.array([self.q_approx(np.array([[np.sin(t), np.cos(t)]])) for t in T])
        plt.plot(T, q2[:,0])

    def reset(self):
        """Discard the learned weights and all stored experiences."""
        self.init_networks()
        self.M = []

    @property
    def eps(self):
        """Exploration rate in [0, 1], rounded to two decimals."""
        return self._eps

    @eps.setter
    def eps(self, value):
        # Clamp so that repeated +/- adjustments can never leave [0, 1].
        self._eps = np.round(min(1.0, max(0.0, value)), 2)

class Body(object):
    """The simulated creature: a torso with two motorised legs.

    Limbs are dynamic Box2D bodies stored in ``self.limbs`` and the two
    revolute joints connecting each leg to the torso are in ``self.joints``.
    Positions passed to and returned from this class are screen coordinates
    (pixels).
    """

    def __init__(self):
        """Create the torso, both legs and their joints at the screen centre."""
        c = (SCREEN_WIDTH/2, SCREEN_HEIGHT/2)

        self.limbs = {}
        self.joints = {}

        self.limbs['body'] = self.add_box(position=c, size=(20, 10), angle=0)
        diff = 27  # horizontal offset in pixels of each leg's centre from the torso centre

        self.limbs['leg'] = self.add_box(position=(c[0]+diff, c[1]), size=(30, 2), angle=0)
        self.limbs['leg2'] = self.add_box(position=(c[0]-diff, c[1]), size=(30, 2), angle=0)

        # Joint anchors sit 7 pixels closer to the torso than the leg centres.
        self.joints['leg'] = self.add_joint(self.limbs['body'], self.limbs['leg'], (c[0]+diff-7, c[1]))
        self.joints['leg2'] = self.add_joint(self.limbs['body'], self.limbs['leg2'], (c[0]-diff+7, c[1]))

    def add_box(self, position, size, angle):
        """Create a dynamic box body and return it.

        Args:
            position: centre of the box in screen coordinates (pixels).
            size: box dimensions in pixels; converted to meters via PPM.
            angle: initial body angle passed straight to Box2D.
        """
        box = world.CreateDynamicBody(position=to_pybox2d(position), angle=angle)
        # NOTE(review): all limbs share the negative group index -2, which
        # per Box2D's collision filtering should stop them colliding with
        # each other - confirm.
        box.CreatePolygonFixture(box=(size[0]/PPM, size[1]/PPM), density=1, friction=0.3,
                                 filter=b2Filter(groupIndex=-2))
        return box

    def add_joint(self, a, b, anchor):
        """Join bodies ``a`` and ``b`` with a motorised revolute joint.

        ``anchor`` is given in screen coordinates (pixels). The motor starts
        enabled with speed 0.
        """
        joint = world.CreateRevoluteJoint(bodyA=a, bodyB=b, anchor=to_pybox2d(anchor),
                                    maxMotorTorque = 1000.0,
                                    motorSpeed = 0.0,
                                    enableMotor = True,
                                    )
        return joint

    def position(self):
        """Return the torso position in screen coordinates."""
        return to_screen(self.limbs['body'].position)

    def set_position(self, pos):
        """Translate the whole body so the torso lands on screen point ``pos``.

        Every limb is shifted by the same offset, so their relative layout
        is kept.
        """
        t = to_pybox2d(pos)
        p = self.limbs['body'].position
        diff = (t[0]-p[0], t[1]-p[1])
        for limb in self.limbs.values():
            limb.position = (limb.position[0] + diff[0], limb.position[1] + diff[1])

    def zero_linear_velocity(self):
        """Set the linear velocity of every limb to zero."""
        for limb in self.limbs.values():
            limb.linearVelocity = (0, 0)

    def state(self):
        """Return the limb angles as a (1, 3) array, each wrapped to [0, 2*pi).

        Order is torso, 'leg', 'leg2'.
        """
        # The parentheses matter: `angle % 2.0 * np.pi` parses as
        # `(angle % 2.0) * np.pi`, which is not a wrap to a full turn.
        full_turn = 2.0 * np.pi
        a1 = self.limbs['body'].angle % full_turn
        a2 = self.limbs['leg'].angle % full_turn
        a3 = self.limbs['leg2'].angle % full_turn
        return np.array([[a1, a2, a3]])

    def act(self, action):
        """Apply a discrete action by setting both leg motor speeds.

        Action 0 drives both joints at +10, action 1 at -10; any other
        value leaves the motor speeds unchanged.
        """
        j1 = self.joints['leg']
        j2 = self.joints['leg2']
        speed = 10
        if action == 0:
            j1.motorSpeed = speed
            j2.motorSpeed = speed
        elif action == 1:
            j1.motorSpeed = -speed
            j2.motorSpeed = -speed

# Training script: the outer loop runs one episode per iteration, the inner
# loop runs one agent step (act, simulate, observe, learn, draw) per iteration.
agent = Agent(features=3, actions=2)
body = Body()

agent.eps = .1

speedup = False  # when True, skip printing and rendering to train faster
running = True
while running:

    # Start of an episode: stop the body and move it back to the start point
    # (given in screen coordinates).
    body.zero_linear_velocity()
    body.set_position((SCREEN_WIDTH/2, SCREEN_HEIGHT/2 + 100))

    last_x = body.position()[0]

    s1 = body.state()
    q1 = agent.q_approx(s1)
    a1 = agent.sample_action(q1)

    reset = False
    while not reset:
        for event in pygame.event.get():
            if event.type == QUIT or (event.type == KEYDOWN and event.key == K_ESCAPE):
                running = False
                reset = True
            elif event.type == KEYDOWN:
                # Use the key carried by the event itself. Polling
                # pygame.key.get_pressed() here can miss a quick tap (already
                # released when polled) and reacts to whichever key happens
                # to be held rather than the one that was just pressed.
                if event.key == K_RIGHT:
                    agent.eps += .1
                elif event.key == K_LEFT:
                    agent.eps -= .1
                elif event.key == K_r:
                    agent.reset()
                elif event.key == K_s:
                    speedup = not speedup

        if not running:
            # Quit requested: leave without simulating or training further.
            break

        # build xp tuple
        xp = {'s1': s1.copy(), 'a1': a1}

        # act a1
        body.act(a1)

        # step physics engine
        for ii in range(2):
            world.Step(TIME_STEP, 10, 10)

        # reward: change in the torso's screen x, negated, so moving towards
        # smaller x is rewarded
        reward = body.position()[0] - last_x
        reward *= -1
        xp['r'] = reward

        # state2
        s2 = body.state()
        q2 = agent.q_approx(s2)
        # sample action 2
        a2 = agent.sample_action(q2)

        xp['s2'] = s2.copy()
        xp['a2'] = a2

        agent.memorize(xp)
        agent.train(100)

        if not speedup:
            clear_output(wait=True)
            advantage = agent.advantage(s1)
            # Note: the 'qs' shown are the action values of the new state s2.
            print('s1: {} avg_r: {} eps: {} memories: {}\nqs: {}\nadvantage: {}'\
                  .format(s1, np.round(agent.avg_r, 1), agent.eps, len(agent.M), q2, advantage))
            render()
            pygame.display.flip()
            clock.tick(TARGET_FPS)

        # prepare for next rounds
        s1 = s2
        a1 = a2
        x_now = body.position()[0]
        last_x = x_now

        # End the episode once the torso leaves the screen horizontally.
        if x_now > SCREEN_WIDTH or x_now < 0:
            reset = True

pygame.quit()

s1: [[ 5.29723114  5.70365815  2.77009367]] avg_r: 0.1 eps: 0.1 memories: 4317
qs: [[ 11.90946883  11.24379098]]
advantage: [[ 5.19027306  4.95433985]]
