# In [None]:
import numpy as np

import pygame
from pygame.locals import (QUIT, KEYDOWN, K_ESCAPE, MOUSEBUTTONDOWN, K_RIGHT, K_LEFT, K_r, K_s)

import Box2D
from Box2D.b2 import (world, polygonShape, staticBody, dynamicBody, circleShape, fixtureDef)

from IPython.display import clear_output


# Reinforcement-learning hyperparameters
GAMMA = 0.98  # discount rate applied to the next state's value estimate

# Simulation and display settings
PPM = 10.0  # pixels per meter
TARGET_FPS = 60
TIME_STEP = 1.0 / TARGET_FPS  # simulated seconds advanced per world.Step call
SCREEN_WIDTH = 600
SCREEN_HEIGHT = 280

# RGBA fill color for drawing, looked up by Box2D body type
colors = {}
colors[staticBody] = (128, 128, 128, 255)
colors[dynamicBody] = (255, 255, 255, 255)

# Window and frame-rate limiter
screen = pygame.display.set_mode((SCREEN_WIDTH, SCREEN_HEIGHT), 0, 32)
pygame.display.set_caption('Stumbly')
clock = pygame.time.Clock()

# NOTE: this rebinds the imported `world` class name to the world instance,
# so the class itself is no longer reachable under that name afterwards.
world = world(gravity=(0, -20), doSleep=False)

# Static floor at the world origin, 1.5x the screen width (in meters) across
floor = world.CreateStaticBody(
    shapes=polygonShape(box=(SCREEN_WIDTH/PPM * 1.5, 1)),
    position=(0, 0),
)

def to_pybox2d(p):
    """Convert a point from screen pixels to Box2D world units.

    Both coordinates are divided by PPM, and the y axis is flipped about
    SCREEN_HEIGHT. Returns a two-element list ``[x, y]``.
    """
    x_px, y_px = p[0], p[1]
    flipped_y = SCREEN_HEIGHT - y_px
    return [x_px / PPM, flipped_y / PPM]

def to_screen(p):
    """Convert a point from Box2D world units to screen pixels.

    Inverse of ``to_pybox2d``: both coordinates are scaled by PPM and the
    y axis is flipped about SCREEN_HEIGHT. Returns a two-element list.
    """
    world_x, world_y = p[0], p[1]
    screen_x = world_x * PPM
    screen_y = SCREEN_HEIGHT - world_y * PPM
    return [screen_x, screen_y]

def render():
    """Clear the screen and draw every polygon/circle fixture in the world.

    Fixtures are filled with the color registered in ``colors`` for their
    body's type. Shapes other than polygons and circles are skipped. The
    caller is responsible for flipping the display afterwards.
    """
    screen.fill((0, 0, 0))
    for body in world.bodies:
        for fixture in body.fixtures:
            shape = fixture.shape
            if isinstance(shape, polygonShape):
                # Local vertices -> world coordinates -> pixels, then flip y.
                vertices = [(body.transform * v) * PPM for v in shape.vertices]
                vertices = [(v[0], SCREEN_HEIGHT - v[1]) for v in vertices]
                pygame.draw.polygon(screen, colors[body.type], vertices)
            elif isinstance(shape, circleShape):
                # A circle fixture may be offset from the body origin, so
                # transform its local centre rather than using body.position.
                center = to_screen(body.transform * shape.pos)
                radius = int(shape.radius * PPM)
                pygame.draw.circle(
                    screen,
                    colors[body.type],
                    (int(center[0]), int(center[1])),
                    radius,
                )

    
# Two layer neural network
class NN(object):
    """Fully connected network with one tanh hidden layer and a tanh output.

    Inputs are 2-D arrays of shape (batch, xdim); outputs have shape
    (batch, ydim). A constant-1 column is appended to the input and to the
    hidden activations, so the last row of each weight matrix acts as the bias.

    ``ff`` caches its intermediate values on the instance and ``bp`` reads
    them, so ``bp`` always updates with respect to the most recent ``ff`` call.
    """

    def __init__(self, xdim, hdim, ydim):
        # Standard-normal draws scaled by 0.2 and shifted by -0.1.
        # NOTE(review): the shift gives the weights a mean of -0.1; a uniform
        # draw in [-0.1, 0.1) via np.random.rand may have been intended.
        self.W1 = np.random.randn(xdim + 1, hdim) * .2 - .1
        self.W2 = np.random.randn(hdim + 1, ydim) * .2 - .1

    def act(self, x):
        """Activation function (tanh)."""
        return np.tanh(x)

    def actp(self, x):
        """Derivative of the activation, evaluated at pre-activation ``x``."""
        return 1.0 - np.tanh(x)**2

    @staticmethod
    def bias_add(x):
        """Return ``x`` with a column of ones appended (one per row)."""
        x = np.asarray(x)
        # Integer ones keep the same dtype promotion as appending [[1]].
        ones = np.ones((x.shape[0], 1), dtype=int)
        return np.concatenate([x, ones], axis=1)

    def ff(self, x):
        """Run a forward pass on ``x`` (batch, xdim) and return the output."""
        self.x = NN.bias_add(x)

        self.z1 = self.x.dot(self.W1)
        self.h1 = NN.bias_add(self.act(self.z1))

        self.z2 = self.h1.dot(self.W2)
        self.h2 = self.act(self.z2)

        return self.h2

    def bp(self, deltas, learning_rate=0.1):
        """Backpropagate output errors and take one gradient-descent step.

        ``deltas`` is the error at the output activations, shape
        (batch, ydim), for the inputs of the last ``ff`` call. Gradients are
        summed over the batch rows. Weights are updated in place.
        """
        self.dz2 = np.multiply(deltas, self.actp(self.z2))
        # Propagate through W2 before it is updated; drop the bias column.
        self.dh1 = self.W2.dot(self.dz2.T).T[:, :-1]
        self.dz1 = np.multiply(self.dh1, self.actp(self.z1))

        # Matrix products give the outer product for a single row and the
        # sum of per-row outer products for larger batches.
        self.grad_W2 = self.h1.T.dot(self.dz2)
        self.grad_W1 = self.x.T.dot(self.dz1)

        self.W2 -= self.grad_W2 * learning_rate
        self.W1 -= self.grad_W1 * learning_rate


class Agent(object):
    """Epsilon-greedy agent that learns action values with a small network.

    The network maps a state to one value per action. Experience is stored
    as dicts with keys 's1', 'a1', 'r', 's2', 'a2' in a bounded memory and
    replayed at random during training.
    """

    def __init__(self, actions=2, eps=.1, max_memory=10000):
        self.actions = actions
        self.max_memory = max_memory
        self.eps = eps  # stored via the clamping/rounding setter below

        self.A = list(range(self.actions))  # action indices to sample from
        self.M = []  # memory of experience dicts (s1, a1, r, s2, a2)
        self.init_network()

    def init_network(self):
        """Create a fresh, randomly initialised value network."""
        self.nn = NN(xdim=1, hdim=4, ydim=self.actions)

    def observe(self, state):
        """Return the network's action values for ``state``.

        This runs a forward pass, so the network's cached activations now
        refer to ``state``.
        """
        self.nn.ff(state)
        return self.nn.h2

    def sample_action(self, qs):
        """Pick an action index epsilon-greedily from action values ``qs``.

        ``qs`` has shape (1, actions). If all values are equal the choice is
        uniform; otherwise the best action gets probability
        ``1 - eps + eps/actions`` and every other action ``eps/actions``.
        """
        if all(qs[0, 0] == qs[0, :]):  # no preference: break the tie randomly
            return np.random.choice(self.A)

        dist = [self.eps/self.actions] * self.actions
        dist[np.argmax(qs[0])] += 1.0 - self.eps
        return np.random.choice(self.A, p=dist)

    def train(self, iters=1):
        """Run ``iters`` updates, each on one randomly replayed experience.

        Does nothing when the memory is empty.
        """
        if not self.M:
            return

        for _ in range(iters):
            xp = self.M[np.random.randint(len(self.M))]

            # Q-learning target: bootstrap from the best next-state value.
            # (The stored 'a2' is not used here; using s2_q[0, xp['a2']]
            # instead would make this SARSA.)
            # s1 is evaluated last so the network's cached activations
            # belong to s1 when bp runs.
            s2_q = self.observe(xp['s2'])
            s1_q = self.observe(xp['s1'])

            target = s1_q.copy()
            target[0, xp['a1']] = xp['r'] + GAMMA * np.max(s2_q[0])

            # Only the taken action has a non-zero error.
            deltas = s1_q - target
            self.nn.bp(deltas, learning_rate=.01)

    def memorize(self, xp):
        """Store an experience, overwriting a random slot once full."""
        if len(self.M) >= self.max_memory:
            self.M[np.random.randint(self.max_memory)] = xp
        else:
            self.M.append(xp)

    def reset(self):
        """Discard the learned network and all stored experience."""
        self.init_network()
        self.M = []

    @property
    def eps(self):
        """Exploration rate, kept in [0, 1] and rounded to two decimals."""
        return self._eps

    @eps.setter
    def eps(self, value):
        self._eps = np.round(min(1.0, max(0.0, value)), 2)

class Body(object):
    """The simulated creature: a long 'body' box with one motorised 'leg'.

    Limbs are Box2D dynamic bodies stored in ``self.limbs``; the joints that
    connect them are stored in ``self.joints``. Positions passed to and
    returned from the public methods are in screen pixels.
    """

    # Motor speed applied to the leg joint for each action index.
    _MOTOR_SPEEDS = {0: -10.0, 1: 10.0, 2: 0.0}

    def __init__(self):
        # add_box expects screen pixels and converts to world units itself,
        # so the centre must NOT be converted here as well.
        center_px = (SCREEN_WIDTH/2, SCREEN_HEIGHT/2)

        self.limbs = {}
        self.joints = {}

        self.limbs['body'] = self.add_box(position=center_px, size=(60, 4), angle=0)
        self.limbs['leg'] = self.add_box(position=center_px, size=(10, 2), angle=0)

        # Hinge the leg to the body, one world unit to the right of the
        # body's centre, with a motor that starts out stopped.
        wc = self.limbs['body'].worldCenter
        self.joints['leg'] = world.CreateRevoluteJoint(
            bodyA=self.limbs['body'],
            bodyB=self.limbs['leg'],
            anchor=(wc[0]+1, wc[1]),
            maxMotorTorque=1000.0,
            motorSpeed=0.0,
            enableMotor=True,
        )

    def add_box(self, position, size, angle):
        """Create a dynamic box body and return it.

        ``position`` is in screen pixels; ``size`` is in pixels and is
        divided by PPM before being passed as the fixture's ``box`` argument.
        """
        box = world.CreateDynamicBody(position=to_pybox2d(position), angle=angle)
        box.CreatePolygonFixture(box=(size[0]/PPM, size[1]/PPM), density=1, friction=0.3)
        return box

    def position(self):
        """Return the 'body' limb's position in screen pixels."""
        return to_screen(self.limbs['body'].position)

    def set_position(self, pos):
        """Translate all limbs so that 'body' sits at ``pos`` (screen pixels).

        Every limb is shifted by the same offset, so their relative layout is
        preserved. Angles and velocities are left unchanged.
        """
        t = to_pybox2d(pos)
        p = self.limbs['body'].position
        diff = (t[0]-p[0], t[1]-p[1])
        for limb in self.limbs.values():
            limb.position = (limb.position[0] + diff[0], limb.position[1] + diff[1])

    def zero_linear_velocity(self):
        """Set the linear velocity of every limb to zero."""
        for limb in self.limbs.values():
            limb.linearVelocity = (0, 0)

    def state(self):
        """Return the observation: the leg's angle as a (1, 1) array."""
        return np.array([[self.limbs['leg'].angle]])

    def act(self, action):
        """Apply an action by setting the leg motor speed.

        0 drives the motor at -10, 1 at +10, and 2 stops it. Any other value
        leaves the motor speed unchanged.
        """
        speed = self._MOTOR_SPEEDS.get(action)
        if speed is not None:
            self.joints['leg'].motorSpeed = speed

agent = Agent(actions=3)
body = Body()

agent.eps = .1

# Controls: RIGHT/LEFT raise/lower eps, R resets the agent, S toggles
# rendering (fast mode), ESC or closing the window quits.
speedup = False
running = True
try:
    while running:
        # Start of an episode: put the creature back at the screen centre.
        body.zero_linear_velocity()
        body.set_position((SCREEN_WIDTH/2, SCREEN_HEIGHT/2))

        last_x = body.position()[0]

        state = body.state()
        a1 = agent.sample_action(agent.observe(state))

        reset = False
        while not reset:
            for event in pygame.event.get():
                if event.type == QUIT or (event.type == KEYDOWN and event.key == K_ESCAPE):
                    running = False
                    reset = True
                elif event.type == KEYDOWN:
                    # Use the key carried by the event itself, not whatever
                    # happens to be held down at this moment.
                    if event.key == K_RIGHT:
                        agent.eps += .1
                    elif event.key == K_LEFT:
                        agent.eps -= .1
                    elif event.key == K_r:
                        agent.reset()
                    elif event.key == K_s:
                        speedup = not speedup

            if not running:
                break

            # Record the state and action *before* the physics step ...
            xp = {'s1': state, 'a1': a1}
            body.act(a1)

            # ... then advance the simulation, so that the reward and the
            # next state are the consequence of this action.
            world.Step(TIME_STEP, 10, 10)

            # Reward is the horizontal distance (pixels) moved this step.
            x = body.position()[0]
            xp['r'] = x - last_x
            last_x = x

            # Observe the resulting state and choose the next action.
            state = body.state()
            a2 = agent.sample_action(agent.observe(state))
            xp['s2'] = state
            xp['a2'] = a2

            agent.memorize(xp)
            agent.train(1)

            a1 = a2

            # End the episode once the creature leaves the screen.
            if x > SCREEN_WIDTH or x < 0:
                reset = True

            clear_output(wait=True)
            qs = agent.observe(state)
            print('eps: {} memories: {} qs: {}'.format(agent.eps, len(agent.M), qs))

            if not speedup:
                render()
                pygame.display.flip()
                clock.tick(TARGET_FPS)
finally:
    # Close the window even if the loop is interrupted or raises.
    pygame.quit()

# Output: eps: 0.1 memories: 10000 qs: [[ 0.94860121  0.82269185  0.86414786]]
