In [1]:
import numpy as np
import pygame as pg
from itertools import count, product
from tqdm import tqdm


pygame 2.1.0 (SDL 2.0.16, Python 3.8.10)
Hello from the pygame community. https://www.pygame.org/contribute.html


### Define Constants

In [2]:
"""pong table dimensions"""
WIDTH = HEIGHT = 1

"""pong peddles dimensions"""
P_W = 0.2
P_H = 0.02

"""pong peddles y positions"""
Y0 = 0.9
Y1 = 0.1

"""ball attributes"""
BALL_R = 0.02
BALL_VY = 1
BALL_VX = 0

"""state vector indices"""
X0 = 0  # x position of peddle 0
X1 = 1  # x position of peddle 1
X_B = 2  # x position of ball
Y_B = 3  # y position of ball
VX_B = 4  # vx of ball
VY_B = 5  # vy of ball

dt = 0.01

"""pong gui constants"""
SCALE = 500
PAD = int(0.05 * WIDTH * SCALE)
PG_W, PG_H = WIDTH * SCALE, HEIGHT * SCALE
PED_W, PED_H = int(P_W * SCALE), int(P_H * SCALE)

FPS = 20
BG_COLOR = pg.Color(50, 50, 50)
BORDER_COLOR = pg.Color(220, 220, 220)
BALL_COLOR = pg.Color(200, 70, 70)
PEDDLE_COLOR = pg.Color(240, 240, 240)

### Pong Transition Function

In [3]:
def pong_transition(s, a):
    """
    given state and action vectors, return next state vector and reward
    state vector is <x_{p0}, x_{p1}, x_{ball}, y_{ball}, v_x_{ball}, v_y_{ball}>
    action_vector is <v_x_{p0}, v_x_{p1}}>
    next state vector is <x_{p0} + v_x_{p0}dt, x_{p1} + v_x_{p1}dt, x_{ball} + v_x_{ball}dt, y_{ball} + v_y_{ball}dt, v_x_{ball}_{new}, v_y_{ball}_{new}>
    """

    # get the peddles next positions
    # if action takes peddle off the screen, effective action (peddle velocity) is 0
    s_p = np.copy(s)
    p_trans = s[: X_B] + a * dt
    a[(p_trans < P_W/2) | (p_trans > WIDTH-P_W/2)] = 0
    s_p[: X_B] += a * dt
    
    r = np.zeros(2)
    terminal = False
    # if ball touches either peddle, reverse ball y velocity, and add peddle x velocity to ball x velocity
    if s[Y_B] - BALL_R <= Y1:
        # if ball is as high as the top peddle
        if s[X1] - P_W/2 <= s[X_B] <= s[X1] + P_W/2:
            # if ball is on top peddle, 
            # flip y velocity, and add peddle x velocity to ball x velocity
            s_p[VY_B] *= -1
            s_p[VX_B] += a[1]
        else:
            r[1] = -1
            terminal = True
            
    elif s[Y_B] + BALL_R >= Y0:
        # if ball is as high as the top peddle
        if s[X0] - P_W/2 <= s[X_B] <= s[X0] + P_W/2:
            # if ball is on top peddle, 
            # flip y velocity, and add peddle x velocity to ball x velocity
            s_p[VY_B] *= -1
            s_p[VX_B] += 3 * a[0]
        else:
            r[0] = -1
            terminal = True

    # if ball touches sides, reverse ball x velocity
    if s[X_B] <= BALL_R or s[X_B] >= 1-BALL_R:
        s_p[VX_B] *= -1
        
    # transition ball according to its velocity
    s_p[X_B: VX_B] += s_p[VX_B: ] * dt

    return s_p, r, terminal
    

### Pong Gui

In [4]:
class PongGui:

    def __init__(self):
        self.screen, self.bg = self.init()
        
    def init(self):
        pg.init()  # initialize pygame
        screen = pg.display.set_mode((WIDTH * SCALE + 2 * PAD, HEIGHT * SCALE + 2 * PAD))  # set up the screen
        pg.display.set_caption("Mohamed Martini")  # add a caption
        bg = pg.Surface(screen.get_size())  # get a background surface
        bg = bg.convert()
        bg.fill(BG_COLOR)
        screen.blit(bg, (0, 0))
        return screen, bg

    def render(self):
        """show the grid array on the screen"""
        pg.display.flip()
        pg.display.update()
    
    def draw_table(self):
        pg.draw.rect(self.screen, BORDER_COLOR, (PAD//2, PAD//2, PG_W + PAD, PG_H + PAD), PAD)
        pg.draw.line(self.screen, BORDER_COLOR, (0, PAD//2), (PG_W + 2 * PAD - 2, PAD//2), width=PAD)
        pg.draw.line(self.screen, BORDER_COLOR, (0, PG_H + 3 * PAD / 2), (PG_W + 2 * PAD - 2, PG_H + 3 * PAD / 2), width=PAD)
        
        pg.draw.line(self.screen, BORDER_COLOR, (0, int((PG_H + 2 * PAD) / 2)), 
                     (int(PG_W + 2 * PAD)-5, int((PG_H + 2 * PAD) / 2)), width=5)
        
    
    def draw_state(self, s):
        for center_x, center_y in zip(s[X0: X0 + 2], [Y0, Y1]):
            center_x = int(center_x * SCALE + PAD)
            center_y = int(center_y * SCALE + PAD)
            pg.draw.rect(self.screen, PEDDLE_COLOR, 
                         (center_x - int(PED_W / 2),
                         center_y - int(PED_H / 2),
                          int(PED_W),
                          int(PED_H))
                         )
        
        circle_center = s[X_B: VX_B] * SCALE + PAD
        pg.draw.circle(self.screen, BALL_COLOR, 
                       circle_center.astype(int),
                       int(BALL_R * SCALE), 
                       width=int(BALL_R * SCALE))
    
    def reset_screen(self):
        self.screen.fill(BG_COLOR)
        self.draw_table()
    
    def play(self, theta0=None, theta1=None):
        """receive a list of positions on the x axis, and plot the movement of the screen"""
        s = get_s0()
        if theta0 is None:
            theta0 = np.zeros(NUM_FEATURES * NUM_ACTIONS)
        if theta1 is None:
            theta1 = np.zeros(NUM_FEATURES * NUM_ACTIONS)
        
        clock = pg.time.Clock()
        run = True
        while run:
            clock.tick(FPS)
            for event in pg.event.get():
                if event.type == pg.QUIT:
                    run = False
            self.reset_screen()
            self.draw_table()
            self.draw_state(s)
            
            self.render()
            
            a0, _ = get_action(s, theta0)
            a1 = np.random.choice(range(3))  # get_action(s, theta1)
            s, r, terminal = pong_transition(transform(s, direction=0), np.array((A[a0], A[a1])))
            s = transform(s)
            if terminal:
                s = get_s0()
        pg.quit()


### Actor Critic

#### Helper Functions:

In [5]:
def x_s(s: np.array):
    """return x(s) as fourier basis of state"""
    x = np.zeros(NUM_FEATURES)
    for i, c in enumerate(product(range(D + 1), repeat=K)):
        c = np.array(c)
        x[i] = np.cos(np.pi * s.T @ c)
    return x

def x_sa(s: np.array, a: int):
    """return x(s, a) as fourier basis of state, shifted according to the action index"""
    x = np.zeros(NUM_FEATURES * NUM_ACTIONS)
    start = NUM_FEATURES * a
    end = start + NUM_FEATURES
    x[start: end] = x_s(s)
    return x
    
    
def h_s(s: np.array, theta: np.array):
    """return actions' preferences in state s"""
    h = np.zeros(NUM_ACTIONS)
    for a in range(NUM_ACTIONS):
        h[a] = theta @ x_sa(s, a)
    return h

def pi_s(s: np.array, theta: np.array):
    """return policy at state s"""
    h = h_s(s, theta)
    exp = np.exp(h - np.max(h))
    return exp / np.sum(exp)

def v_s(s: np.array, w: np.array):
    """return the value of a state given the weights vector"""
    return w @ x_s(s)

def get_action(s, theta):
    """return index of action at state s according to weights theta"""
    policy = pi_s(s, theta)
    return np.random.choice(range(NUM_ACTIONS), p=policy), policy

def get_pi_gradient(s, a, policy):
    """compute gradient ln pi(a|s, theta), which equals x(s,a) = \sum_b \pi(b|s, theta) x(s,b)"""
    x = x_sa(s, a)
    summation = 0
    for i in range(NUM_ACTIONS):
        summation += policy[i] * x_sa(s, i)
    return x - summation

def get_s0():
    s = np.zeros(K)
    s[X0: VX_B] = 0.5
    s[VX_B] = np.random.uniform(0.45, 0.55)
    s[VY_B] =  1
    return s

def transform(s, direction=1):
    dirs = ((0, 1), (-7, 7))
    _s = np.copy(s)
    _s[VX_B: ] = np.interp(s[VX_B: ], dirs[direction], dirs[1 - direction])
    return _s

#### Algorithm

In [6]:

# def actor_critic_et(num_episodes):
#     gamma = 1
#     theta0 = np.zeros(NUM_ACTIONS * NUM_FEATURES)  # theta for each action
#     theta1 = np.zeros(NUM_ACTIONS * NUM_FEATURES)  # theta for each action
    
#     W0 = np.zeros(NUM_FEATURES)  # weights for estimating v_s
#     W1 = np.zeros(NUM_FEATURES)
    
#     lambda_w = 0.8
#     lambda_theta = 0.8
    
#     alpha_w = 1e-3
#     alpha_theta = 1e-3
    
#     steps_per_e = np.zeros(num_episodes)
    
#     for episode in range(num_episodes):
#         # initialize s
#         s = np.random.uniform(0.4, 0.6, size=K)
#         s0 = np.copy(s)
#         s1 = np.copy(s)
#         s1[0], s1[1] = s1[1], s1[0]

#         # reset z vectors
#         z_theta0 = np.zeros_like(theta0)
#         z_w0 = np.zeros_like(W0)
#         z_theta1 = np.zeros_like(theta1)
#         z_w1 = np.zeros_like(W1)

#         # reset gamma multiplier
#         I = 1
        
#         score = np.zeros(2)
        
#         # loop through episode
#         for t in count():
#             # select action
#             a0, policy0 = get_action(s0, theta0)
#             a1, policy1 = get_action(s1, theta1)
            
#             # take action, observe reward and next state
#             a = np.array([A[a0], A[a1]])
#             trans = transform(s0, start=(0, 1), end=(-1, 1))
#             s_p0, r, terminal = pong_transition(trans, a)
#             s_p0 = transform(s_p0, start=(-1, 1), end=(0, 1))
            
#             s_p1 = np.copy(s_p0)
#             s_p1[0], s_p1[1] = s_p1[1], s_p1[0]
#             score += r
            
#             # calculate the error (delta) - account for terminal state
#             if terminal:
#                 v_sp0, v_sp1 = 0, 0
#             else:
#                 v_sp0, v_sp1 = v_s(s_p0, W0), v_s(s_p1, W1)
            
#             delta0 = r[0] + gamma * v_sp0  - v_s(s0, W0)
#             delta1 = r[1] + gamma * v_sp1  - v_s(s1, W1)
            
#             # update z_w
#             z_w0 = gamma * lambda_w * z_w0 + x_s(s0)
#             z_w1 = gamma * lambda_w * z_w1 + x_s(s1)
            
#             # update z_theta
#             gradient0 = get_pi_gradient(s0, a0, policy0)
#             z_theta0 = gamma * lambda_theta * z_theta0 + I * gradient0
#             gradient1 = get_pi_gradient(s1, a1, policy1)
#             z_theta1 = gamma * lambda_theta * z_theta1 + I * gradient1
            
#             # update w
#             W0 += alpha_w * delta0 * z_w0
#             W1 += alpha_w * delta1 * z_w1
            
#             # update theta
#             theta0 += alpha_theta * delta0 * z_theta0
#             theta1 += alpha_theta * delta1 * z_theta1
            
#             if terminal:
#                 print(episode, t)
# #                 print(theta0)
#                 break
            
#             I *= gamma
#             s0 = s_p0
#             s1 = s_p1
        
#         steps_per_e[episode] = t
            
#     return theta0, theta1, steps_per_e

def actor_critic_et(num_episodes):
    gamma = 0.99
    theta0 = np.zeros(NUM_ACTIONS * NUM_FEATURES)  # theta for each action
    theta1 = np.zeros(NUM_ACTIONS * NUM_FEATURES)
    
    W = np.zeros(NUM_FEATURES)  # weights for estimating v_s
    
    lambda_w = 0.8
    lambda_theta = 0.8
    
    alpha_w = 1e-3
    alpha_theta = 1e-4
    
    steps_per_e = np.zeros(num_episodes)
    
    for episode in tqdm(range(num_episodes)):
        # initialize s
        s = get_s0()

        # reset z vectors
        z_theta = np.zeros_like(theta0)
        z_w = np.zeros_like(W)

        # reset gamma multiplier
        I = 1
        
        score = np.zeros(2)
        
        # loop through episode
        for t in count():
            # select action
            a0, policy0 = get_action(s, theta0)
            a1 = np.random.choice(range(3))
            
            # take action, observe reward and next state
            s_sim = transform(s, direction=0)
            a_sim = np.array([A[a0], A[a1]])
            s_p_sim, r, terminal = pong_transition(s_sim, a_sim)
            s_p = transform(s_p_sim)
            score += r
            
            # calculate the error (delta) - account for terminal state
            if terminal:
                v_sp = 0
            else:
                v_sp = v_s(s_p, W)
            
            delta = r[0] + gamma * v_sp  - v_s(s, W)
            
            # update z_w
            z_w = gamma * lambda_w * z_w + x_s(s)
            
            # update z_theta
            gradient = get_pi_gradient(s, a0, policy0)
            z_theta = gamma * lambda_theta * z_theta + I * gradient
            
            # update w
            W += alpha_w * delta * z_w
            
            # update theta
            theta0 += alpha_theta * delta * z_theta
            
            if terminal or t > 10_000:
                break
            
            I *= gamma
            s = s_p
        
        steps_per_e[episode] = t
            
    return theta0, theta1, steps_per_e, score
            

In [7]:
BASIS = dict()  # to store the used fourier basis
A = np.array([-1, 0, 1])
NUM_ACTIONS = 3
K = 6
D = 3
NUM_FEATURES = (D + 1) ** K

runs = 1
num_episodes = 1000


for run in range(runs):
    theta0, theta1, steps_per_e, score = actor_critic_et(num_episodes)
    np.save("theta0", theta0)

# print("Score:", score)

  return w @ x_s(s)
  delta = r[0] + gamma * v_sp  - v_s(s, W)
 88%|█████████████████████████████████▌    | 883/1000 [1:11:24<09:27,  4.85s/it]


ValueError: probabilities contain NaN

In [8]:
PongGui().play(theta0, theta1)
# PongGui().play()

NameError: name 'theta0' is not defined