# Intro

The plan is to have a player blob (blue), which aims to navigate its way as quickly as possible to the food blob (green), while avoiding the enemy blob (red). Now, we could make this super smooth with high definition, but we already know we're going to be breaking it down into observation spaces. Instead, let's just start in a discrete space. Something between a 10x10 and 20x20 should suffice. Do note, the larger you go, the larger your Q-Table will be in terms of space it takes up in memory as well as time it takes for the model to actually learn. So, our environment will be a 20 x 20 grid, where we have 1 player, 1 enemy, and 1 food. For now, we'll just have the player able to move, in attempt to reach the food, which will yield a reward.

# Requirements

In [1]:
import numpy as np
from PIL import Image  # for creating visual env
import cv2  # for showing our visual live
import matplotlib.pyplot as plt
import pickle  # to save/load Q-Tables
from matplotlib import style  # to make pretty charts.
import time  # using this to keep track of our saved Q-Tables.

# Environment size, constants and variables
A 10x10 Q-Table for example, in this case, is ~15MB. A 20x20 is ~195MB

In [2]:
style.use('ggplot')
SIZE = 10
HM_EPISODES = 25000
MOVE_PENALTY = 1
ENEMY_PENALTY = 300
FOOD_REWARD = 25
epsilon = 0.9
EPS_DECAY = 0.9998
SHOW_EVERY = 3000
# In case you have a q table, load here (filename)
start_q_table = None
LEARNING_RATE = 0.1
DISCOUNT = 0.95
# key in dict
PLAYER_N = 1
FOOD_N = 2
ENEMY_N = 3
# Dict for colors BGR
d = {1: (255, 175, 0),
     2: (0,255, 0),
     3: (0, 0, 255)}

# Blob

In [3]:
class Blob:
    def __init__(self):
        self.x = np.random.randint(0, SIZE)
        self.y = np.random.randint(0, SIZE)
    def __str__(self):
        return f'{self.x}, {self.y}'
    def __sub__(self, other):
        return (self.x - other.x, self.y - other.y)
    def action(self, choice):
        if choice == 0:
           self.move(x=1, y=1)
        elif choice == 1:
            self.move(x=-1, y=-1)
        elif choice == 2:
            self.move(x=-1, y=1)
        elif choice == 3:
            self.move(x=1, y=-1)
    def move(self, x=False, y=False):
        if not x:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x
        if not y:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y
        
        if self.x < 0:
            self.x = 0
        elif self.x > SIZE-1:
            self.x = SIZE-1
        if self.y < 0:
            self.y = 0
        elif self.y > SIZE-1:
            self.y = SIZE-1

# Q table

In [4]:
if start_q_table is None:
    q_table = {}
    # (x1, y1), (x2, y2)
    for x1 in range(-SIZE+1, SIZE):
        for y1 in range(-SIZE+1, SIZE):
            for x2 in range(-SIZE+1, SIZE):
                for y2 in range(-SIZE+1, SIZE):
                    q_table[((x1, y1),(x2,y2))] = [np.random.uniform(-5, 0) for i in range(4)]
else:
    with open(start_q_table, 'rb') as f:
        q_table = pickle.load(f)

episode_rewards = []
for episode in range(HM_EPISODES):
    player = Blob()
    food = Blob()
    enemy = Blob()
    if episode % SHOW_EVERY == 0:
        print(f'on # {episode}, epsilon: {epsilon}')
        print(f"{SHOW_EVERY} ep mean: {np.mean(episode_rewards[-SHOW_EVERY:])}")
        show = True
    else:
        show = False
    # frames of the episode
    episode_reward = 0
    for i in range(200):
        obs = (player-food, player-enemy)
        if np.random.random() > epsilon:
            action = np.argmax(q_table[obs])
        else:
            action = np.random.randint(0, 4)
            
        player.action(action)
        '''
        MAYBE
        enemy.move()
        food.move()
        '''

on # 0, epsilon: 0.9
3000 ep mean: nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


on # 3000, epsilon: 0.9
3000 ep mean: nan
on # 6000, epsilon: 0.9
3000 ep mean: nan
on # 9000, epsilon: 0.9
3000 ep mean: nan
on # 12000, epsilon: 0.9
3000 ep mean: nan
on # 15000, epsilon: 0.9
3000 ep mean: nan
on # 18000, epsilon: 0.9
3000 ep mean: nan
on # 21000, epsilon: 0.9
3000 ep mean: nan
on # 24000, epsilon: 0.9
3000 ep mean: nan


# Rewarding

In [5]:
if player.x == enemy.x and player.y == enemy.y:
    reward =- ENEMY_PENALTY
elif player.x == food.x and player.y == food.y:
    reward = FOOD_REWARD
else:
    reward = -MOVE_PENALTY

# Q values and information

In [6]:
new_obs = (player-food, player-enemy)
# Max Q value for new obs
max_future_q = np.max(q_table[new_obs])
# Current Q for the chosen action
current_q = np.max(q_table[obs][action])
if reward == FOOD_REWARD:
    new_q = FOOD_REWARD
elif reward == -ENEMY_PENALTY:
    new_q = - ENEMY_PENALTY
else:
    new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
q_table[obs][action] =  new_q

# Displaying the environment

In [None]:
while True:
    if show:
        env = np.zeros((SIZE, SIZE, 3), dtype=np.uint8)
        # Setting food, player and enemy tile ro their corresponding colors
        env[food.x][food.y] = d[FOOD_N]
        env[player.x][player.y] = d[PLAYER_N]
        env[enemy.x][enemy.y] = d[ENEMY_N]
        # Reading to RGB
        img = Image.fromarray(env, 'RGB')
        img = img.resize(300, 300)
        cv2.imshow('', np.array(img))
        if reward == FOOD_REWARD or reward == -ENEMY_PENALTY:
            if cv2.waitKey(500) & 0xFF == ord('q'):
                break
        else:
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

# Handling rewards

In [None]:
episode_rewards += reward
while True:
    if reward == FOOD_REWARD or reward == -ENEMY_PENALTY:
        break
episode_rewards.append(episode_rewards)
epsilon *= EPS_DECAY

# Graphs and savings


In [None]:
moving_avg = np.convolve(episode_reward, np.ones((SHOW_EVERY,)) / SHOW_EVERY, mode='valid')
# You could also do it with chunks
plt.plot([i for i in range(len(moving_avg))], moving_avg)
plt.ylabel(f"Reward {SHOW_EVERY}ma")
plt.xlabel("episode #")
plt.show()

with open(f"qtable-{int(time.time())}.pickle", "wb") as f:
    pickle.dump(q_table, f)