In [1]:
import tensorflow as tf
import numpy as np
from selenium import webdriver
import time
import os

In [32]:
# TODO: make env parent class
class env2048:
    """Browser-backed environment for the 2048 game, driven through Selenium.

    The environment opens a Firefox window on https://play2048.co, reads the
    board and score from the page's DOM, and sends arrow-key presses to play.

    Attributes:
        driver: the Selenium Firefox WebDriver controlling the page.
        action_map: human-readable name of each action id (0-3).
        score: the score read after the previous move; used to derive rewards.
    """

    # Key codes sent to the page for each action id, in the same order as
    # action_map (0=up, 1=right, 2=down, 3=left).
    _ACTION_KEYS = ((0, u'\ue013'), (1, u'\ue014'), (2, u'\ue015'), (3, u'\ue012'))

    # Tiles carry a CSS class token of the form "tile-position-<x>-<y>" (1-based).
    _POSITION_PREFIX = 'tile-position-'

    def __init__(self):
        """Launch Firefox, load the game page and reset the score memory."""
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(3)
        self.driver.get('https://play2048.co')
        self.action_map = {0:'up', 1:'right', 2:'down', 3:'left'}

        # setup score memory for finding rewards
        self.score = 0

    def normalize(self, data):
        """Scale `data` to unit L2 norm.

        Args:
            data: array-like of numbers.

        Returns:
            A float ndarray with the same shape as `data`. An all-zero input
            is returned as zeros instead of dividing by zero (which would
            produce NaNs).
        """
        data = np.asarray(data, dtype=float)
        norm = np.linalg.norm(data)
        if norm == 0:
            return data
        return data/norm

    @classmethod
    def _tile_index(cls, class_name):
        """Return the board index (4*row + column, 0-based) encoded in a tile's
        CSS class string, or None if the string has no position token
        (e.g. the nested 'tile-inner' divs)."""
        for token in class_name.split():
            if token.startswith(cls._POSITION_PREFIX):
                x, y = token[len(cls._POSITION_PREFIX):].split('-')
                return 4*(int(y) - 1) + (int(x) - 1)
        return None

    def get_state(self, retry_delay=0.05):
        """Read the board from the page and return it as a normalized array.

        A positioned tile with empty text means the page had not finished
        updating, so the whole board is re-read until every tile has a value.

        Args:
            retry_delay: seconds to wait before re-reading an incomplete board.

        Returns:
            ndarray of shape (1, 16): tile values (0 for empty cells) scaled
            to unit L2 norm.
        """
        while True:
            container = self.driver.find_element_by_class_name('tile-container')
            values = [0]*16
            complete = True
            for tile in container.find_elements_by_tag_name('div'):
                index = self._tile_index(tile.get_attribute('class') or '')
                if index is None:
                    continue
                # read the text once: each access is a round-trip to the
                # browser and the value may change between two reads
                text = tile.text
                if text == '':
                    # if here that means page hadn't fully updated
                    complete = False
                    break
                values[index] = int(text)
            if complete:
                return self.normalize(np.array([values]))
            time.sleep(retry_delay)

    # while there is a score-addition tag that could make this more efficient
    # there were some issues where at times the text wouldn't be read properly
    # therefore I just manually calculate score addition for more consistent data
    def get_reward(self):
        """Return the reward for the latest move and remember the new score.

        The reward is the score gained since the previous call divided by 4
        (4 is the lowest change in score possible, so combining two 2's into
        a 4 is worth 1). A move that gained nothing is punished with -1.
        """
        container = self.driver.find_element_by_class_name('score-container')
        # do the split in case the score addition text is in the element
        new_score = int(container.text.split('\n')[0])
        gained = (new_score - self.score)/4
        self.score = new_score
        # punish wasteful moves
        if gained == 0:
            return -1
        return gained

    def get_done(self):
        """Return True when the page's 'game-message' element shows any text."""
        status = self.driver.find_element_by_class_name('game-message')
        return status.text != ""

    def take_action(self, action):
        """Press the arrow key for `action` and observe the result.

        Args:
            action: action id 0-3 (see action_map). Any other value sends no
                key press; the page is still observed after the pause.

        Returns:
            Tuple (state, reward, done) from get_state, get_reward, get_done.
        """
        body = self.driver.find_element_by_tag_name('body')
        # compare with == rather than hashing so any value that compares equal
        # to an action id (e.g. a numpy scalar) is accepted
        for action_id, key in self._ACTION_KEYS:
            if action == action_id:
                body.send_keys(key)
                break

        # give the page time to animate the move before reading it back
        time.sleep(.2)

        return self.get_state(), self.get_reward(), self.get_done()

    def __del__(self):
        """Shut the browser session down when the environment is collected."""
        # __init__ may have failed before the driver was created
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole WebDriver session; close() only closes
            # the current window
            driver.quit()

In [3]:
# TODO: cleanup this to be more reusable

# Small fully-connected network: 16 board cells in, one value per action out.
inputs = tf.keras.Input(shape=(16,))
x = tf.keras.layers.Dense(64, activation = tf.nn.relu)(inputs)
# linear head: the outputs are unbounded action values, not probabilities
predictions = tf.keras.layers.Dense(4, activation = 'linear')(x)

model = tf.keras.Model(inputs = inputs, outputs = predictions)
# The outputs are consumed as Q-values (see get_action), so this is a
# regression problem: use a squared-error loss and an error metric.
# categorical_crossentropy / accuracy assume probability outputs and are not
# meaningful on a linear head.
model.compile(optimizer = tf.train.GradientDescentOptimizer(.05),
              loss = 'mean_squared_error',
              metrics = ['mae'])

In [4]:
# TODO: would epsilon-greedy help? run benchmark when done
def get_action(state):
    """Sample an action from a softmax over the model's predicted action values.

    Args:
        state: input passed straight to `model.predict`; the first row of the
            prediction is used.

    Returns:
        Index of the sampled action, drawn with probability proportional to
        exp(value) of each output.
    """
    q_vals = np.asarray(model.predict(state)[0], dtype=float)
    # Subtract the max before exponentiating: softmax is unchanged by a
    # constant shift, and this keeps exp() from overflowing to inf (which
    # would turn the probabilities into NaN and make the sampling fail).
    exp = np.exp(q_vals - np.max(q_vals))
    softmax_vals = exp/np.sum(exp)
    return np.random.choice(len(softmax_vals), p = softmax_vals)

In [37]:
# Play one short episode and record the transitions.
# NOTE: this cell only collects experience; no weights are updated here.
MAX_STEPS = 10  # upper bound on moves taken in this run

env = env2048()
done = False
# named `step` rather than `iter` so the builtin iter() is not shadowed for
# the rest of the kernel session
step = 0
state = env.get_state()
experiences = []
while not done and step < MAX_STEPS:
    step += 1
    action = get_action(state)

    next_state, reward, done = env.take_action(action)
    # each record is [state, action, next_state, reward, done]
    experience = [state, action, next_state, reward, done]
    experiences.append(experience)
    state = next_state