In [None]:
import glob
import os
import sys
import random
import time
import numpy as np
import cv2
import math
from collections import deque

import tensorflow as tf
import tensorflow.keras.backend as backend
from tensorflow.python.keras.backend import set_session
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard

from PIL import Image

from threading import Thread


from tqdm import tqdm

In [None]:
# --- replay buffer / training ---
TRAIN_SET_SIZE = 50_000        # maxlen of the agent's transition deque
MIN_TRAIN_SET_SIZE = 200       # train() is a no-op until this many transitions are stored
MINIBATCH_SIZE = 16            # transitions sampled per train() call
TRAIN_BATCH_SIZE = 2           # batch_size handed to model.fit
SAVE_EVERY = 500               # threshold compared against the agent's target_update_counter
MODEL_NAME = '256x2'           # prefix of saved model file names
MEMORY_FRACTION = 0.8          # per_process_gpu_memory_fraction for the TF session

# --- image settings ---
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 100, 100, 3

# --- environment settings ---
NUM_EPISODES = 20_000
# Full action set, currently disabled:
# NUM_ACTIONS = 10 # MOVE_LEFT, MOVE_RIGHT, MOVE_UP, MOVE_DOWN, MOUSE_LEFT, MOUSE_RIGHT, MOUSE_UP, MOUSE_DOWN, LCLICK, RCLICK
NUM_ACTIONS = 2                # MOVE_UP, LCLICK
DISCOUNT = 0.99                # discount factor used when building Q targets
EPISODE_TIME = 3               # episode time limit, compared against time.time() deltas (seconds)
AGGREGATE_STATS_EVERY = 500    # episodes between reward-statistics refreshes
MIN_REWARD = -100              # minimum-reward threshold checked before saving a model

# --- exploration settings ---
epsilon = 1                    # mutable: decayed by the training loop after each episode
EPSILON_DECAY = 0.9975
MIN_EPSILON = 0.001

In [None]:
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback with an externally controlled step counter.

    The callback opens its own ``FileWriter`` once in ``__init__`` and turns
    ``set_model``, ``on_batch_end`` and ``on_train_end`` into no-ops. Stats are
    written at ``self.step``, which the owner of the callback is expected to
    update (it starts at 1).

    NOTE(review): presumably this exists so that many short ``fit`` calls share
    a single log file -- confirm. It relies on the TF1-style
    ``tf.summary.FileWriter`` and on the parent's private ``_write_logs``;
    verify both exist in the installed TensorFlow version.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.FileWriter(self.log_dir)

    def set_model(self, model):
        """No-op override: the parent's per-model setup is skipped."""
        pass

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's logs at ``self.step`` (the ``epoch`` argument is ignored)."""
        # `logs` defaults to None; unpacking None with ** would raise TypeError.
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """No-op override: nothing is written per batch."""
        pass

    def on_train_end(self, _):
        """No-op override: nothing is done when a ``fit`` call ends."""
        pass

    def update_stats(self, **stats):
        """Write arbitrary named scalar stats at the current ``self.step``."""
        self._write_logs(stats, self.step)

In [None]:
class MraftAgent:
    """Deep Q-learning agent with an online model, a target model and a replay buffer.

    ``self.train_set`` holds ``(state, action, reward, next_state, done)``
    transitions appended by the caller. ``train_in_loop`` is designed to be the
    target of a background thread: it keeps calling ``train`` until
    ``self.terminate`` is set, and flips ``self.training_initialized`` once its
    warm-up fit has finished.
    """

    def __init__(self):
        # NOTE(review): this installs a new default-configured session as the
        # Keras session; a session installed earlier (e.g. one created with GPU
        # options) would no longer be the one used here -- confirm intended.
        self.sess = tf.Session()
        set_session(self.sess)

        # Captured so that other threads can re-enter the same graph.
        self.graph = tf.get_default_graph()

        self.model = self.create_model()

        # The target model starts as an exact copy of the online model.
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())
        self.target_update_counter = 0

        self.train_set = deque(maxlen=TRAIN_SET_SIZE)

        self.tensorboard = ModifiedTensorBoard(log_dir=f'logs/miraft-model-{int(time.time())}')
        self.last_logged_step = 0
        self.cur_step = 0

        self.training_initialized = False

        self.terminate = False

    def create_model(self):
        """Build and compile the Q-network.

        Two stages of Conv2D(256, 3x3) -> ReLU -> MaxPooling2D(2, 2) ->
        Dropout(0.2), followed by Flatten, Dense(64) and a linear
        Dense(NUM_ACTIONS) output. Compiled with MSE loss and Adam(lr=0.001).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential([
            Conv2D(256, (3, 3), input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)),
            Activation('relu'),
            MaxPooling2D(2, 2),
            Dropout(0.2),

            Conv2D(256, (3, 3)),
            Activation('relu'),
            MaxPooling2D(2, 2),
            Dropout(0.2),

            Flatten(),
            Dense(64),
            Dense(NUM_ACTIONS, activation='linear')
        ])

        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])

        return model

    def get_q_values(self, state):
        """Return the online model's Q-values for a single state.

        Args:
            state: array-like exposing ``.shape``; it is given a leading batch
                axis and divided by 255 before prediction.

        Returns:
            The first (only) row of the model's prediction.
        """
        with self.graph.as_default():
            return self.model.predict(np.array(state).reshape(-1, *state.shape)/ 255)[0]

    def train(self):
        """Fit the online model on one random minibatch from the replay buffer.

        Does nothing while fewer than ``MIN_TRAIN_SET_SIZE`` transitions are
        stored. For every sampled transition the Q-value of the taken action is
        replaced by ``reward`` (terminal) or
        ``reward + DISCOUNT * max(target_model Q(next_state))`` (non-terminal).
        The target model is synchronised with the online model whenever
        ``target_update_counter`` exceeds ``SAVE_EVERY``.
        """
        if len(self.train_set) < MIN_TRAIN_SET_SIZE:
            return
        print('Training...')
        minibatch = random.sample(self.train_set, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch]) / 255.
        future_states = np.array([transition[3] for transition in minibatch]) / 255

        with self.graph.as_default():
            current_q_values = self.model.predict(current_states)
            future_q_values = self.target_model.predict(future_states)

        for i, (_state, action, reward, _next_state, done) in enumerate(minibatch):
            if done:
                new_q = reward
            else:
                # Bellman target: immediate reward PLUS the discounted best
                # future value (not their product).
                new_q = reward + DISCOUNT * np.max(future_q_values[i])

            # Only the taken action's Q-value is overwritten; the remaining
            # entries keep the model's own predictions.
            current_q_values[i][action] = new_q

        X = current_states
        y = np.array(current_q_values)

        # True the first time train() runs after tensorboard.step has advanced.
        log_step = self.tensorboard.step > self.last_logged_step
        if log_step:
            self.last_logged_step = self.tensorboard.step

        with self.graph.as_default():
            self.model.fit(
                X,
                y,
                batch_size=TRAIN_BATCH_SIZE,
                verbose=0,
                shuffle=False,
                # The TensorBoard callback is deliberately left disabled here.
                callbacks=None
            )

        if log_step:
            self.target_update_counter += 1

        if self.target_update_counter > SAVE_EVERY:
            # get_weights must be *called*; passing the bound method would fail.
            self.target_model.set_weights(self.model.get_weights())
            # Restart the count so the next sync waits for the threshold again.
            self.target_update_counter = 0

    def train_in_loop(self):
        """Warm the model up with one random sample, then train until terminated.

        Intended as a thread target. Sets ``self.training_initialized`` after
        the warm-up fit and exits once ``self.terminate`` becomes true.
        """
        # One throw-away fit on random data so the first real fit is not the
        # one that pays the setup cost.
        X = np.random.uniform(size=(1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)).astype(np.float32)
        y = np.random.uniform(size=(1, NUM_ACTIONS)).astype(np.float32)
        # Bind this thread to the agent's graph and session before touching the model.
        with self.graph.as_default():
            set_session(self.sess)
            self.model.fit(X, y, verbose=False, batch_size=1)

        self.training_initialized = True

        while not self.terminate:
            self.train()
            time.sleep(0.01)
            

In [None]:
class MraftEnv:
    """Stub environment that serves frames from disk and random outcomes.

    ``next_frame`` reads ``images/img.png`` and ``process`` returns two random
    booleans, so actions do not yet influence anything; the action branches in
    ``step`` are placeholders.
    """

    def __init__(self):
        # Wall-clock time at which the current episode started; set by reset().
        self.episode_start = 0

    def reset(self):
        """Start a new episode and return its first frame.

        Also restarts the episode timer that ``step`` checks against
        ``EPISODE_TIME``.
        """
        self.episode_start = time.time()
        return self.next_frame()

    def step(self, action):
        """Apply ``action`` and return ``(frame, reward, done)``.

        Rewards: 100 (and ``done``) when the frame is judged both destroyed and
        picked up, -1 when only destroyed, -10 otherwise. ``done`` is also set
        once more than ``EPISODE_TIME`` seconds have passed since ``reset``.
        """
        if action == 0: # move forward
            pass
            # press and hold w for 1 second
            # delay by however much to keep things synchronous
        elif action == 1: # left click
            pass
            # press and hold for 1 second
            # delay by however much to keep things synchronous

        frame = self.next_frame()
        destroyed, picked_up = self.process(frame)

        if destroyed and picked_up:
            done, reward = True, 100
        elif destroyed:
            done, reward = False, -1
        else:
            done, reward = False, -10

        # Use the instance's own timer rather than a same-named global that
        # only exists if the calling script happens to define it.
        if time.time() - self.episode_start > EPISODE_TIME:
            done = True

        return frame, reward, done

    def next_frame(self):
        """Load ``images/img.png`` and resize it to IMG_HEIGHT x IMG_WIDTH pixels.

        Raises:
            FileNotFoundError: if OpenCV could not read the image.
        """
        img = cv2.imread('images/img.png')
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError("cv2.imread could not read 'images/img.png'")
        # cv2.resize expects dsize as (width, height).
        img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
        return img

    def process(self, frame):
        """Return a random ``(destroyed, picked_up)`` pair; ``frame`` is ignored."""
        return random.randint(0, 1) == 1, random.randint(0, 1) == 1

In [None]:
# Seed Python, NumPy and TensorFlow with the same value.
random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)

# Cap this process's share of GPU memory and install the session for Keras.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
session_config = tf.ConfigProto(gpu_options=gpu_options)
backend.set_session(tf.Session(config=session_config))

# Directory that receives saved models; created only when it is missing.
os.makedirs('models', exist_ok=True)

In [None]:
env = MraftEnv()
agent = MraftAgent()

# Training runs in a background thread while this thread collects experience.
train_thread = Thread(target=agent.train_in_loop, daemon=True)
train_thread.start()

ep_rewards = []

# Block until the agent's warm-up fit has completed.
while not agent.training_initialized:
    time.sleep(0.01)

for ep in range(NUM_EPISODES):
    print(f'Episode {ep}')
    # Episode start timestamp, kept at module level so that any code looking
    # up `episode_start` as a global keeps working.
    episode_start = time.time()

    agent.tensorboard.step = ep

    episode_reward = 0

    current_state = env.reset()

    done = False
    while not done:
        # Epsilon-greedy: exploit the model with probability 1 - epsilon.
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(agent.get_q_values(current_state))
        else:
            action = random.randint(0, NUM_ACTIONS - 1)

        next_state, reward, done = env.step(action)

        episode_reward += reward

        agent.train_set.append((current_state, action, reward, next_state, done))

        # Advance the state; otherwise every action and every stored
        # transition would be based on the episode's first frame.
        current_state = next_state

    ep_rewards.append(episode_reward)

    # Refresh reward statistics over the most recent episodes.
    if ep % AGGREGATE_STATS_EVERY == 0 or ep == 1:
        recent_rewards = ep_rewards[-AGGREGATE_STATS_EVERY:]
        average_reward = sum(recent_rewards) / len(recent_rewards)
        # min/max are plain extrema; only the average is divided by the count.
        min_reward = min(recent_rewards)
        max_reward = max(recent_rewards)
        # TensorBoard logging is currently disabled:
        # agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward)

        # Save only when the stats were just refreshed, so the file name
        # always reflects current numbers.
        if min_reward >= MIN_REWARD:
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg__{min_reward:_>7.2f}min.h5')

    if epsilon > MIN_EPSILON:
        epsilon = max(MIN_EPSILON, EPSILON_DECAY * epsilon)

# Stop the background trainer; in a notebook the kernel outlives this cell, so
# a daemon thread would otherwise keep training indefinitely.
agent.terminate = True
train_thread.join()