In [1]:
import os
# An empty CUDA_VISIBLE_DEVICES hides every CUDA GPU from libraries that are
# loaded afterwards, so this must run before TensorFlow is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
%matplotlib inline

import gym
import gym.wrappers
import itertools
import numpy as np
import os
import random
import tensorflow as tf

  return f(*args, **kwds)


В данной работе мы реализуем алгоритм Q-learning и научим модель эффективно играть в [Breakout](https://en.wikipedia.org/wiki/Breakout_(video_game)).

In [3]:
# Resolution of the raw game frames, see https://gym.openai.com/envs/Breakout-v0/
input_resolution = [210, 160, 3]
# Resolution after preprocessing (see below)
result_resolution = [80, 80] 

Создадим функцию, которая будет осуществлять предобработку извлечённых из игры кадров. В модуле [tf.image](https://www.tensorflow.org/api_docs/python/tf/image) содержится множество функций для работы с изображениями — воспользуемся ими.

In [4]:
def create_state_processor():
    """Build the frame-preprocessing ops and return a function that runs them.

    The ops read a uint8 frame shaped like the module-level
    ``input_resolution``, crop it, resize it to the first two entries of the
    module-level ``result_resolution`` and, when that resolution has no
    channel entry or a channel entry equal to 1, convert it to grayscale and
    squeeze away the size-1 dimensions.

    Returns:
        process(sess, state): runs the ops in ``sess`` on the raw frame
        ``state`` and returns the preprocessed frame.
    """
    # Spatial part of the resolution after preprocessing.
    target_size = tf.constant(result_resolution[:2])
    # Grayscale is used when the channel count is missing or equal to 1.
    wants_grayscale = not (len(result_resolution) >= 3 and result_resolution[2] != 1)

    with tf.variable_scope("state_processor"):
        raw_frame = tf.placeholder(shape=input_resolution, dtype=tf.uint8)

        # Cut off the border and the interface: keep a 160x160 window that
        # starts at row 34, column 0.
        cropped = tf.image.crop_to_bounding_box(raw_frame, 34, 0, 160, 160)
        # Rescale the cropped frame.
        processed = tf.image.resize_images(
            cropped, target_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

        if wants_grayscale:
            processed = tf.image.rgb_to_grayscale(processed)
            processed = tf.squeeze(processed)

    def process(sess, state):
        """Run the preprocessing ops on a single raw frame."""
        return sess.run(processed, {raw_frame: state})

    return process

Подключим [OpenAI Gym](https://github.com/openai/gym). Gym — это специальная среда для запуска симуляций, предназначенных для обучения алгоритмов reinforcement learning. Также нам понадобится [atari-py](https://github.com/openai/atari-py), содержащая в себе эмуляторы игр Atari.

In [5]:
# Pick the game from the gym library.
env = gym.envs.make("Breakout-v0")

# Experiment directory. os.path.abspath() does not expand "~", so it has to be
# expanded explicitly; otherwise a literal "~" directory is created inside the
# current working directory.
experiment_dir = os.path.abspath(
    os.path.expanduser("~/Lesson5_practice/atari-experiments/{}".format(env.spec.id)))
# Directory that receives the video recordings of the episodes played by the
# algorithm.
monitor_path = os.path.join(experiment_dir, "monitor")

# exist_ok avoids the check-then-create race of os.path.exists() + makedirs().
os.makedirs(monitor_path, exist_ok=True)

# Video recording frequency: only counts divisible by this value are recorded.
record_video_every = 2

# Monitor writes the recorded videos to disk. The callable receives a counter
# (presumably the episode id -- confirm against the gym Monitor docs) and
# enables recording for every `record_video_every`-th count above 2.
env = gym.wrappers.Monitor(
    env,
    monitor_path,
    video_callable=lambda count: count % record_video_every == 0 and count > 2,
    resume=True)

In [6]:
# Action space (https://gym.openai.com/envs/Breakout-v0/)
actions = [0,1,2,3]

In [7]:
def build_q_estimator(session, resolution, action_space_size=5, name = 'q_estimator'):
    """Build a convolutional Q-value network inside variable scope ``name``.

    Args:
        session: session whose graph is handed to the summary FileWriter.
        resolution: list with the spatial size of one preprocessed frame; the
            network input has shape ``[None] + resolution + [4]``.
        action_space_size: number of outputs of the final dense layer.
        name: variable scope that holds all ops and variables of the network.

    Returns:
        A ``(predict, update, writer)`` tuple:
        predict(session, frame) returns the network output for ``frame``;
        update(session, frames, reward, action) runs one optimizer step on the
        mean squared difference between ``reward`` (the regression target) and
        the output selected by ``action``, logs the summaries and returns the
        loss value;
        writer is the ``tf.summary.FileWriter`` used for logging.
    """
    with tf.variable_scope(name):
        # Placeholders for the frame stacks, the regression targets and the
        # indices of the chosen actions.
        frame_pl = tf.placeholder(tf.float32, shape=[None] + resolution + [4], name="frame")
        reward_pl = tf.placeholder(tf.float32, shape=[None], name="reward")
        action_pl = tf.placeholder(tf.int32, shape=[None], name="actions")

        # Convolutional stack, following https://arxiv.org/pdf/1312.5602.pdf
        net = tf.layers.conv2d(frame_pl, 32, 8, strides=(4,4), activation=tf.nn.relu, name="conv_1")
        net = tf.layers.conv2d(net, 64, 4, strides=(2,2), activation=tf.nn.relu, name="conv_2")
        net = tf.layers.conv2d(net, 128, 3, strides=(1,1), activation=tf.nn.relu, name="conv_3")
        net = tf.contrib.layers.flatten(net)
        net = tf.layers.dense(net, 512, activation=tf.nn.relu, name="dense1")
        q_values = tf.layers.dense(net, action_space_size, name="output")

        # Keep only the output that belongs to the chosen action of each row.
        chosen_mask = tf.one_hot(action_pl, action_space_size)
        chosen_q = tf.reduce_sum(tf.multiply(q_values, chosen_mask), axis=1)

        # Mean squared error between the targets and the chosen outputs.
        loss = tf.reduce_mean(tf.squared_difference(reward_pl, chosen_q))

        step_tensor = tf.train.get_or_create_global_step()
        train_op = tf.train.AdamOptimizer(0.00025).minimize(loss, global_step=step_tensor)

        writer = tf.summary.FileWriter('atari-experiments/Breakout-v0/tfboard', session.graph)
        summary = tf.summary.merge([
            tf.summary.scalar("loss", loss),
            tf.summary.histogram("q_values", q_values),
        ])

    def predict(session, frame):
        """Return the network output for a batch of frame stacks."""
        return session.run(q_values, {frame_pl: frame})

    def update(session, frames, reward, action):
        """Run one training step, log its summaries and return the loss."""
        fetches = [loss, train_op, summary, step_tensor]
        loss_value, _, logged, step = session.run(
            fetches,
            {frame_pl: frames, reward_pl: reward, action_pl: action})
        writer.add_summary(logged, step)
        return loss_value

    return predict, update, writer
        

In [8]:
def make_policy(action_space_size, predict):
    """Create an epsilon-greedy policy on top of a Q-value predictor.

    Args:
        action_space_size: number of available actions; random actions are
            drawn from ``range(action_space_size)``.
        predict: callable ``predict(session, batch)`` returning Q-values for a
            batch of frame stacks.

    Returns:
        A function ``policy(session, frames, epsilon)`` that returns a random
        action index with probability ``epsilon`` and otherwise the argmax of
        the predicted Q-values for ``frames``.
    """
    def epsilon_greedy(session, frames, epsilon):
        explore = np.random.choice([True, False], p=[epsilon, 1.0 - epsilon])
        if not explore:
            # The predictor works on batches, so add a leading batch axis.
            batch = np.expand_dims(frames, 0)
            return np.argmax(predict(session, batch))
        return np.random.choice(action_space_size)

    return epsilon_greedy

In [9]:
def copy_model_parameters(sess, estimator1, estimator2):
    """Copy the trainable variables of one estimator into another.

    The assign ops are built once per (graph, source, target) combination and
    stored in a graph collection. Later calls reuse them, so the graph no
    longer grows by one set of assign ops on every synchronisation.

    Args:
        sess: session used to run the copy; the ops live in ``sess.graph``.
        estimator1: variable-scope name of the source estimator.
        estimator2: variable-scope name of the target estimator.

    Raises:
        ValueError: if the two scopes hold a different number of trainable
            variables, in which case a pairwise copy would silently be partial.
    """
    graph = sess.graph
    collection_key = "copy_model_parameters/{}->{}".format(estimator1, estimator2)
    update_ops = graph.get_collection(collection_key)

    if not update_ops:
        with graph.as_default():
            trainable = tf.trainable_variables()

            def scoped(scope_name):
                # Match on "<scope>/" so that e.g. "q_estimator" does not also
                # pick up variables from a sibling scope "q_estimator_1".
                prefix = scope_name if scope_name.endswith("/") else scope_name + "/"
                selected = [v for v in trainable if v.name.startswith(prefix)]
                return sorted(selected, key=lambda v: v.name)

            source_vars = scoped(estimator1)
            target_vars = scoped(estimator2)
            if len(source_vars) != len(target_vars):
                raise ValueError(
                    "Cannot copy parameters: '{}' has {} trainable variables, '{}' has {}".format(
                        estimator1, len(source_vars), estimator2, len(target_vars)))

            # Variables are paired by their sorted names.
            update_ops = [dst.assign(src) for src, dst in zip(source_vars, target_vars)]
            for op in update_ops:
                graph.add_to_collection(collection_key, op)

    sess.run(update_ops)

In [10]:
def learn_to_play(session,
                  num_episodes = 3000,
                  memory_size = 500000,
                  init_memory_size = 1000,
                  discount_factor = 0.99,
                  epsilon_start = 1.0,
                  epsilon_end = 0.0,
                  epsilon_decay_steps=100000,
                  batch_size=128):
    """Train the Q-estimator on the module-level ``env`` with experience replay.

    Args:
        session: TensorFlow session used to build and run the networks.
        num_episodes: number of episodes to play.
        memory_size: maximum number of transitions kept in the replay memory.
        init_memory_size: number of environment steps taken before training
            starts; epsilon stays at ``epsilon_start`` during that phase.
        discount_factor: discount applied to the target network's best
            Q-value of the next state.
        epsilon_start: initial exploration rate.
        epsilon_end: final exploration rate.
        epsilon_decay_steps: number of steps over which epsilon is linearly
            interpolated from ``epsilon_start`` to ``epsilon_end``.
        batch_size: number of transitions sampled per training step.

    Raises:
        ValueError: if ``batch_size`` exceeds ``memory_size`` (a batch could
            never be sampled).
    """
    if batch_size > memory_size:
        raise ValueError(
            "batch_size ({}) must not exceed memory_size ({})".format(batch_size, memory_size))

    predict, update, writer = build_q_estimator(session, result_resolution, len(actions))
    target_predict, _, target_writer = build_q_estimator(
        session, result_resolution, len(actions), name='target_estimator')
    process = create_state_processor()
    session.run(tf.global_variables_initializer())
    # Both networks are initialised independently; start the target network
    # from the same weights as the online one.
    copy_model_parameters(session, 'q_estimator', 'target_estimator')

    # Replay memory, used as a ring buffer once it is full: the oldest entry
    # is overwritten in O(1) instead of shifting the list with pop(0).
    memory = []
    oldest = 0

    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)
    eps_gready_policy = make_policy(len(actions), predict)

    global_step = 0

    try:
        # range end is inclusive of num_episodes so exactly that many are played.
        for i_episode in range(1, num_episodes + 1):
            print("Do not disturb. Playing games, i_episode={}".format(i_episode))

            frame = env.reset()
            processed_frame = process(session, frame)
            # The state is a stack of 4 frames; at reset it is the first frame repeated.
            current_frames = np.stack([processed_frame] * 4, axis=2)

            while True:
                # Epsilon stays at its start value until training begins, then decays.
                epsi = epsilons[min(epsilon_decay_steps - 1, max(global_step - init_memory_size, 0))]
                episode_summary = tf.Summary()
                episode_summary.value.add(simple_value=epsi, tag="epsilon")
                writer.add_summary(episode_summary, global_step)

                action_index = eps_gready_policy(session, current_frames, epsi)
                frame, reward, done, _ = env.step(actions[action_index])
                processed_frame = process(session, frame)
                # Drop the oldest frame of the stack and append the new one.
                next_frames = np.append(
                    current_frames[:, :, 1:], np.expand_dims(processed_frame, 2), axis=2)

                # Store the action *index*: the estimator's update one-hot
                # encodes it against the network outputs, which are indexed
                # by position in `actions`, not by the environment action id.
                transition = (current_frames, reward, action_index, next_frames, done)
                if len(memory) < memory_size:
                    memory.append(transition)
                else:
                    memory[oldest] = transition
                    oldest = (oldest + 1) % memory_size

                global_step += 1
                current_frames = next_frames

                # Synchronise before the early exits below so that a sync is
                # not skipped when the step coincides with the end of an episode.
                if global_step % 1000 == 0:
                    print('copy model parameters')
                    copy_model_parameters(session, 'q_estimator', 'target_estimator')

                if done:
                    break

                # Keep collecting experience until the warm-up is over and a
                # full batch can be sampled.
                if global_step < init_memory_size or len(memory) < batch_size:
                    continue

                samples = random.sample(memory, batch_size)
                frames_batch, reward_batch, action_batch, next_frames_batch, done_batch = map(
                    np.array, zip(*samples))

                # Target: reward plus the discounted best next Q-value from the
                # target network, with the bootstrap term zeroed for terminal steps.
                q_values_next = target_predict(session, next_frames_batch)
                not_done = np.logical_not(done_batch).astype(np.float32)
                q_values_batch = reward_batch + not_done * discount_factor * np.amax(q_values_next, axis=1)

                # Perform gradient descent update
                update(session, frames_batch, q_values_batch, action_batch)
    finally:
        # Flush and release the summary files even if training is interrupted.
        writer.close()
        target_writer.close()
            

In [None]:
# Clear the default graph before the networks are built (presumably so that
# this cell can be re-run without clashing with a previously built graph).
tf.reset_default_graph()
# The session is closed when the `with` block exits.
with tf.Session() as session:
    learn_to_play(session)