In [1]:
# Install the Box2D Python bindings.
# NOTE(review): presumably required by the 'LunarLander-v2' environment created later in this notebook — confirm.
!pip3 install box2d-py



In [2]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# HIDE OUTPUT
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1



In [3]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Start a pyvirtualdisplay Display (invisible, 1400x900) so that env.render()
# has a screen to draw on. NOTE(review): presumably needed because the notebook
# host is headless — confirm.
# The name `display` is taken by this object, which is why IPython's display
# module is imported above under the alias `ipythondisplay`.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of a gym environment
and to display the recording in the notebook.
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first recorded ``video/*.mp4`` file in the notebook output.

  Looks for MP4 files in the ``video`` directory relative to the current
  working directory. The first match returned by ``glob`` is read, base64
  encoded and rendered as an inline HTML ``<video>`` element. If no file is
  found, ``"Could not find video"`` is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is sufficient (the original 'r+b' also demanded
  # write permission), and the context manager guarantees the handle is closed.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``.

  The wrapper is created with ``force=True``.
  """
  return Monitor(env, './video', force=True)

In [4]:
# Training class: imports and TensorFlow setup
import tensorflow as tf
import gym
from keras import models
from keras import layers
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import numpy as np
from tensorflow import keras

# Clear the Keras backend session before any network is built in this notebook.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True) # Enable XLA
# Last expression of the cell: its value (the logical devices TensorFlow sees)
# is shown as the cell output.
tf.config.list_logical_devices()

[LogicalDevice(name='/device:CPU:0', device_type='CPU'),
 LogicalDevice(name='/device:GPU:0', device_type='GPU')]

In [6]:
class LunarLanderTrain:
    """Deep Q-learning trainer with a replay buffer and a target network.

    The agent follows an epsilon-greedy policy. Epsilon is lowered by
    ``epsilon_decay`` after every episode and is clamped to ``epsilon_min``
    whenever an action is chosen. Each transition is appended to a bounded
    replay buffer, and after every environment step the online network
    (``train_network``) is fitted on a random mini-batch. At the end of each
    episode the online network is saved to disk and its weights are copied
    into ``target_network``.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` returning the
            4-tuple ``(state, reward, done, info)``, ``observation_space.shape``
            and a discrete ``action_space.n``.
        eps: Initial exploration rate (probability of taking a random action).
    """

    def __init__(self, env, eps=1):
        self.env = env
        self.gamma = 0.99  # discount factor applied to bootstrapped Q-values

        self.epsilon = eps
        self.epsilon_decay = 0.0075  # subtracted from epsilon after each episode
        self.epsilon_min = 0.01  # floor enforced in get_action()

        # (sic) the misspelled attribute name is kept so that any code reading
        # it keeps working.
        self.learing_rate = 0.001
        self.replay_buffer = deque(maxlen=20000)

        self.episode_num = 400  # number of episodes run by start()
        self.num_pick_from_buffer = 32  # mini-batch size sampled from the buffer

        self.train_network = self.create_network()
        self.target_network = self.create_network()
        self.target_network.set_weights(self.train_network.get_weights())

    def create_network(self):
        """Build and compile the Q-network: observation in, one Q-value per action out."""
        # Use the environment passed to the constructor rather than whatever
        # global `env` happens to exist in the notebook, and take every layer
        # from tf.keras so the layers and the Model come from the same package.
        inputs = keras.Input(shape=self.env.observation_space.shape)
        x = keras.layers.Dense(32, activation='relu')(inputs)
        x = keras.layers.Dense(64, activation='relu')(x)
        outputs = keras.layers.Dense(self.env.action_space.n, activation='linear')(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learing_rate))
        return model

    def get_action(self, state):
        """Pick an action for ``state`` with the epsilon-greedy policy.

        Side effect: ``self.epsilon`` is clamped up to ``self.epsilon_min``.
        """
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.rand(1) < self.epsilon:
            # Explore: sample uniformly over the environment's real action
            # count instead of a hard-coded 4.
            return np.random.randint(0, self.env.action_space.n)
        # Exploit: add a batch axis, predict Q-values, take the best action.
        q_values = self.train_network.predict(state.reshape((1,) + state.shape), verbose=0)
        return np.argmax(q_values)

    def train_from_buffer(self):
        """Fit the online network on one random mini-batch from the replay buffer.

        Does nothing until the buffer holds at least ``num_pick_from_buffer``
        transitions.
        """
        if len(self.replay_buffer) < self.num_pick_from_buffer:
            return
        samples = random.sample(self.replay_buffer, self.num_pick_from_buffer)
        states, actions, rewards, newstates, dones = (np.array(column) for column in zip(*samples))
        not_done = ~np.asarray(dones, dtype=bool)

        # Start from the network's own predictions so that only the Q-value of
        # the action actually taken contributes to the MSE loss.
        targets = self.train_network.predict(states, verbose=0)
        q_futures = self.target_network.predict(newstates, verbose=0).max(axis=1)
        # Bellman target: r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise.
        targets[np.arange(self.num_pick_from_buffer), actions] = rewards + q_futures * self.gamma * not_done
        self.train_network.fit(states, targets, epochs=1, verbose=0)

    def orginal_try(self, state, eps):
        """Run one training episode starting from ``state``.

        Args:
            state: Initial observation, as returned by ``env.reset()``.
            eps: Episode index; used in the checkpoint file name and the log line.

        At the end of the episode the online network is saved to
        ``trainNetworkInEPS<eps>.h5``, the target network is synchronised,
        a summary line is printed and epsilon is decayed.
        """
        # -inf rather than a magic sentinel, so the reported maximum is right
        # even when every step reward is very negative.
        max_reward = float('-inf')

        while True:
            action = self.get_action(state)
            new_state, reward, done, _ = self.env.step(action)

            max_reward = max(max_reward, reward)

            self.replay_buffer.append([state, action, reward, new_state, done])
            self.train_from_buffer()

            state = new_state
            if done:
                break

        self.train_network.save('trainNetworkInEPS{}.h5'.format(eps))
        self.target_network.set_weights(self.train_network.get_weights())

        # `reward` here is the reward of the final step of the episode.
        print("it = {}, reward = {}, max_reward = {}, eps = {}".format(eps, reward, max_reward, max(self.epsilon_min, self.epsilon)))
        self.epsilon -= self.epsilon_decay

    def start(self):
        """Run ``episode_num`` training episodes on ``self.env``."""
        for eps in range(self.episode_num):
            state = self.env.reset()
            self.orginal_try(state, eps)

In [None]:
# Training: create the environment and the agent, then run the whole training
# loop starting from freshly initialised networks.
env = gym.make('LunarLander-v2')
dqn = LunarLanderTrain(env = env)
dqn.start()

it = 0, reward = -100, max_reward = 0.037916706947698914, eps = 1
it = 1, reward = -100, max_reward = 27.424044017197247, eps = 0.9925
it = 2, reward = -100, max_reward = 12.143199333352339, eps = 0.9850000000000001
it = 3, reward = -100, max_reward = 20.78168163820689, eps = 0.9775000000000001
it = 4, reward = -100, max_reward = 6.663352032629548, eps = 0.9700000000000002
it = 5, reward = -100, max_reward = 12.492645216341602, eps = 0.9625000000000002
it = 6, reward = -100, max_reward = 17.212395846288473, eps = 0.9550000000000003
it = 7, reward = -100, max_reward = 76.13961237246443, eps = 0.9475000000000003
it = 8, reward = -100, max_reward = 6.2832909877379, eps = 0.9400000000000004
it = 9, reward = -100, max_reward = 7.185304048932903, eps = 0.9325000000000004
it = 10, reward = -100, max_reward = 12.483068464131122, eps = 0.9250000000000005
it = 11, reward = -100, max_reward = 9.74416172720252, eps = 0.9175000000000005
it = 12, reward = -100, max_reward = 7.283342366448182, eps = 

In [6]:
# Training (resumed): continue from the checkpoint saved after episode 125.
env = gym.make('LunarLander-v2')
dqn = LunarLanderTrain(env = env)
dqn.train_network.load_weights('trainNetworkInEPS125.h5')
# The constructor copied the *random* initial weights into the target network.
# Re-sync it with the loaded checkpoint; otherwise the whole first episode
# bootstraps from a random target network and degrades the loaded policy.
dqn.target_network.set_weights(dqn.train_network.get_weights())
# Exploration rate reached after 125 decay steps of 0.0075 starting from 1.0.
dqn.epsilon = 0.06250000000000283
# NOTE: start() numbers episodes from 0 again, so the checkpoints it writes
# overwrite trainNetworkInEPS0.h5, trainNetworkInEPS1.h5, ... from the earlier run.
dqn.start()

it = 0, reward = 100, max_reward = 100, eps = 0.06250000000000283
it = 1, reward = -100, max_reward = 3.4805655219744382, eps = 0.05500000000000283
it = 2, reward = -100, max_reward = 5.702614364544741, eps = 0.04750000000000283
it = 3, reward = -100, max_reward = 4.784370425433708, eps = 0.04000000000000283
it = 4, reward = -100, max_reward = 2.537401568147119, eps = 0.03250000000000283
it = 5, reward = -100, max_reward = 3.358698749925492, eps = 0.025000000000002832
it = 6, reward = -100, max_reward = 5.232558578803531, eps = 0.017500000000002833
it = 7, reward = -100, max_reward = 3.7754941483805853, eps = 0.010000000000002833
it = 8, reward = -0.2807337148370436, max_reward = 4.591073961497, eps = 0.01
it = 9, reward = -100, max_reward = 3.350887870698301, eps = 0.01
it = 10, reward = -100, max_reward = 2.472435847151144, eps = 0.01
it = 11, reward = -100, max_reward = 2.7898416531617274, eps = 0.01
it = 12, reward = -100, max_reward = 1.7049621453568318, eps = 0.01
it = 13, reward

KeyboardInterrupt: ignored

In [34]:
# Evaluation: play one episode with a saved checkpoint and record it to video.
env = wrap_env(gym.make('LunarLander-v2'))
dqn = LunarLanderTrain(env = env, eps=0)
# get_action() clamps epsilon up to epsilon_min (0.01 by default), so eps=0
# alone would still take 1% random actions. Zero the floor for a greedy rollout.
dqn.epsilon_min = 0
# Best checkpoints (presumably episode numbers; the one in parentheses is used):
# {38, 39, 42, 43, 45, 46, 47, 49, (51), 53, 54, 55, 56}
dqn.train_network.load_weights('trainNetworkInEPS51.h5')

state = env.reset()

while True:
    env.render()
    action = dqn.get_action(state)
    state, reward, done, info = env.step(action)
    if done:
        break

env.close()
show_video()
# Reward of the final step only, not the episode total.
print(reward)

100


In [None]:
state
# Layout of the observation vector, per the author's notes:
# the first 2 values are the position along the x axis and the y axis (height),
# the next 2 are the velocities along x and y,
# then the lander's angle and its angular velocity,
# and finally the left and right contact-point flags (bool).

array([ 0.3058899 ,  0.02807165,  0.01110496, -0.77348167,  0.32755476,
        4.867282  ,  1.        ,  1.        ], dtype=float32)

In [None]:
# Inspect the environment's action space (displayed as the cell output).
env.action_space

Discrete(4)