Визуализация
https://colab.research.google.com/github/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_12_01_ai_gym.ipynb#scrollTo=XDKGJ9A3O8fT

In [None]:
# Notebook shell escapes that install the Python and system packages used below.
# Lines ending in "> /dev/null 2>&1" discard both stdout and stderr.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

# HIDE OUTPUT
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
# NOTE(review): unlike the neighbouring lines, stdout is not redirected to
# /dev/null here, so pip's output stays visible -- confirm this is intended.
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [None]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start an invisible (visible=0) 1400x900 virtual display.
# NOTE(review): presumably needed so env.render() works without a physical
# screen -- confirm against the pyvirtualdisplay documentation.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first ``video/*.mp4`` recording in the notebook output.

  The file is base64-encoded and inlined into an HTML ``<video>`` element that
  is handed to ``ipythondisplay.display``. When no ``.mp4`` file matches,
  ``"Could not find video"`` is printed instead and nothing is displayed.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is sufficient (no write permission required), and
  # the context manager guarantees the file handle is closed after reading.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return ``env`` wrapped in a ``Monitor`` that writes to ``./video``."""
  # force=True is forwarded to Monitor exactly as before.
  return Monitor(env, './video', force=True)

In [None]:
# Класс для обучения
import tensorflow as tf
import gym
from keras import models
from keras import layers
from tensorflow.keras.optimizers import Adam
from collections import deque
import random
import numpy as np
from tensorflow import keras


# Reset Keras' global session state before any model is built.
tf.keras.backend.clear_session()
tf.config.optimizer.set_jit(True) # Enable XLA
# The returned list of logical devices is not stored or printed here.
tf.config.list_logical_devices()


class MountainCarTrain:
    """Deep Q-learning trainer for a gym environment with discrete actions.

    Two identically built Keras networks are kept. ``train_network`` is fitted
    on a mini-batch sampled from the replay buffer after every environment
    step; ``target_network`` supplies the bootstrap Q-values and receives a
    copy of the training weights at the end of every episode.
    """

    def __init__(self, env, eps=1):
        """Store the environment, set hyper-parameters and build both networks.

        Args:
            env: Environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()`` and ``step(action)``.
            eps: Initial exploration rate of the epsilon-greedy policy.
        """
        self.env = env
        self.gamma = 0.99  # discount factor applied to future Q-values

        self.epsilon = eps
        self.epsilon_decay = 0.05  # subtracted from epsilon after each episode
        self.epsilon_min = 0.01  # floor enforced in get_action()

        # NOTE: the attribute name keeps its historical spelling because
        # external code may read it.
        self.learing_rate = 0.001
        self.replay_buffer = deque(maxlen=20000)

        self.episode_num = 400
        self.iteration_num = 201   #max is 200
        self.num_pick_from_buffer = 32  # mini-batch size

        self.train_network = self.create_network()
        self.target_network = self.create_network()
        self.target_network.set_weights(self.train_network.get_weights())

    def create_network(self):
        """Build and compile a dense network with one linear output per action.

        Returns:
            A compiled ``keras.Model`` (MSE loss, Adam optimizer) whose input
            shape is the environment's observation shape.
        """
        # Use the environment given to the constructor, not a global ``env``.
        inputs = keras.Input(shape=self.env.observation_space.shape)
        x = layers.Dense(32, activation='relu')(inputs)
        x = layers.Dense(64, activation='relu')(x)
        outputs = layers.Dense(self.env.action_space.n, activation='linear')(x)
        model = keras.Model(inputs=inputs, outputs=outputs)
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learing_rate))
        return model

    def get_action(self, state):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Observation array; it is given a leading batch axis of
                size one before being passed to the network.

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        # Epsilon is decremented without a bound after each episode, so clamp
        # it to the configured floor before using it.
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.rand(1) < self.epsilon:
            # Sample over the environment's real action count rather than a
            # hard-coded 3, matching the size of the network's output layer.
            return np.random.randint(0, self.env.action_space.n)
        q_values = self.train_network.predict(
            state.reshape((1,) + state.shape), verbose=0)
        return np.argmax(q_values)

    def train_from_buffer(self):
        """Fit ``train_network`` on one mini-batch sampled from the buffer.

        Returns immediately when the buffer holds fewer transitions than
        ``num_pick_from_buffer``.
        """
        if len(self.replay_buffer) < self.num_pick_from_buffer:
            return
        samples = random.sample(self.replay_buffer, self.num_pick_from_buffer)
        states, actions, rewards, newstates, dones = (
            np.array(column) for column in zip(*samples))

        # Start from the current predictions so that only the Q-value of the
        # action actually taken contributes to the loss.
        targets = self.train_network.predict(states, verbose=0)
        Q_futures = self.target_network.predict(newstates, verbose=0).max(axis=1)
        # Terminal transitions keep the plain reward; all others add the
        # discounted best Q-value of the next state.
        targets[(np.arange(self.num_pick_from_buffer), actions)] = (
            rewards + self.gamma * Q_futures * ~dones)
        self.train_network.fit(states, targets, epochs=1, verbose=0)

    def orginal_try(self, state, eps):
        """Run one training episode starting from ``state``.

        Each step is stored in the replay buffer and followed by one call to
        ``train_from_buffer``. Afterwards the target network is synchronised
        with the training network and epsilon is reduced by ``epsilon_decay``.
        When the episode finishes before step index 199 the training network
        is saved to ``trainNetworkInEPS<eps>.h5``.

        Args:
            state: Initial observation, as returned by ``env.reset()``.
            eps: Episode index; used only in log messages and the file name.
        """
        reward_sum = 0
        max_position = -99

        for i in range(self.iteration_num):
            action = self.get_action(state)
            new_state, reward, done, _ = self.env.step(action)

            max_position = max(max_position, new_state[0])

            # Extra reward once the first state component reaches 0.5.
            if new_state[0] >= 0.5:
                reward += 10

            self.replay_buffer.append([state, action, reward, new_state, done])
            self.train_from_buffer()

            reward_sum += reward
            state = new_state
            if done:
                break

        # An episode that lasts until step index 199 (or longer) is reported
        # as failed; anything shorter counts as a success and is checkpointed.
        if i >= 199:
            print("Failed. epsoide = {}".format(eps))
        else:
            print("Success. epsoide = {}, used {} iterations!".format(eps, i))
            self.train_network.save('trainNetworkInEPS{}.h5'.format(eps))

        self.target_network.set_weights(self.train_network.get_weights())

        print("eps = {}, reward = {}, maxPosition = {}".format(max(self.epsilon_min, self.epsilon), reward_sum, max_position))
        self.epsilon -= self.epsilon_decay

    def start(self):
        """Train for ``episode_num`` episodes, resetting the environment each time."""
        for eps in range(self.episode_num):
            state = self.env.reset()
            self.orginal_try(state, eps)

In [None]:
# Training: build the MountainCar-v0 environment and run the full training
# loop with the default starting exploration rate (eps=1).
env = gym.make('MountainCar-v0')
dqn = MountainCarTrain(env = env)
dqn.start()

In [None]:
# Evaluation: replay one episode with saved weights and record it to video.
env = wrap_env(gym.make("MountainCar-v0"))
dqn = MountainCarTrain(env=env, eps=0)
# Best checkpoints: 148, 150-153, 149
dqn.train_network.load_weights('drive/MyDrive/MountainCar Models/trainNetworkInEPS148.h5')

state = env.reset()

# Render, act and step until the environment reports the episode is over.
done = False
while not done:
    env.render()
    action = dqn.get_action(state)
    state, reward, done, info = env.step(action)

env.close()
show_video()