# Installization & Import

In [1]:
!pip install tensorflow==2.3.1 gym gym-retro keras-rl2 stable-baselines3



You should consider upgrading via the 'g:\extra_environment\python-virtualenv\venv\gym\scripts\python.exe -m pip install --upgrade pip' command.



Collecting gym-retro
  Downloading gym_retro-0.8.0-cp37-cp37m-win_amd64.whl (152.0 MB)
Installing collected packages: gym-retro
Successfully installed gym-retro-0.8.0


In [1]:
import random
import time
import os

import gym 
import retro

import numpy as np
from tensorflow import keras 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

# Environment

In [2]:
#if env:
#    env.close()
env = retro.make(game='Airstriker-Genesis')
observations = env.observation_space.shape
actions = env.action_space
print(observations)
print(actions)
print(env.action_space.sample())

(224, 320, 3)
MultiBinary(12)
[0 0 0 1 0 1 1 0 1 1 0 1]


In [3]:
episodes = 1
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:140.0


# KerasRL's DQN

## Import

In [6]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy, BoltzmannQPolicy

## Model

In [7]:
def build_model(observations, actions):
    model = Sequential()
    model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
    model.add(Dense(24, activation='tanh'))
    model.add(Dense(48, activation='tanh'))
    model.add(Dense(actions, activation='linear'))
    return model

In [8]:
model = build_model(observations, actions)

TypeError: int() argument must be a string, a bytes-like object or a number, not 'MultiBinary'

In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 24)                120       
_________________________________________________________________
dense_1 (Dense)              (None, 48)                1200      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 98        
Total params: 1,418
Trainable params: 1,418
Non-trainable params: 0
_________________________________________________________________


## DQN

In [18]:
def build_agent(model, actions):
    memory = SequentialMemory(limit=50000, window_length=1)
    policy = BoltzmannQPolicy()
    dqn = DQNAgent(model=model, nb_actions=actions, memory=memory, nb_steps_warmup=2000,
                   target_model_update=1e-2, policy=policy)
    dqn.compile(Adam(lr=0.01, decay=0.01), metrics=['mse'])
    return dqn

In [19]:
dqn = build_agent(model, actions)

In [24]:
dqn.fit(env, nb_steps=10000, visualize=False, verbose=1)

Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 76.169 seconds


<tensorflow.python.keras.callbacks.History at 0x1b85b1d8388>

In [25]:
dqn.save_weights('./trained_models/KeraRL/model_10000')

## Test

In [26]:
dqn.load_weights('./trained_models/KeraRL/model_10000')

In [34]:
scores = dqn.test(env, nb_episodes=5, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
200.0


In [48]:
episodes = 5
for episode in range(episodes):
    state = env.reset()
    score = 0
    while True:
        action = dqn.forward(state)
        state, reward, done, info = env.step(action)
        env.render()
        score+=reward
        if done:
            break
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 200.0
Episode: 1 score: 200.0
Episode: 2 score: 200.0
Episode: 3 score: 200.0
Episode: 4 score: 200.0


# Stable baseline

## Import

In [4]:
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, SubprocVecEnv
from stable_baselines3 import A2C, PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy

## Callback

In [5]:
class SavingBestTrainingRewardCallback(BaseCallback):
    def __init__(self, check_freq:int, save_path: str, verbose=1):
        super(SavingBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
    def _init_callback(self):
        if self.save_path:
            os.makedirs(self.save_path, exist_ok=True)
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)
        return True

In [10]:
CHECKPOINT_DIR = './trained_models/AirStrike-Genesis/StableBaselines/'
LOG_DIR = './logs/AirStrike-Genesis/StableBaselines/'
callback = SavingBestTrainingRewardCallback(check_freq=500, save_path=CHECKPOINT_DIR)

## Train

In [11]:
# agent = PPO('CnnPolicy', env, verbose=0, tensorboard_log=LOG_DIR)
agent = PPO('CnnPolicy', env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [None]:
# trained_agent = PPO.load('./train/model_10000', env=env, tensorboard_log=LOG_DIR)

In [None]:
agent.learn(total_timesteps= 3000, callback= callback)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.51e+03 |
|    ep_rew_mean     | 160      |
| time/              |          |
|    fps             | 39       |
|    iterations      | 1        |
|    time_elapsed    | 51       |
|    total_timesteps | 2048     |
| train/             |          |
|    learning_rate   | 0.0003   |
---------------------------------


## Test

In [49]:
agent = PPO.load(CHECKPOINT_DIR + '/model_4000', env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [52]:
evaluate_policy(agent, env, n_eval_episodes=1, render=False)

(240.0, 0.0)

In [54]:
episodes = 1
for episode in range(episodes):
    state = env.reset()
    score = 0
    while True:
        action, states = agent.predict(state)
        state, reward, done, info = env.step(action)
        env.render()
        score+=reward
        if done:
            break
    print('Episode: {} score: {}'.format(episode, score))

Episode: 0 score: 160.0


# T

# T