In [1]:
import gym

In [2]:
# Create the Gym environment registered under the id 'Pitfall-v0'.
# Every later cell (random play, model sizing, training, testing) uses `env`.
env = gym.make('Pitfall-v0')

In [None]:
# Smoke-test the environment: play a few episodes with uniformly random
# actions, rendering every frame, and print the total reward of each episode.
episodes = 10

# range(1, episodes + 1) plays exactly `episodes` episodes, numbered
# 1..episodes; range(1, episodes) stopped one episode short.
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        # step() returns the 4-tuple (observation, reward, done, info).
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward
    print(f'Episode: {episode}\nScore: {score}')

In [3]:
# Import the neural-network building blocks from Keras
# TODO: rebuild the model in PyTorch
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam

In [4]:
def build_model(height, width, channels, actions, window_length=3):
    """Build the convolutional network used as the agent's Q-function.

    The network is three strided ReLU convolutions, a flatten, three ReLU
    dense layers (512, 256, 64 units) and a linear output layer with one
    unit per action.

    Args:
        height: Height of a single observation frame.
        width: Width of a single observation frame.
        channels: Number of channels of a single observation frame.
        actions: Number of units in the final linear layer.
        window_length: Number of frames stacked into one input sample; this
            is the leading dimension of ``input_shape``. It should equal the
            ``window_length`` of the agent's replay memory (3 in
            ``build_agent``). Defaults to 3, the value that was previously
            hard-coded here.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # Input is a stack of `window_length` frames, each (height, width, channels).
    model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu',
                     input_shape=(window_length, height, width, channels)))
    model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
    model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(64, activation='relu'))
    # Linear activation: one unbounded output per action.
    model.add(Dense(actions, activation='linear'))
    return model

In [5]:
# Unpack the three entries of the observation shape; they are passed to
# build_model as (height, width, channels).
# NOTE(review): assumes the environment reports observations in
# (height, width, channels) order -- confirm for this env.
height, width, channels = env.observation_space.shape
# Size of the discrete action space; used as the model's output width and as
# the agent's nb_actions.
actions = env.action_space.n

In [10]:
# Discard any model left over from an earlier run before it is rebuilt in the
# next cell. On a fresh kernel `model` does not exist yet, so the NameError
# that a bare `del model` would raise is deliberately ignored; this keeps
# "Restart Kernel -> Run All" working.
try:
    del model
except NameError:
    pass

In [11]:
# Build the Q-network from the frame dimensions and action count read from
# the environment above.
model = build_model(height, width, channels, actions)

In [12]:
from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

In [13]:
def build_agent(model, actions):
    """Wrap ``model`` in a DQN agent with dueling enabled.

    Args:
        model: Network the agent uses; passed to ``DQNAgent`` as ``model``.
        actions: Number of actions; passed to ``DQNAgent`` as ``nb_actions``.

    Returns:
        The constructed, not yet compiled, ``DQNAgent``.
    """
    # Epsilon-greedy policy whose `eps` attribute is annealed between
    # value_max=1.0 and value_min=0.1 over nb_steps=10000; value_test=0.2 is
    # the value configured for testing.
    exploration = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps',
        value_max=1.,
        value_min=.1,
        value_test=.2,
        nb_steps=10000,
    )
    # Memory limited to 4000 entries, with a window of 3 (the same 3 that
    # build_model uses as the leading input dimension).
    replay = SequentialMemory(limit=4000, window_length=3)
    return DQNAgent(
        model=model,
        nb_actions=actions,
        memory=replay,
        policy=exploration,
        nb_steps_warmup=1000,
        enable_dueling_network=True,
        dueling_type='avg',
    )

In [14]:
# Create the DQN agent around the Q-network built above.
dqn = build_agent(model, actions)

In [15]:
# Compile the agent with an Adam optimizer. `learning_rate` is the supported
# keyword; `lr` is a deprecated alias that newer Keras releases reject.
dqn.compile(Adam(learning_rate=0.001))

In [16]:
# Train the agent on `env` for 40000 steps with rendering disabled and
# progress logging on (verbose=1). The call is the cell's last expression,
# so its return value (a Keras History object) is displayed as the output.
dqn.fit(env, nb_steps=40000, visualize=False, verbose=1)

Training for 40000 steps ...
Interval 1 (0 steps performed)
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
5 episodes - episode_reward: -212.200 [-1061.000, 0.000] - loss: 8.425 - mean_q: 12.023 - mean_eps: 0.505 - ale.lives: 2.687

Interval 2 (10000 steps performed)
7 episodes - episode_reward: -190.143 [-909.000, 0.000] - loss: 4.984 - mean_q: 11.959 - mean_eps: 0.100 - ale.lives: 2.552

Interval 3 (20000 steps performed)
   11/10000 [..............................] - ETA: 1:57:48 - reward: 0.0000e+00done, took 21049.154 seconds


<tensorflow.python.keras.callbacks.History at 0x1c8f1c760f0>

In [20]:
# Run 10 rendered test episodes with the trained agent, then report the mean
# of the per-episode rewards recorded in the returned history.
scores = dqn.test(env, nb_episodes=10, visualize=True)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

Testing for 10 episodes ...


KeyboardInterrupt: 

In [19]:
# Persist the trained weights. overwrite=True replaces an existing file
# instead of stopping at an interactive "overwrite?" prompt when this cell is
# re-run.
# NOTE(review): assumes the 'models/' directory already exists -- confirm.
dqn.save_weights('models/dqn.h5f', overwrite=True)

In [None]:
# Restore weights from the path written by the save_weights cell. `dqn` must
# already have been built (and wrap a model of the same architecture).
dqn.load_weights('models/dqn.h5f')