
# Imports

In [2]:
import random
import time
import os

import tensorflow as tf
from tensorflow.python.client import device_lib
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from tensorflow.keras import backend as kb
from keras import optimizers
import gym

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

tf.debugging.set_log_device_placement(True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Setting up env

In [3]:
env = gym.make('MountainCar-v0')
env.reset()
goal_steps = 200
score_requirement = -198
intial_games = 10000

# Playing a random game

In [4]:
def play_a_random_game_first():
    for step_index in range(goal_steps):
#         env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
    env.reset()

In [5]:
play_a_random_game_first()
# Step in game
# Action 0=left, 1=stay, 2=right
# observation = [position, velocity]
# reward = -1 for each time step, until position 0.5 is reached (top)
#          No penalty for climbing left hill (top of left hill = wall)
#
# done (with the game)
# info = ? (not needed)

Step 0:
action: 0
observation: [-0.5364003  -0.00091084]
reward: -1.0
done: False
info: {}
Step 1:
action: 1
observation: [-0.53721515 -0.00081485]
reward: -1.0
done: False
info: {}
Step 2:
action: 1
observation: [-0.5379279  -0.00071276]
reward: -1.0
done: False
info: {}
Step 3:
action: 2
observation: [-5.37533226e-01  3.94678785e-04]
reward: -1.0
done: False
info: {}
Step 4:
action: 1
observation: [-5.37034069e-01  4.99156726e-04]
reward: -1.0
done: False
info: {}
Step 5:
action: 2
observation: [-0.53543417  0.00159989]
reward: -1.0
done: False
info: {}
Step 6:
action: 0
observation: [-0.53474553  0.00068864]
reward: -1.0
done: False
info: {}
Step 7:
action: 0
observation: [-5.34973308e-01 -2.27773901e-04]
reward: -1.0
done: False
info: {}
Step 8:
action: 1
observation: [-5.35115789e-01 -1.42481464e-04]
reward: -1.0
done: False
info: {}
Step 9:
action: 1
observation: [-5.35171910e-01 -5.61210462e-05]
reward: -1.0
done: False
info: {}
Step 10:
action: 1
observation: [-5.35141250e-01  

# Run with GPU/CPU

In [6]:
def run(device, function, repeats, **kwargs):
    """
    Run a given function on the specified device with the provided keyword arguments
    """
    with tf.device(device):
        t0 = time.time()

        # Run function with all additional keyword arguments provided
        results, model = function(**kwargs)

        t = time.time() - t0
    return results, model


# Might be different on other pc
cpu = '/device:CPU:0'
gpu = '/device:GPU:0'

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos])
tf.device('/device:GPU:0')

['/device:CPU:0', '/device:XLA_GPU:0', '/device:XLA_CPU:0']


<contextlib._GeneratorContextManager at 0x7f7a6e0541d0>

# Data prep

In [7]:
# We need to play multiple times so that we can collect the data which we can use further
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(intial_games):
        score = 0
        game_memory = []
        previous_observation = []
        # episode ends when you reach 0.5(top) (done) position, or if 200 iterations are reached.
        for step_index in range(goal_steps):
            # Take random action 0: left, 1: stay, 2:right
            action = random.randrange(0, 3)
            # Simulate action
            observation, reward, done, info = env.step(action)
            
            # Store previous observation and the action to get that observation 
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            # Overwrite previous observation
            previous_observation = observation
            
            # Check if position of car is near top of hill (-0.2) (top=0.5)
            # Reward = 1, instead of reward given by enviroment(gym)
            if observation[0] > -0.2:
                reward = 1
            
            # Add score
            score += reward
            if done:
                break
        
        # Only keep game if score > requirement
        if score >= score_requirement:
            accepted_scores.append(score)
            # Transform data to acceptable input for network
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1, 0]
                elif data[1] == 0:
                    output = [1, 0, 0]
                elif data[1] == 2:
                    output = [0, 0, 1]
                # Add the position of car and the output
                training_data.append([data[0], output])
        
        env.reset()
    
    print(accepted_scores)
    
    return training_data

In [8]:
# Data is created by randomly playing the game and changing rewards
# Reward = 1 if near the top (position > -0.2)
# Only games are stored which meet the score requirement
# The train data consists of the position of the car and the action (left, stay, right)
training_data = model_data_preparation()

[-180.0, -192.0, -188.0, -188.0, -178.0, -192.0, -176.0, -190.0, -192.0, -188.0, -198.0, -154.0, -164.0, -186.0, -192.0, -184.0, -178.0, -186.0, -196.0, -192.0, -192.0, -184.0, -186.0, -176.0, -188.0, -170.0, -178.0, -176.0, -182.0, -174.0, -194.0, -174.0, -178.0, -180.0, -170.0, -184.0, -178.0, -188.0, -192.0, -190.0, -162.0, -172.0, -174.0, -182.0, -192.0, -178.0, -186.0]


# Model

In [15]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(loss='mse', optimizer=optimizers.Adam())

    return model

# Train model

In [16]:
def train_model(training_data):
    # Position of car
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    # Action
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    
    model.fit(X, y, epochs=5)
    return model

In [17]:
trained_model = train_model(training_data)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Play the game with the trained model

In [19]:
scores = []
choices = []
for each_game in range(100):
    score = 0
    prev_obs = []
    for step_index in range(goal_steps):
        # Uncomment this line if you want to see how our bot playing
        # env.render()
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score+=reward
        if done:
            break

    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices),choices.count(2)/len(choices)))

[-168.0, -134.0, -152.0, -200.0, -135.0, -133.0, -135.0, -139.0, -139.0, -137.0, -137.0, -137.0, -132.0, -131.0, -166.0, -145.0, -135.0, -130.0, -167.0, -131.0, -165.0, -134.0, -162.0, -172.0, -135.0, -166.0, -156.0, -168.0, -168.0, -135.0, -131.0, -143.0, -135.0, -200.0, -200.0, -138.0, -166.0, -172.0, -142.0, -152.0, -131.0, -166.0, -132.0, -136.0, -134.0, -131.0, -133.0, -138.0, -143.0, -165.0, -163.0, -200.0, -132.0, -200.0, -131.0, -136.0, -146.0, -132.0, -134.0, -143.0, -136.0, -137.0, -139.0, -164.0, -147.0, -133.0, -168.0, -166.0, -200.0, -200.0, -132.0, -131.0, -153.0, -131.0, -130.0, -139.0, -137.0, -134.0, -144.0, -172.0, -141.0, -166.0, -146.0, -134.0, -132.0, -131.0, -135.0, -131.0, -200.0, -137.0, -134.0, -134.0, -200.0, -134.0, -143.0, -200.0, -140.0, -173.0, -137.0, -176.0]
Average Score: -149.66
choice 1:0.17379393291460643  choice 0:0.17546438594146732 choice 2:0.6507416811439263
