
# Imports

In [1]:
import random
import time
import os

import tensorflow as tf
from tensorflow.python.client import device_lib
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Input
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from tensorflow.keras import backend as kb
from keras import optimizers
import gym

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

tf.debugging.set_log_device_placement(True)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


# Setting up env

In [2]:
env = gym.make('MountainCar-v0')
env.reset()
goal_steps = 200
score_requirement = -198
intial_games = 10000

# Playing a random game

In [19]:
def play_a_random_game_first():
    for step_index in range(goal_steps):
        #env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
    env.reset()

In [20]:
play_a_random_game_first()


Step 0:
action: 2
observation: [-0.57981927  0.00143023]
reward: -1.0
done: False
info: {}
Step 1:
action: 0
observation: [-0.57896938  0.00084989]
reward: -1.0
done: False
info: {}
Step 2:
action: 2
observation: [-0.57670612  0.00226326]
reward: -1.0
done: False
info: {}
Step 3:
action: 1
observation: [-0.57404624  0.00265988]
reward: -1.0
done: False
info: {}
Step 4:
action: 0
observation: [-0.57200945  0.00203679]
reward: -1.0
done: False
info: {}
Step 5:
action: 1
observation: [-0.56961085  0.0023986 ]
reward: -1.0
done: False
info: {}
Step 6:
action: 2
observation: [-0.56586826  0.00374259]
reward: -1.0
done: False
info: {}
Step 7:
action: 1
observation: [-0.56180949  0.00405877]
reward: -1.0
done: False
info: {}
Step 8:
action: 0
observation: [-0.55846477  0.00334472]
reward: -1.0
done: False
info: {}
Step 9:
action: 1
observation: [-0.55485903  0.00360574]
reward: -1.0
done: False
info: {}
Step 10:
action: 1
observation: [-0.55101919  0.00383985]
reward: -1.0
done: False
info: {

# Run with GPU/CPU

In [27]:
def run(device, function, repeats, **kwargs):
    """
    Run a given function on the specified device with the provided keyword arguments
    """
    with tf.device(device):
        t0 = time.time()

        # Run function with all additional keyword arguments provided
        model = function(**kwargs)

        t = time.time() - t0
    return model


# Might be different on other pc
cpu = '/device:CPU:0'
gpu = '/device:GPU:0'

local_device_protos = device_lib.list_local_devices()
print([x.name for x in local_device_protos])
#tf.device('/device:GPU:0')

['/device:CPU:0', '/device:XLA_GPU:0', '/device:XLA_CPU:0']


# Data prep

In [6]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(intial_games):
        score = 0
        game_memory = []
        previous_observation = []
        for step_index in range(goal_steps):
            action = random.randrange(0, 3)
            observation, reward, done, info = env.step(action)
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            previous_observation = observation
            if observation[0] > -0.2:
                reward = 1
            
            score += reward
            if done:
                break
            
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1, 0]
                elif data[1] == 0:
                    output = [1, 0, 0]
                elif data[1] == 2:
                    output = [0, 0, 1]
                training_data.append([data[0], output])
        
        env.reset()
    
    print(accepted_scores)
    
    return training_data

In [7]:
training_data = model_data_preparation()

[-194.0, -188.0, -176.0, -180.0, -174.0, -174.0, -192.0, -172.0, -194.0, -184.0, -184.0, -192.0, -182.0, -158.0, -182.0, -178.0, -186.0, -164.0, -184.0, -192.0, -178.0, -186.0, -192.0, -188.0, -178.0, -164.0, -188.0, -180.0, -172.0, -186.0, -186.0, -182.0, -198.0, -182.0, -180.0, -190.0, -172.0, -186.0, -164.0, -166.0, -182.0, -180.0, -176.0, -184.0, -180.0]


# Model

In [8]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(loss='mse', optimizer=optimizers.Adam())

    return model

# Train model

In [24]:
def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    
    model.fit(X, y, epochs=5)
    return model

In [31]:
trained_model = train_model(training_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
trained_model = run(device=gpu, function=train_model, repeats=1, training_data=training_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.engine.sequential.Sequential at 0x7f58b0167ef0>

In [23]:
scores = []
choices = []
for each_game in range(100):
    score = 0
    prev_obs = []
    for step_index in range(goal_steps):
        # Uncomment this line if you want to see how our bot playing
        # env.render()
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        score+=reward
        if done:
            break

    env.reset()
    scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices),choices.count(2)/len(choices)))

[-168.0, -124.0, -171.0, -119.0, -171.0, -159.0, -167.0, -125.0, -121.0, -161.0, -160.0, -168.0, -126.0, -119.0, -118.0, -118.0, -172.0, -119.0, -125.0, -160.0, -157.0, -165.0, -168.0, -162.0, -118.0, -125.0, -126.0, -119.0, -168.0, -172.0, -119.0, -168.0, -121.0, -119.0, -119.0, -119.0, -118.0, -171.0, -157.0, -165.0, -166.0, -162.0, -121.0, -168.0, -119.0, -170.0, -118.0, -161.0, -166.0, -158.0, -125.0, -173.0, -168.0, -125.0, -126.0, -118.0, -160.0, -160.0, -120.0, -159.0, -172.0, -171.0, -118.0, -157.0, -173.0, -119.0, -118.0, -123.0, -167.0, -120.0, -159.0, -164.0, -118.0, -120.0, -125.0, -126.0, -170.0, -120.0, -164.0, -120.0, -158.0, -162.0, -164.0, -125.0, -168.0, -126.0, -173.0, -124.0, -157.0, -125.0, -172.0, -117.0, -118.0, -158.0, -171.0, -125.0, -125.0, -159.0, -125.0, -170.0]
Average Score: -144.16
choice 1:0.003537735849056604  choice 0:0.43236681465038845 choice 2:0.5640954495005549
