In [1]:
import gym
import random
import numpy as np
from keras.models     import Sequential
from keras.layers     import Dense
from keras.optimizers import Adam

In [2]:
env = gym.make('MountainCar-v0')
env.reset()
goal_steps = 200
score_requirement = -198
intial_games = 10000

In [3]:
def play_a_random_game_first():
    for step_index in range(goal_steps):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        print("Step {}:".format(step_index))
        print("action: {}".format(action))
        print("observation: {}".format(observation))
        print("reward: {}".format(reward))
        print("done: {}".format(done))
        print("info: {}".format(info))
        if done:
            break
    env.reset()

In [4]:
play_a_random_game_first()

Step 0:
action: 0
observation: [-0.42020121 -0.00177578]
reward: -1.0
done: False
info: {}
Step 1:
action: 2
observation: [-0.42174009 -0.00153888]
reward: -1.0
done: False
info: {}
Step 2:
action: 2
observation: [-0.42303108 -0.00129099]
reward: -1.0
done: False
info: {}
Step 3:
action: 2
observation: [-0.42406493 -0.00103386]
reward: -1.0
done: False
info: {}
Step 4:
action: 2
observation: [-0.42483425 -0.00076932]
reward: -1.0
done: False
info: {}
Step 5:
action: 0
observation: [-0.42733351 -0.00249926]
reward: -1.0
done: False
info: {}
Step 6:
action: 0
observation: [-0.43154476 -0.00421125]
reward: -1.0
done: False
info: {}
Step 7:
action: 1
observation: [-0.43643768 -0.00489292]
reward: -1.0
done: False
info: {}
Step 8:
action: 0
observation: [-0.44297688 -0.0065392 ]
reward: -1.0
done: False
info: {}
Step 9:
action: 1
observation: [-0.45011487 -0.00713799]
reward: -1.0
done: False
info: {}
Step 10:
action: 0
observation: [-0.45879953 -0.00868466]
reward: -1.0
done: False
info: {

action: 1
observation: [-0.55998774  0.00258164]
reward: -1.0
done: False
info: {}
Step 107:
action: 1
observation: [-0.55713372  0.00285402]
reward: -1.0
done: False
info: {}
Step 108:
action: 1
observation: [-0.55402862  0.0031051 ]
reward: -1.0
done: False
info: {}
Step 109:
action: 1
observation: [-0.55069561  0.00333301]
reward: -1.0
done: False
info: {}
Step 110:
action: 1
observation: [-0.54715959  0.00353601]
reward: -1.0
done: False
info: {}
Step 111:
action: 2
observation: [-0.54244702  0.00471257]
reward: -1.0
done: False
info: {}
Step 112:
action: 2
observation: [-0.53659316  0.00585386]
reward: -1.0
done: False
info: {}
Step 113:
action: 0
observation: [-0.53164187  0.00495129]
reward: -1.0
done: False
info: {}
Step 114:
action: 0
observation: [-0.52763026  0.00401161]
reward: -1.0
done: False
info: {}
Step 115:
action: 2
observation: [-0.52258841  0.00504185]
reward: -1.0
done: False
info: {}
Step 116:
action: 0
observation: [-0.51855414  0.00403427]
reward: -1.0
done: Fa

In [5]:
def model_data_preparation():
    training_data = []
    accepted_scores = []
    for game_index in range(intial_games):
        score = 0
        game_memory = []
        previous_observation = []
        for step_index in range(goal_steps):
            action = random.randrange(0, 3)
            observation, reward, done, info = env.step(action)
            
            if len(previous_observation) > 0:
                game_memory.append([previous_observation, action])
                
            previous_observation = observation
            if observation[0] > -0.2:
                reward = 1
            
            score += reward
            if done:
                break
            
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                if data[1] == 1:
                    output = [0, 1, 0]
                elif data[1] == 0:
                    output = [1, 0, 0]
                elif data[1] == 2:
                    output = [0, 0, 1]
                training_data.append([data[0], output])
        
        env.reset()
    
    print(accepted_scores)
    
    return training_data

In [6]:
training_data=model_data_preparation()

[-192.0, -196.0, -192.0, -188.0, -168.0, -184.0, -170.0, -178.0, -184.0, -184.0, -186.0, -188.0, -176.0, -170.0, -176.0, -194.0, -188.0, -180.0, -186.0, -174.0, -188.0, -190.0, -176.0, -186.0, -194.0, -180.0, -194.0, -186.0, -178.0, -172.0, -190.0, -146.0, -194.0, -182.0, -184.0, -176.0, -176.0, -178.0, -184.0, -168.0, -164.0, -190.0, -184.0]


In [7]:
def build_model(input_size, output_size):
    model = Sequential()
    model.add(Dense(128, input_dim=input_size, activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())
    return model


In [8]:
def train_model(training_data):
    X = np.array([i[0] for i in training_data]).reshape(-1, len(training_data[0][0]))
    y = np.array([i[1] for i in training_data]).reshape(-1, len(training_data[0][1]))
    model = build_model(input_size=len(X[0]), output_size=len(y[0]))
    
    model.fit(X, y, epochs=5)
    return model

In [9]:
trained_model = train_model(training_data)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [11]:
scores = []
choices = []
for each_game in range(100):
    score = 0
    game_memory = []
    prev_obs = []
    for step_index in range(goal_steps):
        env.render()
        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(trained_model.predict(prev_obs.reshape(-1, len(prev_obs)))[0])
        
        choices.append(action)
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score += reward
        if done:
            break

env.reset()
scores.append(score)

print(scores)
print('Average Score:',sum(scores)/len(scores))
print('choice 1:{}  choice 0:{} choice 2:{}'.format(choices.count(1)/len(choices),choices.count(0)/len(choices),choices.count(2)/len(choices)))

[-1.0]
Average Score: -1.0
choice 1:0.15384615384615385  choice 0:0.1806020066889632 choice 2:0.6655518394648829
