# Cart-Pole Problem

https://github.com/openai/gym/wiki/Leaderboard#cartpole-v0

https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

### Details

See details here:

https://github.com/openai/gym/wiki/CartPole-v0

### Deep Q Neural Network

https://medium.com/dabbler-in-de-stress/the-inverted-pendulum-problem-with-deep-reinforcement-learning-9f149b68c018

In [5]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow import keras
from statistics import median, mean
from collections import Counter

In [91]:
LR = 1e-3
env = gym.make("CartPole-v0")
env.reset()
goal_steps = 500
score_requirement = 100
initial_games = 100000

In [92]:
def some_random_games_first():
    # Each of these is its own game.
    for episode in range(5):
        env.reset()
        # this is each frame, up to 200...but we wont make it that far.
        for t in range(200):
            # This will display the environment
            # Only display if you really want to see it.
            # Takes much longer to display it.
            env.render()
            
            # This will just create a sample action in any environment.
            # In this environment, the action can be 0 or 1, which is left or right
            action = env.action_space.sample()
            
            # this executes the environment with an action, 
            # and returns the observation of the environment, 
            # the reward, if the env is over, and other info.
            observation, reward, done, info = env.step(action)
            if done:
                break
    env.close()
                
some_random_games_first()

## Create Traininig Data

In [93]:
def initial_population():
    # [OBS, MOVES]
    training_data = []
    # all scores:
    scores = []
    # just the scores that met our threshold:
    accepted_scores = []
    # iterate through however many games we want:
    for _ in range(initial_games):
        score = 0
        # moves specifically from this environment:
        game_memory = []
        # previous observation that we saw
        prev_observation = []
        # for each frame in 500
        for _ in range(goal_steps):
            # choose random action (0 or 1)
            action = random.randrange(0,2)
            # do it!
            observation, reward, done, info = env.step(action)
            
            # notice that the observation is returned FROM the action
            # so we'll store the previous observation here, pairing
            # the prev observation to the action we'll take.
            if len(prev_observation) > 0 :
                game_memory.append([prev_observation, action])
            prev_observation = observation
            score+=reward
            if done: 
                break
                
        # reset env to play again
        env.reset()
        # save overall scores
        scores.append(score)
        
        # IF our score is higher than our threshold, we'd like to save
        # every move we made
        # NOTE the reinforcement methodology here. 
        # all we're doing is reinforcing the score, we're not trying 
        # to influence the machine in any way as to HOW that score is 
        # reached.
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory:
                # convert to one-hot (this is the output layer for our neural network)
                if data[1] == 1:
                    output = [0,1]
                elif data[1] == 0:
                    output = [1,0]
                    
                # saving our training data
                training_data.append([data[0], output])
    
    # just in case you wanted to reference later
    training_data_save = np.array(training_data)
    np.save('saved.npy',training_data_save)
    
    # some stats here, to further illustrate the neural network magic!
    print('Average accepted score:',mean(accepted_scores))
    print('Median score for accepted scores:',median(accepted_scores))
    print(Counter(accepted_scores))
    
    return training_data

In [94]:
def neural_network_model(input_size):

    model = keras.Sequential([
        keras.layers.Flatten(input_shape=(input_size, 1)),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(512, activation='relu'),
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam',
#                 learning_rate=LR, 
                loss='categorical_crossentropy', 
                metrics=['accuracy']) 
                #target_tensors='targets')

    return model

In [95]:
def train_model(training_data, n_epochs, model=False):

    X = np.array([i[0] for i in training_data]).reshape(-1,len(training_data[0][0]),1) # training data
    y = np.array([i[1] for i in training_data]) # training lables
    
    if not model:
        model = neural_network_model(input_size = len(X[0]))
    
    model.fit(X, y, epochs=n_epochs)
    return model

# model.fit(train_images, train_labels, epochs=10)

In [96]:
training_data = initial_population()

Average accepted score: 111.3409090909091
Median score for accepted scores: 108.0
Counter({103.0: 5, 108.0: 4, 109.0: 3, 102.0: 3, 100.0: 3, 105.0: 3, 101.0: 3, 113.0: 3, 111.0: 2, 125.0: 1, 142.0: 1, 117.0: 1, 132.0: 1, 104.0: 1, 114.0: 1, 110.0: 1, 119.0: 1, 123.0: 1, 112.0: 1, 165.0: 1, 106.0: 1, 137.0: 1, 107.0: 1, 127.0: 1})


In [98]:
model = train_model(training_data, n_epochs = 50) # train the model

Train on 4855 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [111]:
scores = []
choices = []
for each_game in range(10):
    score = 0
    game_memory = []
    prev_obs = env.reset()
    for _ in range(goal_steps):
#         env.render()

        if len(prev_obs)==0:
            action = random.randrange(0,2)
        else:
            action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs),1))[0])

        choices.append(action)
                
        new_observation, reward, done, info = env.step(action)
        prev_obs = new_observation
        game_memory.append([new_observation, action])
        score+=reward
        if done: 
            print("DONE. Score = ", score)
            break
    env.close()
    
    scores.append(score)

print('Average Score:',sum(scores)/len(scores))
# print('choice 1:{}  choice 0:{}'.format(choices.count(1)/len(choices), choices.count(0)/len(choices)))
# print(score_requirement)

DONE. Score =  200.0
DONE. Score =  200.0
DONE. Score =  196.0
DONE. Score =  200.0
DONE. Score =  200.0
DONE. Score =  200.0
DONE. Score =  170.0
DONE. Score =  200.0
DONE. Score =  185.0
DONE. Score =  200.0
Average Score: 195.1


In [112]:
scores = []
choices = []

score = 0
game_memory = []
prev_obs = env.reset()


In [113]:
if len(prev_obs)==0:
    action = random.randrange(0,2)
else:
    action = np.argmax(model.predict(prev_obs.reshape(-1,len(prev_obs),1))[0])

choices.append(action)

new_observation, reward, done, info = env.step(action)
prev_obs = new_observation
# game_memory.append([new_observation, action])
score+=reward
# if done: 
#     print("DONE. Score = ", score)
#     break
# env.close()

# scores.append(score)

In [126]:
model.predict(prev_obs.reshape(-1,len(prev_obs),1))[0]

array([0.4956295, 0.5043705], dtype=float32)