In [None]:
import time
import random
import itertools

import numpy as np
import gym

import tensorflow as tf
import tensorflow.keras.layers as kl


In [None]:
class RandomModel():

    def __init__( self, env ):

        self.env = env

    def action( self, observation ):

        return self.env.action_space.sample()


In [None]:
class DeterministicModel():

    def __init__( self, env ):

        self.env = env

    def action( self, observation ):

        # default to left
        action = 0

        # if angle is over zero degrees (right) move right
        if observation[ 2 ] > 0:
            action = 1

        return action


In [None]:

env = gym.make( "CartPole-v1" )

model = RandomModel( env )
#model = DeterministicModel( env )

best_q = 0
batches = 10
render = False
observation = env.reset()
for i in range(batches):

    q = 0
    for _ in range(100):

        action = model.action( observation ) 
        observation, reward, done, info = env.step( action )
        
        q = q + reward

        if render:
            env.render()
            time.sleep( 0.1 )

        if done:
            break

    if q > best_q:
        best_q = q

    observation = env.reset()

print( f'** BEST Q, {best_q}' )

env.close()


In [None]:
class MachineLearningModel( tf.keras.Model ):

    def __init__( self, env ):
    
        super( MachineLearningModel, self ).__init__()

        self.env = env

        self.layer_input = kl.InputLayer( input_shape = ( 1,4 ), name='input' )
        self.layer_mask = kl.Masking( mask_value = 0.0, name='mask' )
        self.layer_in = kl.Dense( 128, activation='relu', name='in' )
        self.layer_hidden1 = kl.Dense( 64, activation='relu', name='hidden1' )
        self.layer_hidden2 = kl.Dense( 32, activation='relu', name='hidden2' )
        self.layer_logit = kl.Dense( 2, activation='softmax', name='logit' )

    def call( self, x ):

        y = self.layer_input( x )
        y = self.layer_mask( y )
        y = self.layer_in( y )
        y = self.layer_hidden1( y )
        y = self.layer_hidden2( y )
        y = self.layer_logit( y )

        return y


In [None]:

env = gym.make( "CartPole-v1" )
model = MachineLearningModel( env )
model.compile(
    optimizer=tf.keras.optimizers.Adam( learning_rate=1e-5 ),
    loss=tf.keras.losses.CategoricalCrossentropy( )
)


In [81]:

render = True
best_q = 0

print()
print( '** *************************' )
print( '** SAMPLING,...' )

batches_x = []
batches_q = []

samples = 32
q_avg = 0
observation = env.reset()

while len( batches_x ) < samples:
    total_q = 0
    done = False
    batch_x = []
    batch_q = []

    action = None
    while not done:
        action = tf.convert_to_tensor( [ (random.random() % 2) ])
        q = model.predict( action )
        action = tf.argmax( action ).numpy()

        observation, reward, done, info = env.step( action )

        total_q = total_q + reward
        batch_x.append( action )
        batch_q.append( reward )

        if render:
            env.render()
            time.sleep( 0.1 )

    q_avg = ( q_avg * len( batches_x )  + total_q ) / ( len(batches_x ) + 1 )
    if total_q >= best_q:
        
        best_q = total_q

        batches_x.append( batch_x )
        batches_q.append( batch_q )
        print( f'** added sample, q {total_q}, q_avg, {q_avg}' )

    observation = env.reset()


env.close()





** *************************
** SAMPLING,...
** added sample, q 8.0, q_avg, 8.0
** added sample, q 10.0, q_avg, 9.0
** added sample, q 10.0, q_avg, 9.333333333333334
** added sample, q 10.0, q_avg, 9.5
** added sample, q 10.0, q_avg, 9.4048
** added sample, q 11.0, q_avg, 9.614444444444445
** added sample, q 11.0, q_avg, 9.545417024318704
** added sample, q 11.0, q_avg, 9.494873173239267
** added sample, q 11.0, q_avg, 9.27003899218355
** added sample, q 11.0, q_avg, 9.631068792045017
** added sample, q 11.0, q_avg, 9.543184369242868
** added sample, q 11.0, q_avg, 9.699481310266576
** added sample, q 11.0, q_avg, 9.61907673567525
** added sample, q 11.0, q_avg, 9.469023638561936
** added sample, q 11.0, q_avg, 9.282231214117088
** added sample, q 11.0, q_avg, 9.27123783487062
** added sample, q 11.0, q_avg, 9.451837881877779
** added sample, q 11.0, q_avg, 9.321174575073623
** added sample, q 11.0, q_avg, 9.468695520906232
** added sample, q 11.0, q_avg, 9.329661217204315
** added sa

In [82]:
model.fit( 
    x= batches_x,
    y= batches_q,
    epochs = 128,
    batch_size = 32,
    use_multiprocessing = True,
    verbose = 0
)


ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'numpy.int64\'>"})'}), (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'float\'>"})'})

In [None]:
print( batches_q )

tf.convert_to_tensor