In [None]:
import random
import sys

In [None]:
import gym
import numpy as np
from keras.layers import Dense, Flatten
from keras.models import Sequential

In [None]:
def build_model(with_conv=False):
    """Assemble and compile the Q-value network for the module-level ``env``.

    The network takes two stacked observations (input shape
    ``(2,) + env.observation_space.shape``), applies a 20-unit dense layer,
    flattens, passes through 128/64/16-unit ReLU layers and ends in a linear
    layer with one output per action (``env.action_space.n``).

    Args:
        with_conv: Accepted for interface compatibility; not used by this
            implementation.

    Returns:
        The Sequential model, compiled with the Adam optimizer and MSE loss.
    """
    # Settings shared by every hidden layer.
    relu_dense = dict(kernel_initializer='uniform', activation='relu')

    net = Sequential()
    frames_shape = (2,) + env.observation_space.shape
    net.add(Dense(20, input_shape=frames_shape, **relu_dense))
    net.add(Flatten())
    for width in (128, 64, 16):
        net.add(Dense(width, **relu_dense))
    # Linear head: one unbounded output per discrete action.
    net.add(Dense(env.action_space.n, kernel_initializer='uniform', activation='linear'))
    net.compile(optimizer='adam', loss='mse')
    return net

In [None]:
def observe_and_learn(model):
    """Gather experience with an epsilon-greedy policy, then fit ``model`` once.

    Reads these module-level names: ``env`` (the gym environment), ``D`` (the
    replay memory list, appended to in place), ``observetime`` (number of
    environment steps to take), ``epsilon`` (probability of a random action),
    ``mb_size`` (minibatch size), ``gamma`` (discount factor) and ``episode``
    (only used in the progress line).

    Args:
        model: Object exposing ``predict(batch)`` and
            ``train_on_batch(inputs, targets)``.

    Returns:
        None. A progress line with the training loss is written to stdout.
    """

    def reset_state():
        # reset() returns (observation, info) in newer gym releases and the
        # bare observation in older ones; accept either form.
        reset_out = env.reset()
        first = reset_out[0] if isinstance(reset_out, tuple) else reset_out
        frame = np.expand_dims(first, axis=0)
        # A state is two stacked frames; at episode start both are the first frame.
        return np.stack((frame, frame), axis=1)

    state = reset_state()

    # Observation phase: act epsilon-greedily and store every transition.
    for _ in range(observetime):
        if np.random.rand() <= epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(model.predict(state))

        result = env.step(action)
        observation_new, reward = result[0], result[1]
        terminated = bool(result[2])
        # A 5-tuple step result carries a separate time-limit flag in slot 3;
        # in a 4-tuple result slot 3 is the info dict, so there is no such flag.
        truncated = bool(result[3]) if len(result) >= 5 else False

        obs_new = np.expand_dims(observation_new, axis=0)
        # Newest frame goes first, the previous newest frame becomes the second.
        state_new = np.append(np.expand_dims(obs_new, axis=0), state[:, :1, :], axis=1)
        # Only true termination is stored: a truncated episode did not reach a
        # terminal state, so its target should still bootstrap from state_new.
        D.append((state, action, reward, state_new, terminated))
        state = state_new

        if terminated or truncated:
            # Start the next episode from the observation reset() just produced.
            state = reset_state()

    # Learning phase: one Q-learning update on a random minibatch of the memory.
    batch_size = min(mb_size, len(D))
    if batch_size == 0:
        return
    minibatch = random.sample(D, batch_size)

    inputs = np.concatenate([transition[0] for transition in minibatch], axis=0)
    next_inputs = np.concatenate([transition[3] for transition in minibatch], axis=0)

    # Start from the current predictions so that only the taken action's
    # output contributes to the loss.
    targets = np.array(model.predict(inputs), dtype=float)
    next_q = model.predict(next_inputs)

    for i, (_, action, reward, _, terminal) in enumerate(minibatch):
        if terminal:
            targets[i, action] = reward
        else:
            targets[i, action] = reward + gamma * np.max(next_q[i])

    # Train once, after every row of the batch has been filled in.
    loss = model.train_on_batch(inputs, targets)
    sys.stdout.write("\rEpisode = %s,Loss = %.5f" % (str(episode), loss))

In [None]:
def play(model):
    """Run one greedy episode in the module-level ``env``, rendering each step.

    The action at every step is the argmax of ``model.predict(state)``, where
    the state is the two most recent observations stacked together.

    Args:
        model: Object exposing ``predict(state)`` that returns one value per action.

    Returns:
        The total reward collected over the episode (also printed).
    """
    # reset() returns (observation, info) in newer gym releases and the bare
    # observation in older ones; accept either form.
    reset_out = env.reset()
    observation = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    obs = np.expand_dims(observation, axis=0)
    # Both frames of the initial state are the first observation.
    state = np.stack((obs, obs), axis=1)

    done = False
    tot_reward = 0.0
    while not done:
        env.render()
        action = np.argmax(model.predict(state))
        result = env.step(action)
        observation, reward = result[0], result[1]
        # A 5-tuple step result reports termination and time-limit truncation
        # separately; either one ends the episode. In a 4-tuple result slot 2
        # is the single done flag.
        done = bool(result[2]) or (len(result) >= 5 and bool(result[3]))
        obs = np.expand_dims(observation, axis=0)
        # Newest frame first, previous newest frame second.
        state = np.append(np.expand_dims(obs, axis=0), state[:, :1, :], axis=1)
        tot_reward += reward
    print('Game ended! Total reward: {}'.format(tot_reward))
    return tot_reward

In [None]:
if __name__ == '__main__':
    # These names are module-level globals read by build_model,
    # observe_and_learn and play.
    episode = 0
    env = gym.make('CartPole-v1')

    D = []             # replay memory filled by observe_and_learn
    observetime = 500  # environment steps collected per observe_and_learn call
    epsilon = 0.95     # probability of taking a random action
    gamma = 0.9        # discount factor for future rewards
    mb_size = 50       # transitions sampled per training step
    model = build_model()

    # Normalise the input so that 'train', 'Train' and ' TRAIN ' all match.
    mode = input("Input mode: ").strip().lower()
    if mode == 'train':
        for i in range(10):
            for j in range(20):
                observe_and_learn(model)
                episode += 1
                print('\n')
                # Each round learns only from freshly collected transitions.
                D = []
            # Explore less after every block of 20 training rounds.
            epsilon *= 0.9
            print('\n')
            play(model)
        model_json = model.to_json()
        with open("modeldetails.json", "w") as json_file:
            json_file.write(model_json)
        # NOTE(review): some Keras releases require weight files to end in
        # '.weights.h5' — confirm against the installed version.
        model.save_weights("modeldetails.h5")
        print("Saved the model")
    elif mode == 'playgame':
        # The model built above is still untrained here; load the saved weights into it.
        model.load_weights('modeldetails.h5')
        for i in range(20):
            play(model)
    else:
        print("Unknown mode {!r}; expected 'train' or 'playgame'".format(mode))