In [3]:
# https://gist.github.com/EderSantana/c7222daa328f0e885093 -- much of the code and inspiration comes from here; I'll iterate from this starting point

# Normal, choose max reward action
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.advanced_activations import PReLU
import numpy as np
import gym

# Probability of taking a random action while exploration is active.
EPSILON = 0.2
# Number of discrete actions; also the width of the network's output.
NUM_ACTIONS = 2
# Length of a single state vector fed to the network.
STATE_SPACE = 4
# Maximum number of transitions kept in the replay memory.
MAX_MEMORY = 160
# Number of episodes run by the training loop.
EPOCH = 1000 
# Number of transitions requested for each training batch.
BATCH_SIZE=8


def weighted_choice(weights):
    """Draw a random index, with index ``i`` favoured in proportion to ``weights[i]``.

    A single uniform sample is scaled by the sum of the weights and then
    located in the running total of the weights.

    Args:
        weights: Non-empty sequence of non-negative numbers.

    Returns:
        The position returned by ``np.searchsorted`` for the scaled sample.
    """
    cumulative = np.cumsum(weights)
    # The last running total is the overall sum, i.e. the sampling range.
    threshold = np.random.rand() * cumulative[-1]
    return np.searchsorted(cumulative, threshold)


class ExperienceReplay(object):
    """Bounded FIFO store of transitions that builds Q-learning batches.

    Every entry of ``memory`` has the form
    ``[[state_t, action_t, reward_t, state_tp1], game_over]``, exactly as
    passed to :meth:`remember`.
    """

    def __init__(self, discount=.9, max_memory=None):
        """Create an empty replay memory.

        Args:
            discount: Factor applied to the bootstrapped next-state value.
            max_memory: Maximum number of stored transitions. ``None`` (the
                default) falls back to the module-level ``MAX_MEMORY``.
        """
        self.memory = []
        self.discount = discount
        # Resolved lazily so an explicit capacity never touches the global.
        self.max_memory = MAX_MEMORY if max_memory is None else max_memory

    def remember(self, states, game_over):
        """Append one transition and evict the oldest one when over capacity.

        Args:
            states: ``[state_t, action_t, reward_t, state_tp1]``.
            game_over: Truthy when ``state_tp1`` ended the episode.
        """
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, model, batch_size=10):
        """Sample transitions and build ``(inputs, targets)`` for training.

        Transitions are drawn uniformly *with replacement*. For each sample
        the target row is the model's current prediction for ``state_t``,
        except for the entry of the action actually taken, which becomes
        ``reward_t`` for terminal transitions and
        ``reward_t + discount * max(Q(state_tp1))`` otherwise.

        All states of the batch are evaluated with two ``model.predict``
        calls in total instead of two calls per sample.

        Args:
            model: Object whose ``predict`` maps an array of shape
                ``(n, state_dim)`` to an array of shape ``(n, num_actions)``.
            batch_size: Upper bound on the number of sampled transitions.

        Returns:
            Tuple ``(inputs, targets)`` of float arrays with shapes
            ``(n, state_dim)`` and ``(n, num_actions)``, where
            ``n = min(len(self.memory), batch_size)``.
        """
        n_samples = min(len(self.memory), batch_size)
        if n_samples == 0:
            # Nothing to sample: hand back empty, correctly shaped arrays.
            return np.zeros((0, STATE_SPACE)), np.zeros((0, NUM_ACTIONS))

        picks = np.random.randint(0, len(self.memory), size=n_samples)
        sampled = [self.memory[idx] for idx in picks]

        inputs = np.array([entry[0][0] for entry in sampled], dtype=float)
        actions = np.array([entry[0][1] for entry in sampled], dtype=int)
        rewards = np.array([entry[0][2] for entry in sampled], dtype=float)
        next_states = np.array([entry[0][3] for entry in sampled], dtype=float)
        terminal = np.array([bool(entry[1]) for entry in sampled])

        # Copy into float64 so the predictions can be edited in place.
        targets = np.array(model.predict(inputs), dtype=float)
        best_next = np.max(model.predict(next_states), axis=1)

        # Terminal transitions have no future value to bootstrap from.
        updated = np.where(terminal, rewards,
                           rewards + self.discount * best_next)
        targets[np.arange(n_samples), actions] = updated
        return inputs, targets


def ker_model(input_size, output_size):
    """Build, compile and print a summary of the Q-value network.

    The network is two 256-unit Dense layers, each followed by a PReLU
    activation, and a final Dense layer with ``output_size`` units. It is
    compiled with the Adam optimizer and a mean-squared-error loss.

    Args:
        input_size: Length of the input vector.
        output_size: Number of output units.

    Returns:
        The compiled ``Sequential`` model.
    """
    stack = (
        Dense(256, input_shape=(input_size,)),
        PReLU(),
        Dense(256),
        PReLU(),
        Dense(output_size),
    )
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(loss='mse', optimizer='adam')
    net.summary()
    return net

# Train the Q-network on CartPole: play EPOCH episodes, storing every
# transition in the replay memory and fitting one sampled batch per step.
env = gym.make('CartPole-v0')
exp_replay = ExperienceReplay()
model = ker_model(STATE_SPACE, NUM_ACTIONS)
# NOTE(review): render() is called once, before the first reset(); confirm
# this is intended rather than rendering inside the episode loop.
env.render()

# Epsilon-greedy exploration is only active for the first episodes; after
# that every action is chosen greedily.
EXPLORATION_EPOCHS = 100

for e in range(EPOCH):
    win_cnt = 0  # sum of the rewards collected during this episode
    loss = 0.    # sum of the per-step training losses of this episode
    input_t = env.reset()
    game_over = False

    while not game_over:
        input_tm1 = input_t

        # Choose the next action. The random draw only happens while
        # exploration is active (short-circuit), as before.
        if e < EXPLORATION_EPOCHS and np.random.rand() <= EPSILON:
            action = np.random.randint(0, NUM_ACTIONS, size=1)[0]
        else:
            q = model.predict(np.reshape(input_tm1, (1, input_tm1.shape[0])))
            action = np.argmax(q[0])

        # Apply the action (passed to the environment unchanged) and
        # observe the reward and the new state.
        input_t, reward, game_over, info = env.step(action)
        win_cnt += reward

        # Store the experience.
        exp_replay.remember([input_tm1, action, reward, input_t], game_over)

        # Adapt the model on a batch sampled from the replay memory.
        inputs, targets = exp_replay.get_batch(model, batch_size=BATCH_SIZE)
        loss += model.train_on_batch(inputs, targets)

    # The last epoch index is derived from EPOCH instead of a literal 999.
    print("Epoch {:03d}/{:03d} | Loss {:.4f} | Win count {}".format(
        e, EPOCH - 1, loss, win_cnt))

[2017-08-06 20:51:06,796] Making new env: CartPole-v0


  (sample.dtype, var.uid, str(var.dtype)))
  (sample.dtype, var.uid, str(var.dtype)))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 256)               1280      
_________________________________________________________________
p_re_lu_7 (PReLU)            (None, 256)               256       
_________________________________________________________________
dense_10 (Dense)             (None, 256)               65792     
_________________________________________________________________
p_re_lu_8 (PReLU)            (None, 256)               256       
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 514       
Total params: 68,098
Trainable params: 68,098
Non-trainable params: 0
_________________________________________________________________


Epoch 000/999 | Loss 9.6624 | Win count 12.0


Epoch 001/999 | Loss 17.7941 | Win count 11.0


Epoch 002/999 | Loss 25.5614 | Win count 10.0
Epoch 003/999 | Loss 35.7214 | Win count 10.0


Epoch 004/999 | Loss 24.8250 | Win count 12.0
Epoch 005/999 | Loss 17.8595 | Win count 8.0


Epoch 006/999 | Loss 47.3830 | Win count 11.0


Epoch 007/999 | Loss 30.1727 | Win count 15.0


Epoch 008/999 | Loss 20.1777 | Win count 11.0


Epoch 009/999 | Loss 25.9562 | Win count 10.0


Epoch 010/999 | Loss 14.6086 | Win count 9.0


Epoch 011/999 | Loss 15.4219 | Win count 9.0


Epoch 012/999 | Loss 30.2700 | Win count 11.0


Epoch 013/999 | Loss 18.7260 | Win count 15.0


Epoch 014/999 | Loss 7.4709 | Win count 11.0


Epoch 015/999 | Loss 13.1727 | Win count 14.0


Epoch 016/999 | Loss 12.6337 | Win count 23.0


Epoch 017/999 | Loss 3.8427 | Win count 16.0


Epoch 018/999 | Loss 3.1889 | Win count 11.0


Epoch 019/999 | Loss 18.2188 | Win count 11.0


Epoch 020/999 | Loss 19.2544 | Win count 12.0


Epoch 021/999 | Loss 21.6503 | Win count 12.0


Epoch 022/999 | Loss 14.5456 | Win count 15.0


Epoch 023/999 | Loss 9.0977 | Win count 12.0


Epoch 024/999 | Loss 33.7049 | Win count 24.0


Epoch 025/999 | Loss 22.0652 | Win count 17.0


Epoch 026/999 | Loss 106.9227 | Win count 87.0


Epoch 027/999 | Loss 28.0702 | Win count 66.0


Epoch 028/999 | Loss 21.7220 | Win count 71.0


Epoch 029/999 | Loss 40.0921 | Win count 14.0


Epoch 030/999 | Loss 2.7228 | Win count 9.0


Epoch 031/999 | Loss 11.3263 | Win count 15.0


Epoch 032/999 | Loss 6.3497 | Win count 12.0


Epoch 033/999 | Loss 29.1795 | Win count 12.0


Epoch 034/999 | Loss 23.7978 | Win count 12.0


Epoch 035/999 | Loss 39.5641 | Win count 24.0
Epoch 036/999 | Loss 8.0863 | Win count 9.0


Epoch 037/999 | Loss 5.9552 | Win count 9.0
Epoch 038/999 | Loss 5.0677 | Win count 8.0


Epoch 039/999 | Loss 10.6600 | Win count 10.0


Epoch 040/999 | Loss 43.0400 | Win count 59.0


Epoch 041/999 | Loss 23.7926 | Win count 25.0


Epoch 042/999 | Loss 9.9827 | Win count 10.0


Epoch 043/999 | Loss 13.7307 | Win count 13.0


Epoch 044/999 | Loss 9.6475 | Win count 22.0


Epoch 045/999 | Loss 12.2564 | Win count 22.0


Epoch 046/999 | Loss 8.3368 | Win count 10.0


Epoch 047/999 | Loss 6.3506 | Win count 26.0


Epoch 048/999 | Loss 42.8606 | Win count 68.0


Epoch 049/999 | Loss 17.7614 | Win count 69.0


Epoch 050/999 | Loss 17.9108 | Win count 67.0


Epoch 051/999 | Loss 26.5363 | Win count 120.0


Epoch 052/999 | Loss 3.9556 | Win count 103.0


Epoch 053/999 | Loss 17.2948 | Win count 40.0


Epoch 054/999 | Loss 18.2326 | Win count 45.0


Epoch 055/999 | Loss 22.7491 | Win count 30.0


Epoch 056/999 | Loss 5.2369 | Win count 16.0


Epoch 057/999 | Loss 17.4323 | Win count 43.0


Epoch 058/999 | Loss 35.8433 | Win count 148.0


Epoch 059/999 | Loss 11.7839 | Win count 132.0


Epoch 060/999 | Loss 7.6317 | Win count 113.0


Epoch 061/999 | Loss 10.1213 | Win count 110.0


Epoch 062/999 | Loss 7.3605 | Win count 200.0


Epoch 063/999 | Loss 16.5247 | Win count 200.0


Epoch 064/999 | Loss 24.6025 | Win count 143.0


Epoch 065/999 | Loss 8.6684 | Win count 186.0


Epoch 066/999 | Loss 31.1879 | Win count 139.0


Epoch 067/999 | Loss 20.5180 | Win count 160.0


Epoch 068/999 | Loss 19.2816 | Win count 163.0


Epoch 069/999 | Loss 2.8247 | Win count 21.0


Epoch 070/999 | Loss 10.4576 | Win count 83.0


Epoch 071/999 | Loss 23.1910 | Win count 91.0


Epoch 072/999 | Loss 1.0289 | Win count 15.0


Epoch 073/999 | Loss 10.8988 | Win count 21.0


Epoch 074/999 | Loss 32.7529 | Win count 99.0


Epoch 075/999 | Loss 51.2912 | Win count 163.0


Epoch 076/999 | Loss 4.6198 | Win count 21.0


Epoch 077/999 | Loss 52.5633 | Win count 73.0


Epoch 078/999 | Loss 7.4435 | Win count 14.0


Epoch 079/999 | Loss 54.0140 | Win count 41.0


Epoch 080/999 | Loss 24.4933 | Win count 23.0


Epoch 081/999 | Loss 12.0907 | Win count 24.0


Epoch 082/999 | Loss 27.8288 | Win count 69.0


Epoch 083/999 | Loss 31.0163 | Win count 128.0


Epoch 084/999 | Loss 16.6147 | Win count 159.0


Epoch 085/999 | Loss 15.4845 | Win count 151.0


Epoch 086/999 | Loss 21.0783 | Win count 156.0


Epoch 087/999 | Loss 10.6723 | Win count 159.0


Epoch 088/999 | Loss 11.1377 | Win count 129.0


Epoch 089/999 | Loss 13.3280 | Win count 140.0


Epoch 090/999 | Loss 1.7756 | Win count 20.0


Epoch 091/999 | Loss 48.1553 | Win count 162.0


Epoch 092/999 | Loss 16.5225 | Win count 200.0


Epoch 093/999 | Loss 15.2961 | Win count 200.0


Epoch 094/999 | Loss 28.8817 | Win count 54.0


Epoch 095/999 | Loss 6.1414 | Win count 15.0


Epoch 096/999 | Loss 17.2157 | Win count 30.0


Epoch 097/999 | Loss 28.1657 | Win count 53.0


Epoch 098/999 | Loss 65.7672 | Win count 145.0


Epoch 099/999 | Loss 19.0596 | Win count 20.0


Epoch 100/999 | Loss 5.0179 | Win count 8.0


Epoch 101/999 | Loss 32.7015 | Win count 151.0


Epoch 102/999 | Loss 11.6606 | Win count 200.0


Epoch 103/999 | Loss 17.6442 | Win count 200.0


Epoch 104/999 | Loss 14.5797 | Win count 200.0


Epoch 105/999 | Loss 11.7969 | Win count 200.0


Epoch 106/999 | Loss 48.5193 | Win count 200.0


Epoch 107/999 | Loss 12.5834 | Win count 200.0


Epoch 108/999 | Loss 44.1685 | Win count 174.0


Epoch 109/999 | Loss 6.9426 | Win count 176.0


Epoch 110/999 | Loss 1.4122 | Win count 135.0


Epoch 111/999 | Loss 0.9272 | Win count 152.0


Epoch 112/999 | Loss 7.0249 | Win count 117.0


Epoch 113/999 | Loss 17.1359 | Win count 155.0


Epoch 114/999 | Loss 10.3202 | Win count 145.0


Epoch 115/999 | Loss 12.6665 | Win count 125.0


Epoch 116/999 | Loss 6.5823 | Win count 116.0


Epoch 117/999 | Loss 3.8787 | Win count 113.0


Epoch 118/999 | Loss 7.0715 | Win count 138.0


Epoch 119/999 | Loss 6.9089 | Win count 148.0


Epoch 120/999 | Loss 3.2700 | Win count 151.0


Epoch 121/999 | Loss 6.8715 | Win count 135.0


Epoch 122/999 | Loss 2.6089 | Win count 133.0


Epoch 123/999 | Loss 2.6398 | Win count 25.0


Epoch 124/999 | Loss 21.4274 | Win count 35.0


Epoch 125/999 | Loss 16.7057 | Win count 28.0


Epoch 126/999 | Loss 64.0554 | Win count 200.0


Epoch 127/999 | Loss 7.8886 | Win count 141.0


Epoch 128/999 | Loss 15.0316 | Win count 149.0


Epoch 129/999 | Loss 8.1909 | Win count 140.0


Epoch 130/999 | Loss 10.9653 | Win count 100.0


Epoch 131/999 | Loss 28.2814 | Win count 155.0


Epoch 132/999 | Loss 36.7819 | Win count 200.0


SystemError: <built-in function Trainer_train_minibatch> returned a result with an error set