###### Imports

In [8]:
import sys
import timeit
import theano
import theano.tensor as T
import matplotlib.pyplot as plt
import gym
import keras
import numpy as np
import scipy
import math
import json
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import sgd
from keras.models import model_from_json
import time
import operator

###### Definition of the Neural Network with Theano

In [9]:
def relu(X):
    X[np.where(X < 0)] = 0
    return(X)

class output_layer(object):

    def __init__(self, input, n_in, n_out):
        
 
        self.W = theano.shared(
            value=np.zeros(
                (n_in, n_out),
                dtype=theano.config.floatX
            ),
            name='W',
            borrow=True
        )

        self.b = theano.shared(
            value=np.zeros(
                (n_out,),
                dtype=theano.config.floatX
            ),
            name='b',
            borrow=True
        )

        self.y_pred = T.dot(input, self.W) + self.b

        self.params = [self.W, self.b]

        self.input = input

    def mse(self, y):
        return T.mean((self.y_pred - y) ** 2) 
    
class HiddenLayer(object):
    def __init__(self, input, n_in, n_out, W=None, b=None):

        self.input = input
        if W is None:
            W_values = np.asarray(
                np.random.uniform(low=-0.1,high=0.1,size=(n_in,  n_out)),
                dtype=theano.config.floatX)
            W_h = theano.shared(value = W_values, name='W_h', borrow=True)
            
        if b is None:
            b_values = np.zeros((n_out,), dtype=theano.config.floatX)
            b_h = theano.shared(value=b_values, name='b_h', borrow=True)

        self.W_h = W_h
        self.b_h = b_h
        
        self.params = [self.W_h, self.b_h] # Parameters of the model
        
        self.output = T.nnet.relu(T.dot(input, self.W_h) + self.b_h)

        self.input = input
    
class Neural_Network(object):
    
    def __init__(self, input, n_in, n_hidden, n_out):

        self.hiddenLayer = HiddenLayer(
            input=input,
            n_in= n_in,
            n_out=n_hidden,
        )

        self.output_layer = output_layer(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )


        self.mse = (
            self.output_layer.mse
        )

        self.params = self.hiddenLayer.params + self.output_layer.params
        
        self.input = input
        
def train_on_batch(dataset_X, dataset_y, classifier, x,y,index,learning_rate=0.01, batch_size=1, n_hidden=500):

    train_set_x = theano.shared(np.asarray(dataset_X,
                                            dtype=theano.config.floatX),
                                borrow=True)
    train_set_y = theano.shared(np.asarray(dataset_y,
                                            dtype=theano.config.floatX),
                                borrow=True)

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size



    cost = (
        classifier.mse(y)
    )

    gparams = [T.grad(cost, param) for param in classifier.params]
    
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]


    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
  
    start_time = timeit.default_timer()

    error = []
    error_min = np.inf
    for minibatch_index in range(n_train_batches):
        minibatch_avg_cost = train_model(minibatch_index)
        linear_1 = np.dot(dataset_X, classifier.hiddenLayer.W_h.eval()) +  classifier.hiddenLayer.b_h.eval()
        h1 = relu(linear_1)
        output_nn = np.dot(h1,classifier.output_layer.W.eval()) + classifier.output_layer.b.eval()
        error.append(np.mean((output_nn - dataset_y)**2))
        
        '''if error[len(error)-1] < error_min:
            error_min = error[len(error)-1]
            best_W1 = classifier.hiddenLayer.W_h.eval()
            best_b1 = classifier.hiddenLayer.b_h.eval()
            best_Wo = classifier.output_layer.W.eval()
            best_bo = classifier.output_layer.b.eval()'''

    end_time = timeit.default_timer()
    
    return(error, classifier.hiddenLayer.W_h.eval(), classifier.hiddenLayer.b_h.eval(), classifier.output_layer.W.eval(), classifier.output_layer.b.eval())

def predict_NN(X_input, W1, b1, Wo, bo):
    return(np.dot(relu(np.dot(X_input,W1) + b1),Wo) + bo)


###### Experience Replay

In [10]:
class ExperienceReplay(object):
    
    
    def __init__(self, max_memory=100, discount=.9):
        
        """Define max length of memory and gamma"""
         
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remember(self, states, game_over):

        """Add experience to memory"""
        
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, W1, b1, Wo, bo, batch_size=10):
        
        """Get the batch input and targets we will train on"""
        
        len_memory = len(self.memory)
        num_actions = 2
        env_dim = self.memory[0][0][0].shape[1]
        inputs = np.zeros((min(len_memory, batch_size), env_dim))
        
        targets = np.zeros((inputs.shape[0], num_actions))
        
        for i, idx in enumerate(np.random.randint(0, len_memory,
                                                  size=inputs.shape[0])):
            
            state_t, action_t, reward_t, state_tp1 = self.memory[idx][0]
            game_over = self.memory[idx][1]
            inputs[i:i+1] = state_t
            targets[i] = predict_NN(state_t, W1, b1, Wo, bo)
            Q_sa = np.max(targets[i])
            if game_over:
                targets[i, action_t] = reward_t
            else:
                targets[i, action_t] = reward_t + self.discount * Q_sa
        return inputs, targets

###### CartPole on OpenAI Gym and Parameters

In [11]:
env = gym.make('CartPole-v0')

learning_rate=0.1
epsilon = .1
num_actions = env.action_space.n 
max_memory = 4000000000
hidden_size = 200
batch_size = 50
acc_reward=0
observation_shape = env.observation_space.shape[0]
time_step=0
max_time_steps=2000
everyC=50
exp_replay = ExperienceReplay(max_memory=max_memory)
win_cnt = 0
t0 = time.time()
actual_total=0
e=0

print('Parameters :','epsilon :', epsilon,'C :', everyC,', learning rate :', learning_rate, 'batch size for training :', batch_size)

print('Training for ',max_time_steps,'time-steps ...')
N = 6
X = np.random.uniform(low=-5.,high=5.,size=(N, 4)).astype('float32')
W = np.random.uniform(low=-5.,high=5.,size=(4, 2)).astype('float32')
b = np.random.uniform(low=-5.,high=5.,size=2).astype('float32') 

noise = np.random.normal(0,1,(N,2))

y = np.dot(X**2,W) + 5*np.dot(X,W) +  b + noise
y=y.astype('float32')


# allocate symbolic variables for the data
index = T.lscalar()  
x = T.matrix('x') 
y_g = T.matrix('y')  
# construct the neural net
classifier = Neural_Network(input=x,n_in=4,n_hidden=200,n_out=2)

#initalize with a random backpropagation
err, W1, b1, Wo, bo = train_on_batch(X, y, classifier, x=x,y=y_g,index=index, learning_rate=0.0003, batch_size=N)

Parameters : epsilon : 0.1 C : 50 , learning rate : 0.1 batch size for training : 50
Training for  2000 time-steps ...


##### Training the algorithm

In [12]:
while time_step<max_time_steps:
    loss = 0.
    acc_reward = 0
    C=0
    e+=1
    input_t = env.reset()
    input_t = input_t.reshape((1,observation_shape))
    game_over = False
    
    while not game_over:

        input_tm1 = input_t.astype('float32')
        
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, num_actions, size=1)[0]
        else:
            q = predict_NN(input_tm1, W1, b1, Wo, bo)
            action = np.argmax(q)

        input_t, reward, game_over, infodemerde = env.step(action)
        input_t = input_t.reshape((1,observation_shape))
        
        acc_reward += reward

        exp_replay.remember([input_tm1, action, reward, input_t], game_over)

        inputs, targets = exp_replay.get_batch(W1, b1, Wo, bo, batch_size=batch_size)
        inputs=inputs.astype('float32')
        targets=targets.astype('float32')

        t2 = time.time()  #start of actual training time

        #TRAIN
        err, W1, b1, Wo, bo = train_on_batch(inputs, targets, classifier, x=x, y=y_g, index=index, learning_rate=learning_rate, batch_size=len(inputs))

        t3 = time.time() #end of actual training time
        actual_total += t3-t2

        time_step += 1

        if acc_reward>=200:
            game_over=True
            win_cnt+=1

    print(time_step,'time steps done, ',e,'episodes done. Reward :', acc_reward, ', loss :', err[0])

t1 = time.time() #end of training time
total = t1-t0
print('Total training time :', total,'Actual training time :', actual_total)
print('Win ratio (nb of games won/nb of games played) :', win_cnt/e)

10 time steps done,  1 episodes done. Reward : 10.0 , loss : 0.4012257924446841
18 time steps done,  2 episodes done. Reward : 8.0 , loss : 0.5729455781534732
28 time steps done,  3 episodes done. Reward : 10.0 , loss : 0.5932333386763372
38 time steps done,  4 episodes done. Reward : 10.0 , loss : 0.9425078980307213
49 time steps done,  5 episodes done. Reward : 11.0 , loss : 1.1837238921061104
58 time steps done,  6 episodes done. Reward : 9.0 , loss : 1.0481504387440348
66 time steps done,  7 episodes done. Reward : 8.0 , loss : 0.659323204597631
76 time steps done,  8 episodes done. Reward : 10.0 , loss : 0.7448139303352894
84 time steps done,  9 episodes done. Reward : 8.0 , loss : 0.5527741857669465
93 time steps done,  10 episodes done. Reward : 9.0 , loss : 0.6443421888676423
104 time steps done,  11 episodes done. Reward : 11.0 , loss : 1.0688422354038702
113 time steps done,  12 episodes done. Reward : 9.0 , loss : 0.6668446120723764
123 time steps done,  13 episodes done. Re

1343 time steps done,  101 episodes done. Reward : 9.0 , loss : 0.15978445998465096
1354 time steps done,  102 episodes done. Reward : 11.0 , loss : 0.15233961273833696
1382 time steps done,  103 episodes done. Reward : 28.0 , loss : 0.11731342088278089
1397 time steps done,  104 episodes done. Reward : 15.0 , loss : 0.21231766407781946
1422 time steps done,  105 episodes done. Reward : 25.0 , loss : 0.43454560671561
1440 time steps done,  106 episodes done. Reward : 18.0 , loss : 0.304129751422211
1451 time steps done,  107 episodes done. Reward : 11.0 , loss : 1.354383074393273
1474 time steps done,  108 episodes done. Reward : 23.0 , loss : 3.485006194801568
1491 time steps done,  109 episodes done. Reward : 17.0 , loss : 0.11202781870221105
1522 time steps done,  110 episodes done. Reward : 31.0 , loss : 0.22982442397148972
1534 time steps done,  111 episodes done. Reward : 12.0 , loss : 0.7025360164272869
1561 time steps done,  112 episodes done. Reward : 27.0 , loss : 0.331797815

##### Testing the algorithm

In [13]:
#nb of episodes to test
nb_e_test=10

#Total reward over the episodes
total_rew=0

print('Testing for ',nb_e_test,'episodes ...')

for episode in range(nb_e_test):
    acc_reward = 0
    input_t = env.reset()
    input_t = input_t.reshape((1,observation_shape))
    game_over = False

    while not game_over:
        input_tm1 = input_t.astype('float32')
        
        q = predict_NN(input_tm1, W1, b1, Wo, bo)
        action = np.argmax(q[0])

        input_t, reward, game_over, infodemerde = env.step(action)
        input_t = input_t.reshape((1,observation_shape))
        
        acc_reward += reward

        if acc_reward>=200:
            game_over=True
            
    total_rew+=acc_reward

    print(episode,'episodes done. Reward :', acc_reward)


print('The average reward over the test was :',total_rew/nb_e_test)

Testing for  10 episodes ...
0 episodes done. Reward : 17.0
1 episodes done. Reward : 15.0
2 episodes done. Reward : 12.0
3 episodes done. Reward : 19.0
4 episodes done. Reward : 17.0
5 episodes done. Reward : 15.0
6 episodes done. Reward : 15.0
7 episodes done. Reward : 13.0
8 episodes done. Reward : 15.0
9 episodes done. Reward : 13.0
The average reward over the test was : 15.1
