# Module Five Assignment: Cartpole Problem
Review the code in this notebook and in the score_logger.py file in the *scores* folder (directory). Once you have reviewed the code, return to this notebook and select **Cell** and then **Run All** from the menu bar to run this code. The code takes several minutes to run.

In [8]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  
  
# Discount factor: multiplies the best predicted next-state Q-value when the
# training target is built in DQNSolver.experience_replay().
GAMMA = 0.95  
# Step size handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001  
  
# Capacity (deque maxlen) of the replay memory.
MEMORY_SIZE = 1000000  
# Number of transitions sampled per experience_replay() call; replay is
# skipped while fewer than this many transitions are stored.
BATCH_SIZE = 20  
  
# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY at the end of every experience_replay() call
# that trains, and is never allowed to drop below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0  
EXPLORATION_MIN = 0.01  
EXPLORATION_DECAY = 0.995  
  
  
class DQNSolver:
    """Deep Q-learning agent: epsilon-greedy policy plus a replay memory.

    Q-values are approximated by a small fully connected Keras network (two
    hidden ReLU layers of 24 units and a linear output layer with one unit
    per action) compiled with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, observation_space, action_space):
        """Set up the exploration rate, the replay memory and the Q-network.

        observation_space: number of features in one state vector.
        action_space: number of discrete actions (width of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        network = Sequential()
        network.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        network.add(Dense(24, activation="relu"))
        network.add(Dense(action_space, activation="linear"))
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        exploring = np.random.rand() < self.exploration_rate
        if not exploring:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random mini-batch, then decay exploration.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                # No future return after a terminal transition.
                target = reward
            else:
                best_next = np.amax(self.model.predict(state_next)[0])
                target = (reward + GAMMA * best_next)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        decayed = self.exploration_rate * EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME Gym environment, episode after episode.

    Every step the agent picks an action, the transition is stored in the
    replay memory, and (on non-terminal steps) one experience-replay update
    is run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    The episode loop has no exit condition of its own: the function only
    stops when an exception propagates out of it (for example a
    KeyboardInterrupt, or anything raised from ScoreLogger.add_score).
    The environment is closed in a ``finally`` block so it is released on
    every such exit.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a batch dimension, hence shape (1, n_features).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, _ = env.step(action)
                # The step that ends the episode gets the negated reward.
                # NOTE(review): this also applies if the episode ended because
                # of an environment step limit rather than a failure - confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop was left.
        env.close()



In [9]:
cartpole()

Run: 1, exploration: 0.985074875, score: 23
Scores: (min: 23, avg: 23, max: 23)

Run: 2, exploration: 0.8911090557802088, score: 21
Scores: (min: 21, avg: 22, max: 23)

Run: 3, exploration: 0.8390886103705794, score: 13
Scores: (min: 13, avg: 19, max: 23)

Run: 4, exploration: 0.7628626641409962, score: 20
Scores: (min: 13, avg: 19.25, max: 23)

Run: 5, exploration: 0.7255664080186093, score: 11
Scores: (min: 11, avg: 17.6, max: 23)

Run: 6, exploration: 0.6866430931872001, score: 12
Scores: (min: 11, avg: 16.666666666666668, max: 23)

Run: 7, exploration: 0.653073201944699, score: 11
Scores: (min: 11, avg: 15.857142857142858, max: 23)

Run: 8, exploration: 0.6242658676435396, score: 10
Scores: (min: 10, avg: 15.125, max: 23)

Run: 9, exploration: 0.5997278763867329, score: 9
Scores: (min: 9, avg: 14.444444444444445, max: 23)

Run: 10, exploration: 0.5732736268885887, score: 10
Scores: (min: 9, avg: 14, max: 23)

Run: 11, exploration: 0.5452463540625918, score: 11
Scores: (min: 9, avg:

NameError: name 'exit' is not defined

In [13]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  
  
# Discount factor: multiplies the best predicted next-state Q-value when the
# training target is built in DQNSolver.experience_replay().
GAMMA = 0.95  
# Step size handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.01  # Experiment: learning rate raised to 0.01
  
# Capacity (deque maxlen) of the replay memory.
MEMORY_SIZE = 1000000  
# Number of transitions sampled per experience_replay() call; replay is
# skipped while fewer than this many transitions are stored.
BATCH_SIZE = 20  
  
# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY at the end of every experience_replay() call
# that trains, and is never allowed to drop below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0  
EXPLORATION_MIN = 0.01  
EXPLORATION_DECAY = 0.995  
  
  
class DQNSolver:
    """Deep Q-learning agent: epsilon-greedy policy plus a replay memory.

    Q-values are approximated by a small fully connected Keras network (two
    hidden ReLU layers of 24 units and a linear output layer with one unit
    per action) compiled with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, observation_space, action_space):
        """Set up the exploration rate, the replay memory and the Q-network.

        observation_space: number of features in one state vector.
        action_space: number of discrete actions (width of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        network = Sequential()
        network.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        network.add(Dense(24, activation="relu"))
        network.add(Dense(action_space, activation="linear"))
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        exploring = np.random.rand() < self.exploration_rate
        if not exploring:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random mini-batch, then decay exploration.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                # No future return after a terminal transition.
                target = reward
            else:
                best_next = np.amax(self.model.predict(state_next)[0])
                target = (reward + GAMMA * best_next)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        decayed = self.exploration_rate * EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME Gym environment, episode after episode.

    Every step the agent picks an action, the transition is stored in the
    replay memory, and (on non-terminal steps) one experience-replay update
    is run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    The episode loop has no exit condition of its own: the function only
    stops when an exception propagates out of it (for example a
    KeyboardInterrupt, or anything raised from ScoreLogger.add_score).
    The environment is closed in a ``finally`` block so it is released on
    every such exit.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a batch dimension, hence shape (1, n_features).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, _ = env.step(action)
                # The step that ends the episode gets the negated reward.
                # NOTE(review): this also applies if the episode ended because
                # of an environment step limit rather than a failure - confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop was left.
        env.close()



In [14]:
cartpole()

Run: 1, exploration: 1.0, score: 18
Scores: (min: 18, avg: 18, max: 18)

Run: 2, exploration: 0.9091562615825302, score: 21
Scores: (min: 18, avg: 19.5, max: 21)

Run: 3, exploration: 0.8307187014821328, score: 19
Scores: (min: 18, avg: 19.333333333333332, max: 21)

Run: 4, exploration: 0.7292124703704616, score: 27
Scores: (min: 18, avg: 21.25, max: 27)

Run: 5, exploration: 0.6935613678313175, score: 11
Scores: (min: 11, avg: 19.2, max: 27)

Run: 6, exploration: 0.653073201944699, score: 13
Scores: (min: 11, avg: 18.166666666666668, max: 27)

Run: 7, exploration: 0.5878229785513479, score: 22
Scores: (min: 11, avg: 18.714285714285715, max: 27)

Run: 8, exploration: 0.5507399854171277, score: 14
Scores: (min: 11, avg: 18.125, max: 27)

Run: 9, exploration: 0.5185893309484582, score: 13
Scores: (min: 11, avg: 17.555555555555557, max: 27)

Run: 10, exploration: 0.4883155414435353, score: 13
Scores: (min: 11, avg: 17.1, max: 27)

Run: 11, exploration: 0.46444185833082485, score: 11
Score

KeyboardInterrupt: 

In [16]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  
  
# Discount factor: multiplies the best predicted next-state Q-value when the
# training target is built in DQNSolver.experience_replay().
GAMMA = 0.95  
# Step size handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.0001  # Experiment: learning rate lowered to 0.0001
  
# Capacity (deque maxlen) of the replay memory.
MEMORY_SIZE = 1000000  
# Number of transitions sampled per experience_replay() call; replay is
# skipped while fewer than this many transitions are stored.
BATCH_SIZE = 20  
  
# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY at the end of every experience_replay() call
# that trains, and is never allowed to drop below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0  
EXPLORATION_MIN = 0.01  
EXPLORATION_DECAY = 0.995  
  
  
class DQNSolver:
    """Deep Q-learning agent: epsilon-greedy policy plus a replay memory.

    Q-values are approximated by a small fully connected Keras network (two
    hidden ReLU layers of 24 units and a linear output layer with one unit
    per action) compiled with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, observation_space, action_space):
        """Set up the exploration rate, the replay memory and the Q-network.

        observation_space: number of features in one state vector.
        action_space: number of discrete actions (width of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        network = Sequential()
        network.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        network.add(Dense(24, activation="relu"))
        network.add(Dense(action_space, activation="linear"))
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        exploring = np.random.rand() < self.exploration_rate
        if not exploring:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random mini-batch, then decay exploration.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                # No future return after a terminal transition.
                target = reward
            else:
                best_next = np.amax(self.model.predict(state_next)[0])
                target = (reward + GAMMA * best_next)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        decayed = self.exploration_rate * EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME Gym environment, episode after episode.

    Every step the agent picks an action, the transition is stored in the
    replay memory, and (on non-terminal steps) one experience-replay update
    is run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    The episode loop has no exit condition of its own: the function only
    stops when an exception propagates out of it (for example a
    KeyboardInterrupt, or anything raised from ScoreLogger.add_score).
    The environment is closed in a ``finally`` block so it is released on
    every such exit.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a batch dimension, hence shape (1, n_features).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, _ = env.step(action)
                # The step that ends the episode gets the negated reward.
                # NOTE(review): this also applies if the episode ended because
                # of an environment step limit rather than a failure - confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop was left.
        env.close()



In [17]:
cartpole()

Run: 1, exploration: 1.0, score: 11
Scores: (min: 11, avg: 11, max: 11)

Run: 2, exploration: 0.9558895783575597, score: 18
Scores: (min: 11, avg: 14.5, max: 18)

Run: 3, exploration: 0.9000874278732445, score: 13
Scores: (min: 11, avg: 14, max: 18)

Run: 4, exploration: 0.7111635524897149, score: 48
Scores: (min: 11, avg: 22.5, max: 48)

Run: 5, exploration: 0.6337242817644086, score: 24
Scores: (min: 11, avg: 22.8, max: 48)

Run: 6, exploration: 0.5790496471185967, score: 19
Scores: (min: 11, avg: 22.166666666666668, max: 48)

Run: 7, exploration: 0.5507399854171277, score: 11
Scores: (min: 11, avg: 20.571428571428573, max: 48)

Run: 8, exploration: 0.46677573701590436, score: 34
Scores: (min: 11, avg: 22.25, max: 48)

Run: 9, exploration: 0.39166620452737816, score: 36
Scores: (min: 11, avg: 23.77777777777778, max: 48)

Run: 10, exploration: 0.37251769488706843, score: 11
Scores: (min: 11, avg: 22.5, max: 48)

Run: 11, exploration: 0.3543053533848483, score: 11
Scores: (min: 11, avg



Run: 453, exploration: 0.01, score: 154
Scores: (min: 125, avg: 165.95, max: 258)

Run: 454, exploration: 0.01, score: 134
Scores: (min: 125, avg: 164.71, max: 255)

Run: 455, exploration: 0.01, score: 134
Scores: (min: 125, avg: 164.14, max: 255)

Run: 456, exploration: 0.01, score: 143
Scores: (min: 125, avg: 163.9, max: 255)

Run: 457, exploration: 0.01, score: 139
Scores: (min: 125, avg: 163.16, max: 255)

Run: 458, exploration: 0.01, score: 158
Scores: (min: 125, avg: 162.23, max: 255)

Run: 459, exploration: 0.01, score: 213
Scores: (min: 125, avg: 162.71, max: 255)

Run: 460, exploration: 0.01, score: 170
Scores: (min: 125, avg: 162.21, max: 255)

Run: 461, exploration: 0.01, score: 186
Scores: (min: 125, avg: 161.86, max: 255)

Run: 462, exploration: 0.01, score: 167
Scores: (min: 125, avg: 160.98, max: 220)

Run: 463, exploration: 0.01, score: 172
Scores: (min: 125, avg: 160.73, max: 220)

Run: 464, exploration: 0.01, score: 212
Scores: (min: 125, avg: 160.94, max: 220)

Run: 

KeyboardInterrupt: 

In [19]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  
  
# Discount factor: multiplies the best predicted next-state Q-value when the
# training target is built in DQNSolver.experience_replay().
GAMMA = 0.95  
# Step size handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001
  
# Capacity (deque maxlen) of the replay memory.
MEMORY_SIZE = 1000000  
# Number of transitions sampled per experience_replay() call; replay is
# skipped while fewer than this many transitions are stored.
BATCH_SIZE = 20  
  
# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY at the end of every experience_replay() call
# that trains, and is never allowed to drop below EXPLORATION_MIN.
EXPLORATION_MAX = 1.0  
EXPLORATION_MIN = 0.01  
# Experiment: decay changed to 0.5. Halving per replay call drops the rate
# from 1.0 to the 0.01 floor after 7 calls (0.5**7 < 0.01), so the agent
# stops exploring almost immediately.
EXPLORATION_DECAY = 0.5  #Changed decay rate
  
  
class DQNSolver:
    """Deep Q-learning agent: epsilon-greedy policy plus a replay memory.

    Q-values are approximated by a small fully connected Keras network (two
    hidden ReLU layers of 24 units and a linear output layer with one unit
    per action) compiled with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, observation_space, action_space):
        """Set up the exploration rate, the replay memory and the Q-network.

        observation_space: number of features in one state vector.
        action_space: number of discrete actions (width of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        network = Sequential()
        network.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        network.add(Dense(24, activation="relu"))
        network.add(Dense(action_space, activation="linear"))
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        exploring = np.random.rand() < self.exploration_rate
        if not exploring:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random mini-batch, then decay exploration.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                # No future return after a terminal transition.
                target = reward
            else:
                best_next = np.amax(self.model.predict(state_next)[0])
                target = (reward + GAMMA * best_next)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        decayed = self.exploration_rate * EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME Gym environment, episode after episode.

    Every step the agent picks an action, the transition is stored in the
    replay memory, and (on non-terminal steps) one experience-replay update
    is run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    The episode loop has no exit condition of its own: the function only
    stops when an exception propagates out of it (for example a
    KeyboardInterrupt, or anything raised from ScoreLogger.add_score).
    The environment is closed in a ``finally`` block so it is released on
    every such exit.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a batch dimension, hence shape (1, n_features).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, _ = env.step(action)
                # The step that ends the episode gets the negated reward.
                # NOTE(review): this also applies if the episode ended because
                # of an environment step limit rather than a failure - confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop was left.
        env.close()



In [20]:
cartpole()

Run: 1, exploration: 0.01, score: 33
Scores: (min: 33, avg: 33, max: 33)

Run: 2, exploration: 0.01, score: 10
Scores: (min: 10, avg: 21.5, max: 33)

Run: 3, exploration: 0.01, score: 9
Scores: (min: 9, avg: 17.333333333333332, max: 33)

Run: 4, exploration: 0.01, score: 9
Scores: (min: 9, avg: 15.25, max: 33)

Run: 5, exploration: 0.01, score: 10
Scores: (min: 9, avg: 14.2, max: 33)

Run: 6, exploration: 0.01, score: 11
Scores: (min: 9, avg: 13.666666666666666, max: 33)

Run: 7, exploration: 0.01, score: 9
Scores: (min: 9, avg: 13, max: 33)

Run: 8, exploration: 0.01, score: 9
Scores: (min: 9, avg: 12.5, max: 33)

Run: 9, exploration: 0.01, score: 9
Scores: (min: 9, avg: 12.11111111111111, max: 33)

Run: 10, exploration: 0.01, score: 10
Scores: (min: 9, avg: 11.9, max: 33)

Run: 11, exploration: 0.01, score: 10
Scores: (min: 9, avg: 11.727272727272727, max: 33)

Run: 12, exploration: 0.01, score: 11
Scores: (min: 9, avg: 11.666666666666666, max: 33)

Run: 13, exploration: 0.01, score:

NameError: name 'exit' is not defined

In [21]:
import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  
  
# Discount factor: multiplies the best predicted next-state Q-value when the
# training target is built in DQNSolver.experience_replay().
GAMMA = 0.95  
# Step size handed to the Adam optimizer when the Q-network is compiled.
LEARNING_RATE = 0.001
  
# Capacity (deque maxlen) of the replay memory.
MEMORY_SIZE = 1000000  
# Number of transitions sampled per experience_replay() call; replay is
# skipped while fewer than this many transitions are stored.
BATCH_SIZE = 20  
  
# Epsilon-greedy schedule: the exploration rate starts at EXPLORATION_MAX, is
# multiplied by EXPLORATION_DECAY at the end of every experience_replay() call
# that trains, and is never allowed to drop below EXPLORATION_MIN.
# Experiment: starting value raised to 2.0. DQNSolver.act() compares the rate
# with np.random.rand(), which is always below 1, so any rate >= 1 means
# "always act randomly". Starting at 2.0 therefore only lengthens the purely
# random phase (roughly the first 139 replay calls at a decay of 0.995).
EXPLORATION_MAX = 2.0  #Increased the max exploration
EXPLORATION_MIN = 0.01  
EXPLORATION_DECAY = 0.995
  
  
class DQNSolver:
    """Deep Q-learning agent: epsilon-greedy policy plus a replay memory.

    Q-values are approximated by a small fully connected Keras network (two
    hidden ReLU layers of 24 units and a linear output layer with one unit
    per action) compiled with mean-squared-error loss and the Adam optimizer.
    """

    def __init__(self, observation_space, action_space):
        """Set up the exploration rate, the replay memory and the Q-network.

        observation_space: number of features in one state vector.
        action_space: number of discrete actions (width of the output layer).
        """
        self.exploration_rate = EXPLORATION_MAX
        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        network = Sequential()
        network.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        network.add(Dense(24, activation="relu"))
        network.add(Dense(action_space, activation="linear"))
        network.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def remember(self, state, action, reward, next_state, done):
        """Store one (state, action, reward, next_state, done) transition."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def act(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``exploration_rate`` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        exploring = np.random.rand() < self.exploration_rate
        if not exploring:
            predicted = self.model.predict(state)
            return np.argmax(predicted[0])
        return random.randrange(self.action_space)

    def experience_replay(self):
        """Fit the network on a random mini-batch, then decay exploration.

        Does nothing until at least BATCH_SIZE transitions are stored. Each
        sampled transition is fitted on its own, one after another.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        for state, action, reward, state_next, terminal in random.sample(self.memory, BATCH_SIZE):
            if terminal:
                # No future return after a terminal transition.
                target = reward
            else:
                best_next = np.amax(self.model.predict(state_next)[0])
                target = (reward + GAMMA * best_next)
            # Only the taken action's Q-value is moved towards the target;
            # the other outputs keep their current predictions.
            q_values = self.model.predict(state)
            q_values[0][action] = target
            self.model.fit(state, q_values, verbose=0)
        decayed = self.exploration_rate * EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, decayed)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME Gym environment, episode after episode.

    Every step the agent picks an action, the transition is stored in the
    replay memory, and (on non-terminal steps) one experience-replay update
    is run. When an episode ends, its length is printed and handed to the
    ScoreLogger.

    The episode loop has no exit condition of its own: the function only
    stops when an exception propagates out of it (for example a
    KeyboardInterrupt, or anything raised from ScoreLogger.add_score).
    The environment is closed in a ``finally`` block so it is released on
    every such exit.
    """
    env = gym.make(ENV_NAME)
    try:
        score_logger = ScoreLogger(ENV_NAME)
        observation_space = env.observation_space.shape[0]
        action_space = env.action_space.n
        dqn_solver = DQNSolver(observation_space, action_space)
        run = 0
        while True:
            run += 1
            # The network expects a batch dimension, hence shape (1, n_features).
            state = np.reshape(env.reset(), [1, observation_space])
            step = 0
            while True:
                step += 1
                # env.render()  # uncomment to watch the episode
                action = dqn_solver.act(state)
                state_next, reward, terminal, _ = env.step(action)
                # The step that ends the episode gets the negated reward.
                # NOTE(review): this also applies if the episode ended because
                # of an environment step limit rather than a failure - confirm
                # that is intended.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print("Run: {}, exploration: {}, score: {}".format(
                        run, dqn_solver.exploration_rate, step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop was left.
        env.close()



In [22]:
cartpole()

Run: 1, exploration: 2.0, score: 14
Scores: (min: 14, avg: 14, max: 14)

Run: 2, exploration: 1.8738293857596078, score: 19
Scores: (min: 14, avg: 16.5, max: 19)

Run: 3, exploration: 1.6122131818527914, score: 31
Scores: (min: 14, avg: 21.333333333333332, max: 31)

Run: 4, exploration: 1.39409320167099, score: 30
Scores: (min: 14, avg: 23.5, max: 31)

Run: 5, exploration: 1.306146403889398, score: 14
Scores: (min: 14, avg: 21.6, max: 31)

Run: 6, exploration: 1.1994557527734657, score: 18
Scores: (min: 14, avg: 21, max: 31)

Run: 7, exploration: 1.095972570980084, score: 19
Scores: (min: 14, avg: 20.714285714285715, max: 31)

Run: 8, exploration: 1.0423906149717752, score: 11
Scores: (min: 11, avg: 19.5, max: 31)

Run: 9, exploration: 0.9717479274726352, score: 15
Scores: (min: 11, avg: 19, max: 31)

Run: 10, exploration: 0.8277366916839737, score: 33
Scores: (min: 11, avg: 20.4, max: 33)

Run: 11, exploration: 0.771641074933063, score: 15
Scores: (min: 11, avg: 19.90909090909091, max

NameError: name 'exit' is not defined

In this RL problem, the agent is attempting to balance the inverted pendulum and is represented by the solver (DQNSolver). The possible state values describe the cart's position, the pole angle, the cart's velocity, and the velocity of the pole's tip. What the cart can do to change the position of the pole is described by the actions (move left, move right).
DQN (Deep Q-Learning) is used as the reinforcement learning algorithm and is implemented by the DQNSolver. The algorithm uses experience replay: the agent remembers the states it has experienced, and those experiences are then sampled. The algorithm uses those samples to "reduce correlation between subsequent actions".
The algorithm uses a discount factor that is not myopic, 0.95. The importance of future rewards is determined by the discount factor. A factor of 0 will cause the agent to be short-sighted and only consider current rewards. A factor of 1 will make it strive for higher long-term rewards.
In long-running training sessions the Q-table has the potential to become too lengthy to fit into memory, or simply to be unsearchable due to its size. A neural network may be substituted for this table to approximate the response, with Q-values computed using the Bellman equation.
Using the default value of the learning rate, alpha (0.001), the algorithm solved the problem in 36 runs, 136 total runs. At 0.01, the run was terminated after 1399 runs, while at 0.0001 it was terminated after 480 runs. The learning rate is the "step size". If the step size is too large, the simulation can hang up and never converge to the minimum. That was the case with the 0.01 learning rate. Having a larger step size can be useful in the beginning; however, it does tend to become detrimental in the later stages. An option is to adjust the decay rate; this would allow for a higher learning rate early on and a lower one as the simulation progresses.