# 9. On-policy Prediction with Approximation
## CartPole-v0

### Overview:
[CartPole-v0](https://github.com/openai/gym/wiki/CartPole-v0)
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.

### State (Observation)
| num | observation          | min      | max     |
|----:|:---------------------|---------:|--------:|
| 0   | Cart Position        | -2.4     | 2.4     |
| 1   | Cart Velocity        | -Inf     | Inf     |
| 2   | Pole Angle           | ~ -41.8° | ~ 41.8° |
| 3   | Pole Velocity at Tip | -Inf     | Inf     |

### Action
| num | action          | 
|----:|:----------------|
| 0   | Push cart left  |
| 1   | Push cart right |

### Reward
A reward of +1 is given for every step taken, including the termination step.

### Termination
* Pole angle is more than ±12° from vertical
* Cart position is more than ±2.4
* Episode length reaches 200 steps

### Solved
Average reward ≥195.0 over 100 episodes

### Code
Based on ruippeixotog's solution published on gym.openai.com: https://gym.openai.com/evaluations/eval_aCiCDmwhTCytFuxMpKoyvQ/

In [None]:
import os
import abc
import random
from collections import deque

import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [None]:
class GymRunner:
    """Runs an agent against a Gym environment for training or evaluation.

    The agent passed to the methods below must provide
    ``select_action(state, do_train)``,
    ``record(state, action, reward, next_state, done)`` and ``replay()``.
    """

    def __init__(self, env_id, max_timesteps=200):
        """Create the environment.

        Args:
            env_id: Environment id handed to ``gym.make``.
            max_timesteps: Upper bound on the number of steps per episode.
        """
        self.max_timesteps = max_timesteps
        self.env = gym.make(env_id)

    def train(self, agent, num_episodes):
        """Run ``num_episodes`` training episodes with ``agent``."""
        self.run(agent, num_episodes, do_train=True)

    def train_until_solved(self, agent):
        """Alternate training and staged evaluation until the agent passes.

        Each round trains for 100 episodes and then evaluates the agent
        (without training) in five stages of 10, 15, 25, 25 and 25 episodes,
        i.e. 100 evaluation episodes in total. After each stage the mean
        reward over all evaluation episodes of the current round must reach
        that stage's threshold (150, 180, 190, 193, 195); otherwise the
        evaluation is aborted early and another training round starts.

        Args:
            agent: The agent to train.

        Returns:
            0 once every evaluation stage has been passed.
        """
        training_eps = 100
        # (number of evaluation episodes, required cumulative mean reward)
        test_stages = [(10, 150), (15, 180), (25, 190), (25, 193), (25, 195)]
        trained_episodes = 0

        while True:
            train_rew = np.mean(self.run(agent, training_eps, do_train=True))
            trained_episodes += training_eps

            # Cheap early stages weed out weak agents before the full
            # 100-episode evaluation is paid for.
            test_rewards = []
            for num_test_eps, required in test_stages:
                test_rewards.extend(self.run(agent, num_test_eps))
                if np.mean(test_rewards) < required:
                    break
            else:
                # No stage failed: the mean over all evaluation episodes
                # reached the final threshold.
                print(f"Model fully learned after {trained_episodes} Episodes")
                return 0

            print(f"Episode {trained_episodes}\t Average Reward {train_rew}")

    def run(self, agent, num_episodes, do_train=False):
        """Run ``num_episodes`` episodes and collect their total rewards.

        Args:
            agent: The agent selecting the actions.
            num_episodes: Number of episodes to run.
            do_train: When true, every transition is passed to
                ``agent.record`` and ``agent.replay`` is called once after
                each episode. The flag is also forwarded to
                ``agent.select_action``.

        Returns:
            A list with the total reward of each episode, in order; empty
            when ``num_episodes`` is 0.
        """
        obs_dim = self.env.observation_space.shape[0]
        rewards = []

        for _ in range(num_episodes):
            # States are handled as (1, obs_dim) row vectors.
            state = self.env.reset().reshape(1, obs_dim)
            total_reward = 0

            for _ in range(self.max_timesteps):
                action = agent.select_action(state, do_train)

                # execute the selected action
                next_state, reward, done, _ = self.env.step(action)
                next_state = next_state.reshape(1, obs_dim)

                # record the results of the step
                if do_train:
                    agent.record(state, action, reward, next_state, done)

                total_reward += reward
                state = next_state
                if done:
                    break

            rewards.append(total_reward)
            # train the agent based on a sample of past experiences
            if do_train:
                agent.replay()

        # Return only after all requested episodes have been played.
        return rewards

    def close(self):
        """Close the underlying environment."""
        self.env.close()

In [None]:
class QLearningAgent:
    """Epsilon-greedy Q-learning agent with an experience-replay memory.

    Subclasses provide the value model by overriding ``build_model``. The
    model must offer ``predict(state)`` and
    ``fit(state, target, epochs=..., verbose=...)``.
    """

    def __init__(self, state_size, action_size):
        """Set the hyperparameters and build the model.

        Args:
            state_size: Number of values in a state.
            action_size: Number of available actions.
        """
        self.state_size = state_size
        self.action_size = action_size

        # hyperparameters
        self.gamma = 0.95  # discount rate on future rewards
        self.epsilon = 1.0  # exploration rate
        self.epsilon_decay = 0.995  # the decay of epsilon after each training batch
        self.epsilon_min = 0.1  # the minimum exploration rate permissible
        self.batch_size = 32  # maximum size of the batches sampled from memory

        # agent state
        self.model = self.build_model()
        self.memory = deque(maxlen=2000)

    # NOTE: this class does not derive from abc.ABC, so the decorator is not
    # enforced; instantiating the base class directly yields ``model = None``.
    @abc.abstractmethod
    def build_model(self):
        """Return the model used to estimate action values."""
        return None

    def select_action(self, state, do_train=True):
        """Pick an action for ``state``.

        While training, a uniformly random action is chosen with probability
        ``epsilon``; otherwise the action with the highest predicted value
        is returned.
        """
        if do_train and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return np.argmax(self.model.predict(state)[0])

    def record(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def replay(self):
        """Fit the model on a random minibatch of recorded transitions.

        For each sampled transition the target of the taken action is the
        reward, plus the discounted maximum predicted value of the next
        state unless the transition ended the episode. Afterwards epsilon is
        decayed, but never below ``epsilon_min``.

        Returns:
            0 when the memory holds fewer than ``batch_size`` transitions
            (nothing is trained); otherwise ``None``.
        """
        if len(self.memory) < self.batch_size:
            return 0

        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0]))
            # Only the taken action's value is moved towards the target; the
            # other outputs keep their current predictions.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step cannot push epsilon below the minimum.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

In [None]:
class CartPoleAgent(QLearningAgent):
    """Q-learning agent for CartPole: 4 state values, 2 actions."""

    def __init__(self):
        super().__init__(state_size=4, action_size=2)

    def build_model(self):
        """Build and compile the Q-value network.

        Two hidden ReLU layers with 12 units each feed a 2-unit output
        layer (one value per action); the network is compiled with Adam
        (learning rate 1e-3) and mean squared error.
        """
        network = Sequential([
            Dense(12, activation='relu', input_dim=4),
            Dense(12, activation='relu'),
            Dense(2),
        ])
        network.compile(optimizer=Adam(lr=1e-3), loss='mse')

        # load the weights of the model if reusing previous training session
        #network.load_weights("models/cartpole-v0.h5")
        return network

#### Deep-Q-Network Agent

In [None]:
# Train a CartPoleAgent on CartPole-v0 until it passes the staged evaluation,
# then store the learned weights.
# NOTE: the runner is deliberately not bound to the name `gym`; doing so would
# rebind the imported `gym` module and break any later `gym.make(...)` call.
runner = GymRunner('CartPole-v0')
try:
    agent = CartPoleAgent()
    runner.train_until_solved(agent)
    # Make sure the target directory exists before writing the weights.
    os.makedirs("models", exist_ok=True)
    agent.model.save_weights("models/dqn-cartpole-v0.h5", overwrite=True)
finally:
    # Release the environment even if training or saving fails.
    runner.close()