# Reinforcement Learning

## Some Common Terms

* Agent
* Environment
* Actions, Rewards, Observations

# 1. Interacting with the Gym API

In [1]:
import gym

In [2]:
# Create the CartPole-v0 environment; `env` is reused by the cells below.
env = gym.make('CartPole-v0')

### Each environment comes with certain important methods/attributes

* action_space
* observation_space
* reset() : returns init state and also resets the environment
* step()
* render()

In [3]:
def RenderGame(e, total_episodes=20, max_steps=50):
    """Play one episode on the global `env` using sampled actions.

    e              -- episode number shown in the printed summary
    total_episodes -- episode count shown after the slash in the summary
    max_steps      -- maximum number of steps played before giving up

    Prints the episode number together with a score. The score is the index
    of the step on which the environment reported `done`, or `max_steps`
    when the episode was still running once the step budget was used up.
    Returns None.
    """
    summary = "Game Episode : {}/{} \nHigh Score : {}"
    env.reset()  # start from a fresh initial state
    for t in range(max_steps):
        env.render()
        action = env.action_space.sample()
        # Only the termination flag matters for a sampled-action rollout.
        _, _, done, _ = env.step(action)
        if done:
            print(summary.format(e, total_episodes, t))
            break
    else:
        # The loop ended without `done`; still report the episode so that
        # every call produces exactly one summary.
        print(summary.format(e, total_episodes, max_steps))

In [4]:
# Play a single episode (labelled episode 1), then close the environment.
RenderGame(1)
env.close()

Game Episode : 1/20 
High Score : 9


In [5]:
# Inspect the environment's action space.
env.action_space

Discrete(2)

In [6]:
# Draw one action from the action space.
env.action_space.sample()

0

In [7]:
# Inspect the environment's observation space.
env.observation_space

Box(4,)

In [8]:
# Read the `n` attribute of the action space.
env.action_space.n

2

## 2. Playing Games with a Random Strategy
* Game Episode
* Step() Function in More Detail
* Game Over?

In [9]:
# Play twenty sampled-action episodes back to back, numbered from 1,
# then release the environment.
total_games = 20
for game_no in range(1, total_games + 1):
    RenderGame(game_no)
env.close()
print("Game Over!!")

Game Episode : 1/20 
High Score : 34
Game Episode : 2/20 
High Score : 14
Game Episode : 3/20 
High Score : 13
Game Episode : 4/20 
High Score : 16
Game Episode : 5/20 
High Score : 33
Game Episode : 6/20 
High Score : 33
Game Episode : 7/20 
High Score : 19
Game Episode : 8/20 
High Score : 19
Game Episode : 9/20 
High Score : 20
Game Episode : 10/20 
High Score : 22
Game Episode : 11/20 
High Score : 8
Game Episode : 12/20 
High Score : 10
Game Episode : 13/20 
High Score : 31
Game Episode : 14/20 
High Score : 21
Game Episode : 15/20 
High Score : 13
Game Episode : 16/20 
High Score : 13
Game Episode : 17/20 
High Score : 15
Game Episode : 18/20 
High Score : 31
Game Episode : 19/20 
High Score : 10
Game Episode : 20/20 
High Score : 17
Game Over!!


# 3. Q-Learning
### Designing an AI Agent

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
plt.style.use('seaborn')

Using TensorFlow backend.


In [15]:
class Agent:
    """Deep Q-learning agent with a replay memory.

    A small fully connected Keras network maps a state to one estimated
    Q-value per action. Actions are chosen epsilon-greedily and the network
    is fitted on transitions sampled at random from a bounded memory.
    """

    def __init__(self,state_size,action_size):
        """Store the hyper-parameters and build the Q-network.

        state_size  -- number of input features of the network
        action_size -- number of actions (and network outputs)
        """
        self.state_size = state_size
        self.action_size = action_size
        # Bounded replay memory: once 2000 transitions are stored, appending
        # a new one drops the oldest.
        self.memory = deque(maxlen = 2000)
        self.gamma = 0.95  # discount applied to the best next-state Q-value
        # Exploration vs Exploitation trade-off
        # Exploration  : good in the beginning --> lets the agent try random actions
        # Exploitation : reuse what was learned from past experience --> good in the end
        self.epsilon = 1.0            # probability of taking a random action
        self.epsilon_decay = 0.995    # multiplier applied to epsilon per train() call
        self.learning_rate = 0.001
        self.epsilon_min = 0.01       # epsilon stops decaying once it is not above this
        self.model = self.create_model()

    def create_model(self):
        """Build and compile the Q-network.

        The input width and the number of outputs follow `self.state_size`
        and `self.action_size`, so the network matches whatever sizes the
        agent was constructed with.
        """
        model = Sequential()
        model.add(Dense(24,input_dim = self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        # Linear outputs: one unbounded Q-value estimate per action.
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Return an action index for `state` using an epsilon-greedy rule.

        With probability `self.epsilon` a random action index is returned;
        otherwise the index of the largest predicted Q-value is returned.
        """
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        # Greedy choice: the model returns a batch, row 0 holds the Q-values.
        return np.argmax(self.model.predict(state)[0])

    def train(self,batch_size=32):
        """Fit the network on `batch_size` transitions sampled from memory.

        Does nothing (and leaves epsilon untouched) when the memory holds
        fewer than `batch_size` transitions. After a training pass, epsilon
        is multiplied by `self.epsilon_decay` while it is above
        `self.epsilon_min`.
        """
        # random.sample raises ValueError when asked for more items than exist.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            target = reward
            if not done:
                # Bootstrapped target: reward plus discounted best next Q-value.
                target = reward + self.gamma * np.max(self.model.predict(next_state)[0])

            # Keep the current predictions for the other actions and only
            # overwrite the entry of the action that was actually taken.
            y = self.model.predict(state)
            y[0][action] = target

            self.model.fit(x=state,y=y,epochs = 1,verbose=0)

        if self.epsilon>self.epsilon_min:
            self.epsilon*=self.epsilon_decay

    def load(self,name):
        """Load network weights from the file `name` via `load_weights`."""
        self.model.load_weights(name)

    def save(self,name):
        """Write the model to the file `name` via `model.save`."""
        # NOTE(review): save() uses model.save while load() uses load_weights;
        # confirm the two are compatible for the file format in use.
        self.model.save(name)

## *Training the DQN Agent (Deep Q-Learner)*

In [19]:
n_episodes = 1000  # number of training episodes to run
output_dir = "cartpole_model/"  # path prefix for the saved checkpoint files
state_size = 4  # width each observation is reshaped to before reaching the agent
action_size = 2  # number of actions passed to the Agent constructor
batch_size = 32  # transitions sampled per agent.train() call

In [20]:
# Build the agent from the state/action sizes configured above.
agent = Agent(state_size=state_size,action_size=action_size)

In [None]:
# Checkpoints are written under output_dir, so make sure it exists first.
os.makedirs(output_dir, exist_ok=True)

for e in range(1,n_episodes+1):
    # Start a new episode; observations are reshaped to (1, state_size)
    # before they are handed to the agent.
    state = env.reset()
    state = np.reshape(state,[1,state_size])

    for t in range(500):
        env.render()
        action = agent.act(state)
        next_state,reward,done,info = env.step(action)
        # The step that ends the episode gets a reward of -10 instead.
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        # Advance to the new observation; without this every action and every
        # stored transition would be based on the episode's initial state.
        state = next_state

        if done:
            print("Game Episode : {}/{} \nHigh Score : {} \nExploration Rate : {:.2}".format(e,n_episodes,t,agent.epsilon))
            break

        # Train only once more than batch_size transitions have been stored.
        if len(agent.memory)>batch_size:
            agent.train(batch_size=batch_size)

    # Checkpoint once per 50th episode, after the episode has finished,
    # instead of rewriting the file on every step of that episode.
    if e%50==0:
        agent.save(output_dir+"weights_{:04d}".format(e)+".hdf5")

print("Deep Q-Learning Model Trained")
env.close()

Game Episode : 1/1000 
High Score : 20 
Exploration Rate : 1.0
Game Episode : 2/1000 
High Score : 29 
Exploration Rate : 0.91
Game Episode : 3/1000 
High Score : 17 
Exploration Rate : 0.84
Game Episode : 4/1000 
High Score : 18 
Exploration Rate : 0.77
Game Episode : 5/1000 
High Score : 12 
Exploration Rate : 0.72
Game Episode : 6/1000 
High Score : 25 
Exploration Rate : 0.64
Game Episode : 7/1000 
High Score : 10 
Exploration Rate : 0.61
Game Episode : 8/1000 
High Score : 10 
Exploration Rate : 0.58
Game Episode : 9/1000 
High Score : 14 
Exploration Rate : 0.54
Game Episode : 10/1000 
High Score : 8 
Exploration Rate : 0.52
Game Episode : 11/1000 
High Score : 14 
Exploration Rate : 0.48
Game Episode : 12/1000 
High Score : 21 
Exploration Rate : 0.43
Game Episode : 13/1000 
High Score : 8 
Exploration Rate : 0.42
Game Episode : 14/1000 
High Score : 12 
Exploration Rate : 0.39
Game Episode : 15/1000 
High Score : 13 
Exploration Rate : 0.37
Game Episode : 16/1000 
High Score : 