# Introduction

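Here I'm practicing with deep Q-networks (DQN), with Atari games as the eventual goal. The code below starts on the much simpler `NChain-v0` gym environment; after I get this working, I'll try switching to the Taxi scenario.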

http://adventuresinmachinelearning.com/reinforcement-learning-tutorial-python-keras/



### Setup

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import random
import keras
import time
%matplotlib inline

from keras import backend as K
# 'th' (Theano-style) ordering means channels-first. set_image_dim_ordering()
# is the Keras 1 spelling and is not available in later Keras 2 releases;
# set_image_data_format() is the Keras 2 equivalent of the same setting.
K.set_image_data_format('channels_first')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Main

In [None]:
import random
import time

import gym
import keras
import keras as ks
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras import optimizers
from keras.layers import Dense, Input, InputLayer
from keras.models import Model, Sequential
%matplotlib inline



class DQNagent(object):
    """Epsilon-greedy deep Q-learning agent for discrete states and actions.

    States are fed to the network as one-hot row vectors of shape
    (1, num_states) (see `vectorize_state`); the network outputs one Q-value
    per action.

    An *event* is a sequence ``(state, action, reward, next_state)`` with an
    optional fifth item ``done``. ``state`` and ``next_state`` are the
    vectorized (one-hot) states. When ``done`` is present and true the target
    is the bare reward, i.e. no bootstrapping from ``next_state``.
    """

    def __init__(self,num_states, num_actions):
        """Store hyper-parameters and build the Q-network.

        Args:
            num_states: number of discrete states (network input width).
            num_actions: number of discrete actions (network output width).
        """
        self.gamma = 0.99  #discount factor
        self.alpha = 0.1   #learning rate (stored only; make_model compiles with the default 'adam')
        self.epsilon = 0.5         #exploration probability at episode 0
        self.epsilon_min = 0.01    #exploration probability at the last episode
        self.memory = []           #replay buffer, oldest event first
        self.memory_size = 1000    #maximum number of events kept in memory
        self.num_states = num_states
        self.num_actions = num_actions
        self.model = self.make_model()


    def make_model(self):
        """ Instantiate neural net for predicting Q-vals using Keras.

        The input width and output width follow num_states / num_actions, and
        the batch dimension is left free so that both single-event updates
        (`learn`) and minibatch updates (`replay`) can use the same model.
        """
        model = Sequential()
        model.add(InputLayer(input_shape=(self.num_states,)))
        model.add(Dense(10, activation='sigmoid'))
        model.add(Dense(self.num_actions, activation='linear'))
        model.compile(loss='mse', optimizer='adam', metrics=['mae'])
        return model


    def vectorize_state(self,state):
        """ Given a state = 0,1,2,3, ...,  return a 1-hot vector
            since Keras works in this format.

            The result has shape (1, num_states).
        """
        return np.identity(self.num_states)[state:state+1]


    def get_epsilon_iteration(self,episode_number,num_episodes):
        """Linearly anneal epsilon from self.epsilon to self.epsilon_min.

        Args:
            episode_number: index of the current episode (0-based).
            num_episodes: total number of episodes the schedule spans.

        Returns:
            The exploration probability for this episode, never below
            self.epsilon_min. If num_episodes is not positive, self.epsilon
            is returned unchanged.
        """
        if num_episodes <= 0:
            return self.epsilon
        slope = self.epsilon_min - self.epsilon
        epsilon_effective = slope*(episode_number/(1.0*num_episodes)) + self.epsilon
        # Clamp so that episode_number > num_episodes cannot push epsilon
        # below the floor (or negative).
        return max(self.epsilon_min, epsilon_effective)


    def act(self,state,episode_number = 0, num_episodes = 1):
        """Pick an action epsilon-greedily.

        Args:
            state: vectorized state, shape (1, num_states).
            episode_number, num_episodes: position in the epsilon schedule;
                with the defaults epsilon stays at self.epsilon.

        Returns:
            Integer action index.
        """
        epsilon_effective = self.get_epsilon_iteration(episode_number, num_episodes)

        if np.random.random() < epsilon_effective:
            action = np.random.randint(self.num_actions)
        else:
            action = self.choose_best_action(state)
        return action


    def choose_best_action(self,state_vector):
        """Return the index of the action with the largest predicted Q-value."""
        Qs = self.model.predict(state_vector)
        action = int(np.argmax(Qs))
        return action


    def remember(self,event):
        """Append an event to the replay memory, evicting the oldest entries
        so that at most memory_size events are kept."""
        # `while` rather than `if`: memory_size may have been lowered after
        # events were already stored.
        while self.memory and len(self.memory) >= self.memory_size:
            self.memory.pop(0)
        self.memory.append(event)


    def _unpack_event(self, event):
        """Split an event into (state, action, reward, next_state, done).

        Accepts both 4-item events (done defaults to False) and 5-item events.
        """
        state, action, reward, next_state = event[:4]
        done = bool(event[4]) if len(event) > 4 else False
        return state, action, reward, next_state, done


    def _stack_events(self, events):
        """Stack a list of events into arrays with one row per event."""
        n = len(events)
        states = np.zeros((n, self.num_states))
        next_states = np.zeros((n, self.num_states))
        actions = np.zeros(n, dtype=int)
        rewards = np.zeros(n)
        dones = np.zeros(n)
        for i, event in enumerate(events):
            state, action, reward, next_state, done = self._unpack_event(event)
            # States arrive as (1, num_states) tensors; flatten to a row.
            states[i] = np.asarray(state).reshape(-1)
            next_states[i] = np.asarray(next_state).reshape(-1)
            actions[i] = action
            rewards[i] = reward
            dones[i] = 1.0 if done else 0.0
        return states, actions, rewards, next_states, dones


    def _q_targets(self, states, actions, rewards, next_states, dones):
        """Build the regression targets for a batch of transitions.

        Each target row is the network's current prediction for the state,
        with the entry of the action taken replaced by
        reward + gamma * max_a' Q(next_state, a') (bare reward when done).
        """
        Q_next = np.asarray(self.model.predict(next_states))
        # Copy so the in-place update never touches an array owned by the model.
        targets = np.array(self.model.predict(states), dtype=float)
        bootstrap = self.gamma * Q_next.max(axis=1) * (1.0 - dones)
        targets[np.arange(len(actions)), actions] = rewards + bootstrap
        return targets


    def replay(self,batchsize):
        """Fit the network on a random minibatch drawn from memory.

        Events are sampled with replacement, so batchsize may exceed the
        number of stored events. Does nothing when memory is empty or
        batchsize is not positive.
        """
        batchsize = int(batchsize)   # tolerate float sizes such as n / 2
        if batchsize <= 0 or not self.memory:
            return

        #create minibatch
        indices = np.random.randint(len(self.memory), size=batchsize)
        minibatch = [self.memory[i] for i in indices]

        #Two batched predictions instead of two per sampled event
        states, actions, rewards, next_states, dones = self._stack_events(minibatch)
        batch_Qs = self._q_targets(states, actions, rewards, next_states, dones)

        self.model.fit(states, batch_Qs, epochs=1,verbose=0)


    def learn(self,event):
        """Do a single online Q-learning update from one event."""
        states, actions, rewards, next_states, dones = self._stack_events([event])
        Q_target_vec = self._q_targets(states, actions, rewards, next_states, dones)
        self.model.fit(states, Q_target_vec, epochs=1,verbose=0)


    def print_Qs(self):
        """Return the (num_states, num_actions) table of predicted Q-values.

        Despite the name nothing is printed; the table is returned.
        """
        all_states = np.identity(self.num_states)
        Qs = np.array(self.model.predict(all_states), dtype=float)
        return Qs

### My code

In [None]:
# Train a DQNagent on NChain-v0 with one online Q-update per environment step
# and plot the per-episode reward.
env = gym.make('NChain-v0')
num_episodes = 50
gamma = 0.99

num_states = env.observation_space.n
num_actions = env.action_space.n
# The variable must stay named `agent`.
agent = DQNagent(num_states, num_actions)
agent.gamma = gamma
agent.memory_size = num_episodes
# Not read by DQNagent itself; kept (as an int) for a later switch to
# agent.replay(agent.batch_size).
agent.batch_size = num_episodes // 2
rList = []

t1 = time.time()
for i in range(num_episodes):

    total_reward = 0
    finished = False
    # Start from the state the environment actually reset to, so the agent's
    # view of the state and the environment agree.
    state = env.reset()
    state_vec = agent.vectorize_state(state)

    while not finished:
        # Passing the episode index lets epsilon anneal over the run.
        action = agent.act(state_vec, i, num_episodes)
        next_state, reward, finished, _ = env.step(action)
        next_state_vec = agent.vectorize_state(next_state)

        # Events hold the vectorized states, in the 4-item layout that
        # DQNagent.learn / DQNagent.replay unpack.
        event = [state_vec, action, reward, next_state_vec]
        agent.remember(event)
        agent.learn(event)

        state_vec = next_state_vec
        total_reward += reward

    # NOTE(review): this normalizes by num_episodes, not by the number of
    # steps in the episode -- confirm which average is intended.
    rList.append(total_reward / (1.0*num_episodes))
t2 = time.time()
print('took ' + str((t2-t1)/60.0) + ' mins')
plt.plot(rList)
plt.xlabel('episode')
plt.ylabel('total reward / num_episodes');

### Website code

In [None]:
from keras import Sequential
from keras.layers import Dense, InputLayer


# Q-network: one-hot state (5 entries) -> 10 sigmoid units -> 2 linear outputs
# (one Q-value per action). The batch dimension is fixed at 1.
model = Sequential([
    InputLayer(batch_input_shape=(1, 5)),
    Dense(10, activation='sigmoid'),
    Dense(2, activation='linear'),
])
model.compile(loss='mse', optimizer='adam', metrics=['mae'])


# Environment and run length
env = gym.make('NChain-v0')
num_episodes = 2000


# Training hyper-parameters
y = 0.95               # discount factor
eps = 0.5              # exploration probability, shrunk once per episode
decay_factor = 0.999
r_avg_list = []

one_hot = np.identity(5)   # slice k:k+1 is the (1, 5) one-hot input for state k

for i in range(num_episodes):
    s = env.reset()
    eps *= decay_factor
    if i % 100 == 0:
        print("Episode {} of {}".format(i + 1, num_episodes))
    r_sum = 0
    done = False
    while not done:
        s_vec = one_hot[s:s + 1]

        # Epsilon-greedy action selection
        explore = np.random.random() < eps
        if explore:
            a = np.random.randint(0, 2)
        else:
            a = np.argmax(model.predict(s_vec))

        new_s, r, done, _ = env.step(a)
        new_s_vec = one_hot[new_s:new_s + 1]

        # Target = current prediction with the taken action's entry replaced
        # by r + y * max_a' Q(new_s, a')
        target = r + y * np.max(model.predict(new_s_vec))
        target_vec = model.predict(s_vec)[0]
        target_vec[a] = target
        model.fit(s_vec, target_vec.reshape(-1, 2), epochs=1, verbose=0)

        s = new_s
        r_sum += r
    # 1000 is presumably the episode length in steps -- TODO confirm
    r_avg_list.append(r_sum / 1000)

Episode 1 of 2000
Episode 101 of 2000
Episode 201 of 2000
Episode 301 of 2000
Episode 401 of 2000
Episode 501 of 2000
Episode 601 of 2000
Episode 701 of 2000
