# OpenAI Gym BipedalWalker-v2: Deep Q-Network Learning


In [1]:
from itertools import count
import numpy as np
import random
import math

import gym

import torch, torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Pick the tensor constructors that match the available device, so the rest of
# the notebook can build tensors without caring where they live.
use_gpu = torch.cuda.is_available()
print("Using GPU" if use_gpu else "Using CPU")

# The CUDA constructors are only looked up when a GPU is actually present.
LongTensor = torch.cuda.LongTensor if use_gpu else torch.LongTensor
FloatTensor = torch.cuda.FloatTensor if use_gpu else torch.FloatTensor

Using CPU


# Setting and Running Model with Environment

In [2]:
# --- Training schedule ---
NUM_EPISODES = 1000          # number of episodes run by the training loop
FALL_TIME = 30               # initial steps per episode with no learned action and no stored transition

# --- Rendering geometry (not referenced by the cells visible in this notebook) ---
SCREEN_WIDTH = 400
SCREEN_LENGTH = 600
WINDOW_MAX_Y = 300
WINDOW_MIN_Y = 200

# --- Learning hyper-parameters ---
BUFFER_SIZE = 2 ** 16        # maximum number of transitions kept in the replay buffer
GAMMA = 0.999                # discount applied to the bootstrapped next-state value
START_EXPLORE_RATIO = 0.7    # probability of a random action at the first episode
END_EXPLORE_RATIO = 0.05     # floor below which the exploration ratio stops decaying
NUM_FEATURES = 24            # length of the state vector fed to the network


class DQN(nn.Module):
    """Fully connected Q-network over the NUM_FEATURES-dimensional state.

    Two tanh hidden layers (400 and 300 units) feed a linear head with 16
    outputs, one Q-value per discrete action index. The network owns its
    RMSprop optimizer; each call to ``update`` performs at most one
    optimisation step on a batch drawn from a transition buffer.
    """

    def __init__(self):
        super(DQN, self).__init__()
        self.hidden1 = nn.Linear(NUM_FEATURES, 400)
        self.hidden2 = nn.Linear(400, 300)
        self.output = nn.Linear(300, 16)

        # Weights initialization: zero biases, weights ~ N(0, sqrt(2 / fan_in)).
        for m in self.modules():
            if isinstance(m, nn.Linear):
                # nn.Linear keeps its weight as (out_features, in_features),
                # so dimension 1 is the fan-in. Reading the size directly
                # (instead of indexing row 1) also works for 1-output layers.
                n = m.weight.data.size(1)
                m.weight.data.normal_(0, np.sqrt(2. / n))
                m.bias.data.zero_()

        self.optimizer = optim.RMSprop(self.parameters(), lr=0.0001)

    def forward(self, x):
        """Return the Q-values for state(s) ``x`` (last dimension NUM_FEATURES)."""
        x = F.tanh(self.hidden1(x))
        x = F.tanh(self.hidden2(x))
        return self.output(x)

    def update(self, transition_buffer):
        """Run one Q-learning step on a batch sampled from ``transition_buffer``.

        Args:
            transition_buffer: object whose ``get_batch()`` returns either
                ``None`` (not enough data yet) or four parallel sequences
                ``(states, actions, rewards, next_states)``. A ``None`` entry
                in ``next_states`` marks a terminal transition.

        Returns:
            None. The network parameters are updated in place; nothing happens
            when no batch is available.
        """
        transition_batch = transition_buffer.get_batch()
        if transition_batch is None:
            return

        states, actions, rewards, next_states = transition_batch
        # Derive the batch size from the data rather than assuming 32.
        batch_size = len(states)

        # Positions in the batch whose successor state exists (non-terminal).
        non_final_idx = [i for i, state in enumerate(next_states) if state is not None]

        states = Variable(torch.cat(list(states)).view(-1, NUM_FEATURES))
        # Flatten each action first so both 0-dim and 1-element tensors concatenate.
        actions = Variable(torch.cat([a.view(-1) for a in actions]).view(-1, 1).type(LongTensor))
        rewards = Variable(torch.cat(list(rewards)))

        # Q(s, a) for the actions actually taken, flattened to (batch,) so it
        # has the same shape as the target; (batch, 1) vs (batch,) would
        # broadcast inside the loss on current PyTorch versions.
        q_values = self.forward(states).gather(1, actions).view(-1)

        # Terminal transitions keep a next-state value of zero.
        next_state_values = Variable(torch.zeros(batch_size).type(FloatTensor))
        if non_final_idx:  # torch.cat([]) raises, so skip when every transition is terminal
            non_final_mask = LongTensor(non_final_idx)
            non_final_next_states = Variable(
                torch.cat([next_states[i] for i in non_final_idx]).view(-1, NUM_FEATURES))
            # detach(): the bootstrap target must not receive gradients.
            next_state_values[non_final_mask] = \
                self.forward(non_final_next_states).max(1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + rewards

        # Compute Huber loss
        loss = F.smooth_l1_loss(q_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # Clip the gradients of *this* network (not of a global `model`).
        for param in self.parameters():
            if param.grad is not None:
                param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

class TransitionBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling."""

    def __init__(self):
        self.batch_size = 32   # number of transitions returned by get_batch
        self.buffer = []       # oldest transition sits at index 0

    def push(self, transition):
        """Append ``transition``; drop the oldest entry once BUFFER_SIZE is exceeded."""
        self.buffer.append(transition)
        overflowed = len(self.buffer) > BUFFER_SIZE
        if overflowed:
            del self.buffer[0]

    def get_batch(self):
        """Return a random batch transposed into per-field tuples.

        Returns ``None`` while fewer than ``batch_size`` transitions are stored.
        Otherwise the result is a list with one tuple per transition field
        (e.g. all states, all actions, ...), each of length ``batch_size``.
        """
        if len(self.buffer) < self.batch_size:
            return None
        sampled = random.sample(self.buffer, self.batch_size)
        # zip(*rows) turns a list of transitions into a list of columns.
        return list(zip(*sampled))
        

## Execute and Train Model

In [3]:
def get_action(model, state, explore_ratio, randomization=True):
    """Choose a discrete action index with an epsilon-greedy policy.

    Args:
        model: callable mapping ``state`` to a 1-D collection of Q-values.
        state: network input for the current state.
        explore_ratio: probability of picking a uniformly random action.
        randomization: when False, exploration is disabled and the greedy
            action is always returned.

    Returns:
        A one-element integer tensor (shape ``(1,)``) holding the chosen
        action index, for both the random and the greedy branch.
    """
    # Draw first so the random stream is consumed identically whether or not
    # exploration is enabled.
    chance = random.random()
    if chance < explore_ratio and randomization:
        return LongTensor([random.randint(0, 15)])

    q_values = model(state)
    # max(0)[1] is the argmax index. Newer PyTorch returns it as a 0-dim
    # tensor, which torch.cat cannot concatenate later; view(-1) normalises it
    # to shape (1,), matching the exploration branch.
    return q_values.max(0)[1].data.view(-1)


def get_action_vec(action_ind):
    """Decode an action index into a vector of -1/+1 entries.

    The index is written as a zero-padded binary string of at least four
    digits (most significant bit first); every 0 bit becomes -1 and every
    1 bit becomes +1.
    """
    bit_string = '{0:04b}'.format(action_ind)
    # Map each bit b in {0, 1} to 2*b - 1 in {-1, +1}.
    signs = [2 * int(bit) - 1 for bit in bit_string]
    return np.array(signs)


def get_decay_ratio():
    """Return the per-episode multiplicative factor for the exploration ratio.

    Multiplying START_EXPLORE_RATIO by this factor once per episode reaches
    END_EXPLORE_RATIO after NUM_EPISODES / 1.5 episodes.
    """
    total_shrink = END_EXPLORE_RATIO / START_EXPLORE_RATIO
    exponent_per_episode = 1.5 / NUM_EPISODES
    return math.pow(total_shrink, exponent_per_episode)

In [None]:
%run ACPreTrain.py
# Training driver: builds the environment, network and replay buffer, then runs
# NUM_EPISODES episodes, performing one model update per environment step.
# NOTE(review): `.unwrapped` returns the base environment without gym's
# wrappers; presumably this also removes the episode step limit — confirm.
env = gym.make('BipedalWalker-v2').unwrapped
env.reset()
model = DQN()

if use_gpu:
    model.cuda()

transition_buffer = TransitionBuffer()

# Per-episode statistics, indexed by episode number.
reward_his = np.zeros(NUM_EPISODES)
steps_his = np.zeros(NUM_EPISODES)
distance_his = np.zeros(NUM_EPISODES)
velocity_his = np.zeros(NUM_EPISODES)

# NOTE(review): allocated but never read or written in the visible cells.
min_max_states = np.zeros((NUM_FEATURES, 2))

# The exploration ratio starts at START_EXPLORE_RATIO and is multiplied by a
# constant factor at the end of each episode (see the bottom of the loop).
explore_ratio = START_EXPLORE_RATIO
explore_decay_ratio = get_decay_ratio()

for episode in range(NUM_EPISODES):
    # The observation returned by reset() is discarded; the episode starts
    # from an all-zero state vector instead (see current_state below).
    env.reset()
    # Random action vector. It is what gets applied during the first FALL_TIME
    # steps, because that branch below never reassigns action_vec.
    action_vec = env.action_space.sample()

    current_state = FloatTensor(np.zeros(NUM_FEATURES))

    for i in count():
        #env.render(mode='rgb_array')

        if i < FALL_TIME:
               # Placeholder index only: transitions from this phase are not
               # stored and action_vec keeps its sampled value.
               action_ind = 8

        else:
            # np.mod(episode, 50) is 0 on every 50th episode, so those
            # episodes run with exploration switched off.
            randomization = bool(np.mod(episode, 50))
            action_ind = get_action(model, Variable(current_state, volatile=True), explore_ratio, randomization)
            action_vec = get_action_vec(int(action_ind.cpu().numpy()))

        obs, reward, done, info = env.step(action_vec)

        # Accumulates obs[2] every step. NOTE(review): presumably the hull's
        # horizontal velocity — confirm against the BipedalWalker observation layout.
        distance_his[episode] += obs[2]

        # `is False` / `is True` are identity checks, so this relies on `done`
        # being a built-in bool. The reward of the terminal step is not added
        # to reward_his.
        if done is False:
            next_state = FloatTensor(obs[:NUM_FEATURES])
            reward_his[episode] += reward
        else:
            # None marks a terminal transition for DQN.update.
            next_state = None

        # Only transitions produced by get_action are stored for learning.
        if i >= FALL_TIME:
            transition_buffer.push([current_state, action_ind, FloatTensor([reward]), next_state])

        current_state = next_state
        
        #update model
        # Called on every step, including the first FALL_TIME steps; update()
        # returns immediately while the buffer cannot yet supply a batch.
        model.update(transition_buffer)

        if done is True:
            # i is the zero-based index of the final step of the episode.
            steps_his[episode] = i
            velocity_his[episode] = distance_his[episode]/i
            print("Episode", episode, ", steps = ", i,
                    ", total reward:", reward_his[episode],
                    ", steps_avg:", np.mean(steps_his[:episode+1]),
                    ", reward_avg:", np.mean(reward_his[:episode+1]),
                    ", distance traveled:", distance_his[episode],
                    ", average speed:", velocity_his[episode],
                    ", explore ratio:", explore_ratio)
            break

    # Decay the exploration ratio until it drops to END_EXPLORE_RATIO or below.
    if explore_ratio > END_EXPLORE_RATIO:
           explore_ratio = explore_ratio*explore_decay_ratio


Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode 0 , steps =  119 , total reward: -19.0581164122 , steps_avg: 119.0 , reward_avg: -19.0581164122 , distance traveled: 13.5595203378 , average speed: 0.113945549057 , explore ratio: 0.7
Episode 1 , steps =  78 , total reward: -7.50757806337 , steps_avg: 98.5 , reward_avg: -13.2828472378 , distance traveled: 3.06957130428 , average speed: 0.0393

## Visualization

In [None]:
# Accumulated per-episode distance statistic against the episode index.
episode_ids = np.arange(NUM_EPISODES)
plt.plot(episode_ids, distance_his)
plt.xlabel("episode id")
plt.ylabel("distance traveled")
plt.show()


In [None]:
# Per-episode average of the distance statistic per step, against the episode index.
episode_ids = np.arange(NUM_EPISODES)
plt.plot(episode_ids, velocity_his)
plt.xlabel("episode id")
plt.ylabel("avg velocity")
plt.show()