In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
seed(1)

In [2]:
class linear_layer(nn.Module):
    """Fully connected actor-critic network with a shared trunk and two heads.

    Three ReLU-activated hidden layers feed two linear heads:

    * ``actor``  -- ``linear_out`` raw, un-normalised scores (one per action);
    * ``critic`` -- a single state-value estimate.

    The module creates its own Adam optimizer over all parameters and moves
    itself to CUDA when available, otherwise it stays on the CPU.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_size: Number of features in the (flattened) input.
        linear_out: Number of outputs produced by the actor head.
    """

    def __init__(self, lr, input_size, linear_out):
        super(linear_layer, self).__init__()
        self.input_size = input_size
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc = 1000  # width of every hidden layer
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Shared trunk.
        self.fc   = nn.Linear(self.input_size, self.n_hidden_fc)
        self.fc1  = nn.Linear(self.n_hidden_fc, self.n_hidden_fc)
        self.fc2  = nn.Linear(self.n_hidden_fc, self.n_hidden_fc)

        # Output heads.
        self.critic = nn.Linear(self.n_hidden_fc, 1)
        self.actor  = nn.Linear(self.n_hidden_fc, self.linear_out)

        self.optimizer = optim.Adam(params=self.parameters(), lr=lr)
        self.to(self.device)

    def forward(self, x):
        """Run the trunk and both heads.

        Args:
            x: Tensor whose last dimension has ``input_size`` features.

        Returns:
            Tuple ``(actor_, critic_)``: the raw actor scores and the value
            estimate, both on ``self.device``.
        """
        # Move the input once; every later activation is already on the
        # module's device, so no further transfers are needed.
        x = x.to(self.device)
        out = F.relu(self.fc(x))
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))

        critic_ = self.critic(out)  # state-value estimate
        actor_  = self.actor(out)   # un-normalised action scores (no activation applied)
        return actor_, critic_

In [3]:
class Agent():
    """One-step (TD(0)) actor-critic agent over a discrete set of actions.

    The agent owns a ``linear_layer`` network whose actor head scores each
    action and whose critic head estimates the state value. Call
    ``select_action`` to sample an action, then ``learn`` with the resulting
    transition; ``learn`` consumes the log-probability stored by the most
    recent ``select_action`` call.

    Args:
        lr: Learning rate for the network's optimizer.
        input_dims: Number of features in a flattened state.
        actions: Number of discrete actions (actor outputs).
        gamma: Discount factor applied to the next-state value.
    """

    def __init__(self, lr, input_dims, actions, gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.input_dims = input_dims
        self.actions = actions
        self.actor_critic_network = linear_layer(self.lr, self.input_dims, self.actions)
        # Log-probability of the last sampled action; consumed by learn().
        self.log_prob_action = None

    def select_action(self, state):
        """Sample an action index from the policy for ``state``.

        The state is flattened to one dimension, so the actor output is a 1-D
        vector of scores that is normalised into a categorical distribution.
        The log-probability of the sampled action is kept (with its autograd
        graph) in ``self.log_prob_action``.

        Returns:
            The sampled action index as a Python int.
        """
        state = state.flatten().to(self.actor_critic_network.device)
        actor_, _ = self.actor_critic_network(state)
        # Passing the raw scores as logits applies the same softmax
        # normalisation in a numerically stable way.
        cat_distr = torch.distributions.Categorical(logits=actor_)
        action_taken = cat_distr.sample()
        self.log_prob_action = cat_distr.log_prob(action_taken)
        return action_taken.item()

    def learn(self, state, reward, next_state, done=False):
        """Apply one actor-critic update from a single transition.

        Args:
            state: State in which the last action was selected.
            reward: Reward received for that action (number or array-like).
            next_state: State observed after the action.
            done: Set to True when ``next_state`` is terminal, so that no
                value is bootstrapped from it. Defaults to False.

        Raises:
            RuntimeError: If no action has been selected since the last
                update (there is no log-probability to learn from).
        """
        if self.log_prob_action is None:
            raise RuntimeError("select_action() must be called before learn()")

        network = self.actor_critic_network
        network.optimizer.zero_grad()  # do not carry gradients over from the previous step

        state = state.flatten().to(network.device)
        next_state = next_state.flatten().to(network.device)
        reward = torch.as_tensor(reward, dtype=torch.float32, device=network.device)

        _, current_state_critic = network(state)
        # The bootstrap target is treated as a constant (semi-gradient TD):
        # only the current-state value is differentiated.
        with torch.no_grad():
            _, next_state_critic = network(next_state)
        if done:
            next_state_critic = torch.zeros_like(next_state_critic)

        td_error = (reward + self.gamma * next_state_critic) - current_state_critic

        # The TD error only weights the policy gradient; detaching it keeps
        # the actor loss from pushing gradients into the critic.
        actor_loss = -self.log_prob_action * td_error.detach()
        critic_loss = td_error ** 2.0

        (critic_loss + actor_loss).sum().backward()
        network.optimizer.step()

        # The stored log-probability's graph has been consumed by backward().
        self.log_prob_action = None
        

In [4]:
def running_average(scores, window=100):
    """Return the trailing mean of ``scores`` over at most ``window`` entries.

    Element ``i`` of the result is the mean of the last ``window`` scores
    ending at (and including) index ``i``; early elements average over the
    scores available so far.
    """
    averages = np.zeros(len(scores))
    for i in range(len(averages)):
        # i - window + 1 keeps exactly `window` items once enough scores exist.
        averages[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    return averages


def plot_running_curve(scores, x, figure_file):
    """Plot the 100-score running average of ``scores`` and save the figure.

    Args:
        scores: Sequence of per-episode scores.
        x: X-axis values, one per score.
        figure_file: Path the figure is written to.
    """
    running_avg = running_average(scores, window=100)
    # A dedicated figure avoids drawing on top of whatever pyplot figure
    # happens to be current.
    fig, ax = plt.subplots()
    ax.plot(x, running_avg)
    ax.set_title('Running average of previous 100 scores')
    ax.set_ylabel('Average score')
    fig.savefig(figure_file)

In [5]:
class EnvManager():
    """Wrap a gym environment and expose rendered-frame differences as states.

    States are built from rendered frames rather than the environment's own
    observations: each frame is converted to grayscale, cropped, resized to
    40x90 and returned as a ``(1, 1, 40, 90)`` tensor on ``device``. The state
    handed to the agent is the difference between consecutive frames.

    Args:
        device: Torch device that processed frames are moved to.
        environment: Environment id passed to ``gym.make``.
    """

    def __init__(self, device, environment):
        self.device = device
        self.env = gym.make(environment)
        self.env.reset()
        self.current_screen = None  # last processed frame; None until the first get_state()
        self.done = False           # done flag reported by the most recent step

    def reset(self):
        """Reset the environment and forget the previous frame and done flag."""
        self.env.reset()
        self.current_screen = None
        self.done = False

    def close(self):
        """Close the underlying environment."""
        self.env.close()

    def render(self, mode='human'):
        """Render the environment in ``mode`` and return whatever gym returns."""
        return self.env.render(mode)

    def num_actions_available(self):
        """Return the environment's ``action_space`` object (not a count)."""
        return self.env.action_space

    def take_action(self, action):
        """Step the environment with ``action`` and return the reward.

        The action is wrapped in a single-element list before being passed to
        ``env.step``, so callers should pass the bare action value. The done
        flag returned by the step is stored in ``self.done``.
        """
        _, reward, self.done, _ = self.env.step([action])
        return reward

    def just_starting(self):
        """Return True when no frame has been captured since the last reset."""
        return self.current_screen is None

    def get_state(self):
        """Return the current state as a frame difference.

        At the start of an episode, or once the episode is done, the current
        frame is captured and an all-zero tensor of the same shape is
        returned. Otherwise the result is the new frame minus the previous one.
        """
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            return torch.zeros_like(self.current_screen)

        previous_screen = self.current_screen
        latest_screen = self.get_processed_screen()
        self.current_screen = latest_screen
        return latest_screen - previous_screen

    def get_screen_height(self):
        """Height of a processed frame (dimension 2 of the BCHW tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[2]

    def get_screen_width(self):
        """Width of a processed frame (dimension 3 of the BCHW tensor)."""
        screen = self.get_processed_screen()
        return screen.shape[3]

    def get_processed_screen(self):
        """Render an RGB frame and return it grayscaled, cropped and resized."""
        # Render through this instance rather than a module-level name, so the
        # manager works regardless of what variable it is bound to.
        screen = self.render('rgb_array')
        rgb_weights = [0.2989, 0.5870, 0.1140]
        # Weighted sum over the colour channels gives a 2-D (H, W) image.
        grayscale_image = np.dot(screen[..., :3], rgb_weights)
        screen = self.crop_screen(grayscale_image)
        return self.transform_screen_data(screen)

    def crop_screen(self, screen):
        """Keep the central 60% of ``screen`` along both height and width."""
        screen_height = screen.shape[0]
        screen_width = screen.shape[1]

        # Strip off the top and bottom 20%.
        top = int(screen_height * 0.2)
        bottom = int(screen_height * 0.8)

        # Strip off the left and right 20%.
        left = int(screen_width * 0.2)
        right = int(screen_width * 0.8)

        return screen[top:bottom, left:right]

    def transform_screen_data(self, screen):
        """Scale, resize and batch a 2-D frame into a tensor on ``self.device``."""
        # Convert to float32, divide by 255, convert to a tensor.
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)

        # Use torchvision to resize the frame to a fixed 40x90.
        resize = T.Compose([
            T.ToPILImage(),
            T.Resize((40, 90)),
            T.ToTensor(),
        ])

        # unsqueeze(0) adds the batch dimension (BCHW).
        return resize(screen).unsqueeze(0).to(self.device)

In [6]:
def set_action_bins(bin_size,min_val,max_val):
    """Return ``bin_size`` evenly spaced action values running from ``min_val`` to ``max_val``."""
    return np.linspace(start=min_val, stop=max_val, num=bin_size)

# Discrete action grid used by the agent: 100 values between -2 and 2.
action_bin_array = set_action_bins(bin_size=100, min_val=-2, max_val=2)

In [7]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [8]:
# Environment wrapper shared by the cells below.
em = EnvManager(device=device, environment='Pendulum-v0')

In [None]:
LEARNING_RATE = 5e-6
N_EPISODES = 2000

output_num = len(action_bin_array)
# The network input is a flattened processed frame: width * height features.
agent = Agent(LEARNING_RATE, em.get_screen_width()*em.get_screen_height(), output_num)
scores = []
for episode in range(N_EPISODES):
    em.reset()
    state = em.get_state()
    score = 0
    for timestep in count():
        action_idx = agent.select_action(state)
        # take_action wraps the action in a list before stepping the
        # environment, so pass the bare value rather than another list.
        reward     = em.take_action(action_bin_array[action_idx])
        score+=reward
        next_state = em.get_state()
        agent.learn(state, reward, next_state)
        state = next_state
        if em.done:
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('episode: ', episode, 'score%.2f '% score, 'avg_scr %.2f'%avg_score)
            break
# One x value per recorded episode, numbered from 1.
x = [i + 1 for i in range(len(scores))]
fname = 'graph'
save = fname + '.png'
plot_running_curve(scores,x,save)
em.close()

episode:  0 score-880.51  avg_scr -880.51
episode:  1 score-1152.16  avg_scr -1016.33
episode:  2 score-1524.04  avg_scr -1185.57
episode:  3 score-903.53  avg_scr -1115.06
episode:  4 score-1676.47  avg_scr -1227.34
episode:  5 score-1840.08  avg_scr -1329.46
episode:  6 score-1852.89  avg_scr -1404.24
episode:  7 score-1414.17  avg_scr -1405.48
episode:  8 score-1299.71  avg_scr -1393.73
episode:  9 score-885.93  avg_scr -1342.95
episode:  10 score-1172.69  avg_scr -1327.47
episode:  11 score-1730.95  avg_scr -1361.09
episode:  12 score-969.97  avg_scr -1331.01
episode:  13 score-1198.52  avg_scr -1321.54
episode:  14 score-968.58  avg_scr -1298.01
episode:  15 score-1164.88  avg_scr -1289.69
episode:  16 score-1532.03  avg_scr -1303.95
episode:  17 score-968.98  avg_scr -1285.34
episode:  18 score-1586.03  avg_scr -1301.16
episode:  19 score-1070.15  avg_scr -1289.61
episode:  20 score-1389.70  avg_scr -1294.38
episode:  21 score-1687.41  avg_scr -1312.24
episode:  22 score-867.72  

episode:  182 score-1435.82  avg_scr -1256.31
episode:  183 score-1636.78  avg_scr -1258.11
episode:  184 score-1203.57  avg_scr -1257.49
episode:  185 score-907.98  avg_scr -1253.10
episode:  186 score-1751.01  avg_scr -1254.52
episode:  187 score-1449.12  avg_scr -1254.18
episode:  188 score-1079.78  avg_scr -1248.52
episode:  189 score-1611.60  avg_scr -1253.91
episode:  190 score-891.10  avg_scr -1250.08
episode:  191 score-865.47  avg_scr -1245.69
episode:  192 score-892.74  avg_scr -1241.95
episode:  193 score-1666.02  avg_scr -1247.25
episode:  194 score-979.05  avg_scr -1240.02
episode:  195 score-816.51  avg_scr -1238.90
episode:  196 score-972.52  avg_scr -1236.11
episode:  197 score-1359.69  avg_scr -1239.95
episode:  198 score-1426.76  avg_scr -1242.29
episode:  199 score-1450.81  avg_scr -1246.29
episode:  200 score-746.00  avg_scr -1244.32
episode:  201 score-1015.84  avg_scr -1241.08
episode:  202 score-977.91  avg_scr -1233.57
episode:  203 score-1332.23  avg_scr -1232.

In [None]:
# Close the environment by hand (e.g. after interrupting the training cell
# before it reached its own em.close()).
em.close()