In [None]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
# Seed every RNG the notebook draws from. Action sampling goes through torch,
# so seeding only Python's `random` module would leave runs non-reproducible.
seed(1)
np.random.seed(1)
torch.manual_seed(1)

In [None]:
class linear_layer(nn.Module):
    """Fully connected network that carries its own Adam optimizer.

    Four ReLU-activated layers of 400 units (`fc`, `fc1`, `fc2`, `fc3`) feed a
    final linear layer (`fc4`) that emits `linear_out` raw, un-normalised scores.

    Args:
        lr: learning rate handed to the Adam optimizer.
        input_size: number of input features per sample.
        linear_out: number of output units.
    """

    def __init__(self, lr, input_size, linear_out):
        super().__init__()
        self.input_size = input_size
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc = 400

        width = self.n_hidden_fc
        # Layers are created in the same order they are applied in forward().
        self.fc = nn.Linear(input_size, width)
        self.fc1 = nn.Linear(width, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, linear_out)

        # Built last so that it sees every parameter registered above.
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

    def forward(self, x):
        """Return the raw output scores for input `x`."""
        activations = x
        for hidden in (self.fc, self.fc1, self.fc2, self.fc3):
            activations = F.relu(hidden(activations))
        return self.fc4(activations)

In [None]:
class policy_gradient_agent():
    """Monte-Carlo policy-gradient (REINFORCE) agent over a discrete action set.

    During an episode the agent records the log-probability of every sampled
    action and the reward that followed it. `learn()` then performs one
    gradient step on `-sum_t G_t * log_prob_t`, where `G_t` is the discounted
    return from step `t`, and clears both memories.

    Args:
        lr: learning rate for the policy network's optimizer.
        n_features, seq_length: the policy's input size is their product.
        outputs: number of discrete actions.
        gamma: discount factor applied to future rewards.
        num_layers: accepted for interface compatibility; currently unused.
        device: torch device for the policy network. Defaults to CUDA when
            available, otherwise CPU.
    """

    def __init__(self, lr, n_features, seq_length, outputs, gamma = 0.88, num_layers = 3, device = None):
        self.lr = lr
        self.gamma = gamma
        self.reward_memory = []
        self.action_memory = []
        self.length = n_features
        self.width  = seq_length
        # Resolve the device here rather than relying on a notebook-level global.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.policy = linear_layer(lr, self.length*self.width, outputs).to(self.device)

    def select_action(self, state):
        """Sample an action index for `state` and remember its log-probability.

        Args:
            state: tensor accepted by the policy network (the last dimension
                must match the network's input size).

        Returns:
            The sampled action index as a Python int.
        """
        # Call the module (not .forward) so that nn.Module hooks still run.
        logits = self.policy(state.to(self.device))
        # Passing logits avoids softmax's deprecated implicit `dim` and is
        # numerically safer than normalising the probabilities by hand.
        action_dist = torch.distributions.Categorical(logits=logits)
        action = action_dist.sample()
        self.action_memory.append(action_dist.log_prob(action))
        return action.item()

    def store_rewards(self,reward):
        """Append the reward received for the most recent action."""
        self.reward_memory.append(reward)

    def learn(self):
        """Run one policy-gradient update from the stored episode, then clear it.

        Does nothing except clear the memories when no (reward, action) pair
        was stored.

        Raises:
            ValueError: if a stored reward is not a single scalar value.
        """
        n_rewards = len(self.reward_memory)
        # Mirrors zip(): only steps that have both a reward and a log-prob count.
        n_steps = min(n_rewards, len(self.action_memory))
        if n_steps == 0:
            self.reward_memory = []
            self.action_memory = []
            return

        # Rewards may arrive as floats or size-1 arrays; collapse to shape (n,).
        rewards = np.asarray(self.reward_memory, dtype = np.float64).reshape(n_rewards)

        # Discounted returns in one backward pass: G_t = r_t + gamma * G_{t+1}.
        G = np.zeros_like(rewards)
        running_return = 0.0
        for t in reversed(range(n_rewards)):
            running_return = rewards[t] + self.gamma * running_return
            G[t] = running_return

        log_probs = torch.stack([lp.reshape(()) for lp in self.action_memory[:n_steps]])
        G = torch.as_tensor(G[:n_steps], dtype = torch.float64, device = log_probs.device)
        loss = -(G * log_probs).sum()

        self.policy.optimizer.zero_grad()
        loss.backward()
        self.policy.optimizer.step()

        self.reward_memory = []
        self.action_memory = []
        
                
        

In [None]:
def plot_running_curve(scores, x, figure_file, window = 100):
    """Plot the trailing running average of `scores` and save the figure.

    Args:
        scores: sequence of per-episode scores.
        x: x-axis values, one per score.
        figure_file: path passed to `plt.savefig`.
        window: number of most recent scores averaged at each point
            (fewer are used until `window` scores exist).

    Raises:
        ValueError: if `window` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be at least 1')
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # i - window + 1 .. i inclusive is exactly `window` entries; the
        # previous `i - 100` start averaged 101 scores despite the title.
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous %d scores' % window)
    plt.savefig(figure_file)

In [None]:
class EnvManager():
    """Wraps a gym environment and exposes rendered frames as the agent's state.

    The state returned by `get_state` is the difference between two
    consecutive processed frames (grayscale, cropped, resized), or an all-zero
    frame at the start and at the end of an episode.

    Args:
        device: torch device that processed frames are moved to.
        environment: gym environment id passed to `gym.make`.
    """

    def __init__(self, device,environment):
        self.device = device
        self.env = gym.make(environment)
        self.env.reset()
        self.current_screen = None  # last processed frame, None before the first get_state()
        self.done = False

    def reset(self):
        """Reset the environment and forget the previous frame and done flag."""
        self.env.reset()
        self.current_screen = None
        # Without this, `done` stays True from the previous episode until the
        # next take_action() call.
        self.done = False

    def close(self):
        """Close the underlying environment."""
        self.env.close()

    def render(self, mode='human'):
        """Render the environment in `mode` and return whatever gym returns."""
        return self.env.render(mode)

    def num_actions_available(self):
        """Return the environment's action space object (not an integer count)."""
        return self.env.action_space

    def take_action(self, action):
        """Step the environment with `action` and return the reward.

        `action` may be a scalar or an already-wrapped sequence; either way it
        is flattened to a 1-D array, so a caller passing `[value]` no longer
        produces a doubly nested action. Updates `self.done`.
        """
        action = np.asarray(action).reshape(-1)
        _, reward, self.done, _ = self.env.step(action)
        return reward

    def just_starting(self):
        """Return True until the first frame of the episode has been captured."""
        return self.current_screen is None

    def get_state(self):
        """Return the frame difference since the last call.

        At the start of an episode, or once it is done, the current frame is
        cached and an all-zero tensor of the same shape is returned instead.
        """
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            s1 = self.current_screen
            s2 = self.get_processed_screen()
            self.current_screen = s2
            return s2 - s1

    def get_screen_height(self):
        """Return the height (dimension 2) of a processed frame."""
        screen = self.get_processed_screen()
        return screen.shape[2]

    def get_screen_width(self):
        """Return the width (dimension 3) of a processed frame."""
        screen = self.get_processed_screen()
        return screen.shape[3]

    def get_processed_screen(self):
        """Render an RGB frame and return it grayscale, cropped and resized."""
        # Render through this instance; going through the notebook-global `em`
        # breaks any manager that is not bound to that exact name.
        screen = self.render('rgb_array')
        rgb_weights = [0.2989, 0.5870, 0.1140]
        # Weighted sum over the colour axis gives a 2-D (height, width) image.
        grayscale_image = np.dot(screen[...,:3], rgb_weights)
        screen = self.crop_screen(grayscale_image)
        return self.transform_screen_data(screen)

    def crop_screen(self, screen):
        """Keep the central 60% of a 2-D `screen` along both axes."""
        screen_height = screen.shape[0]
        screen_width  = screen.shape[1]

        # Strip off top and bottom
        top = int(screen_height * 0.2)
        bottom = int(screen_height * 0.8)

        # Strip off left and right
        left  = int(screen_width * 0.2)
        right = int(screen_width * 0.8)

        screen = screen[top:bottom, left:right]
        return screen

    def transform_screen_data(self, screen):
        """Scale `screen` to [0, 1], resize to 40x90 and return a batched tensor."""
        # Convert to float, rescale, convert to tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)

        # Use torchvision package to compose image transforms
        resize = T.Compose([
            T.ToPILImage()
            ,T.Resize((40,90))
            ,T.ToTensor()
        ])

        return resize(screen).unsqueeze(0).to(self.device) # add a batch dimension (BCHW)

In [None]:
def set_action_bins(bin_size,min_val,max_val):
    """Return `bin_size` evenly spaced values from `min_val` to `max_val` inclusive."""
    return np.linspace(min_val, max_val, num=bin_size)

# Discrete action values the policy chooses between.
action_bin_array = set_action_bins(bin_size=100, min_val=-2, max_val=2)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Environment manager for Pendulum-v0; the training cell below drives it through `em`.
em = EnvManager(device,'Pendulum-v0')

In [None]:
# Training hyperparameters, named so they can be tuned in one place.
NUM_EPISODES = 5000
LEARNING_RATE = 5e-6

output_num = len(action_bin_array)
agent = policy_gradient_agent(LEARNING_RATE,em.get_screen_width(),em.get_screen_height(),output_num)
scores = []
for episode in range(NUM_EPISODES):
    em.reset()
    state = em.get_state()
    score = 0
    for timestep in count():
        # The policy is fully connected, so feed it the frame as a 1-D vector.
        state = state.flatten()
        action_idx = agent.select_action(state)
        # EnvManager.take_action builds the action vector itself, so pass the
        # bare bin value; wrapping it in a list here produced a nested action.
        reward     = em.take_action(action_bin_array[action_idx])
        score+=reward
        agent.store_rewards(reward)
        next_state = em.get_state()
        state = next_state
        if em.done:
            # One policy update per finished episode.
            agent.learn()
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('episode: ', episode, 'score%.2f '% score, 'avg_scr %.2f'%avg_score)
            break
# Episode numbers 1..N; the previous comprehension used `episode` instead of
# its own loop variable, so every x-value was the same number.
x = [i + 1 for i in range(len(scores))]
fname = 'graph'
save = fname + '.png'
# The plotting helper defined in this notebook is `plot_running_curve`.
plot_running_curve(scores,x,save)
em.close()

In [None]:
# Closes the environment again; the training cell already calls em.close() at its end.
# NOTE(review): presumably kept for manual cleanup after an interrupted run — confirm.
em.close()

In [None]:
# Scratch cell: evaluates the literal 5e-6, the same value passed to the agent as its learning rate.
5e-6