In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
seed(1)

In [2]:
class EnvManager():
    def __init__(self, device,environment):
        self.device = device
        #self.env = gym.make(environment).unwrapped
        self.env = gym.make(environment)
        self.env.reset()
        self.current_screen = None
        self.done = False
    
    def reset(self):
        self.env.reset()
        self.current_screen = None
        
    def close(self):
        self.env.close()
        
    def render(self, mode='human'):
        return self.env.render(mode)
        
    def num_actions_available(self):
        return self.env.action_space
        
    def take_action(self, action):   
        _, reward, self.done, _ = self.env.step([action])
        return reward
    
    def just_starting(self):
        return self.current_screen is None
    
    def get_state(self):
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            s1 = self.current_screen
            s2 = self.get_processed_screen()
            self.current_screen = s2
            return s2 - s1
    
    def get_screen_height(self):
        screen = self.get_processed_screen()
        return screen.shape[2]
    
    def get_screen_width(self):
        screen = self.get_processed_screen()
        return screen.shape[3]
       
    def get_processed_screen(self):
        screen = em.render('rgb_array')
        rgb_weights = [0.2989, 0.5870, 0.1140]
        grayscale_image = np.dot(screen[...,:3], rgb_weights) 
        screen = grayscale_image.transpose((0, 1)) # PyTorch expects CHW
        #print(type(screen)) # numpy
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)
    
    def crop_screen(self, screen):
        screen_height = screen.shape[0]
        screen_width  = screen.shape[1]
        #print('screen height(top/bottom): ',screen_height)
        #print('screen height(left/right): ',screen_width)
        # Strip off top and bottom
        top = int(screen_height * 0.2)
        #print('top: ',top)
        bottom = int(screen_height * 0.8)
        #print('bottom: ',bottom)
        
        
        #strip off left/right
        left  = int(screen_width * 0.2)
        #print('left: ',left)
        right = int(screen_width * 0.8)
        #print('right: ',right)
        
        screen = screen[top:bottom, left:right]
        return screen
    
    
    def transform_screen_data(self, screen):       
        # Convert to float, rescale, convert to tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        
        # Use torchvision package to compose image transforms
        resize = T.Compose([
            T.ToPILImage()
            ,T.Resize((40,90))
            ,T.ToTensor()
        ])
        
        return resize(screen).unsqueeze(0).to(self.device) # add a batch dimension (BCHW)

In [3]:
class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0
        
    def push(self, experience):
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count % self.capacity] = experience
        self.push_count += 1

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def can_provide_sample(self, batch_size):
        return len(self.memory) >= batch_size

In [13]:
def extract_tensors(experiences):
    # Convert batch of Experiences to Experience of batches
    batch = Experience(*zip(*experiences))

    t1 = torch.stack(batch.state)
    t2 = torch.stack(batch.action)
    t3 = torch.stack(batch.reward)
    t4 = torch.stack(batch.next_state)

    return (t1,t2,t3,t4)

In [14]:
Experience = namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'next_state')
)

In [15]:
class OU_noise():
    def __init__(self,mu,sigma=0.15,theta=0.15,dt=1e-2,x0=None):
        self.theta = theta
        self.mu    = mu
        self.sigma = sigma
        self.dt    = dt
        self.x0    = x0
        self.reset()
        
    def __call__(self):
        x = self.x_prev + self.theta * (self.mu - self.x_prev)* \
                self.dt +self.sigma * np.sqrt(self.dt)*np.random.normal(size = self.mu.shape)
        self.x_prev = x
        return x
    
    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)

In [16]:
class Actor_network(nn.Module):
    def __init__(self, lr, input_size, fc1, fc2, linear_out):
        super(Actor_network, self).__init__()
        self.input_size = input_size
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc1 = fc1
        self.n_hidden_fc2 = fc2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1  = nn.Linear(self.input_size,self.n_hidden_fc1)
        self.fc2  = nn.Linear(self.n_hidden_fc1,self.n_hidden_fc2)
        self.fc3  = nn.Linear(self.n_hidden_fc2,self.linear_out)
        
        self.ln1 =nn.LayerNorm(self.n_hidden_fc1)
        self.ln2 =nn.LayerNorm(self.n_hidden_fc2)
        
        fc1 = 1.0/np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-fc1,fc1)
        self.fc1.bias.data.uniform_(-fc1,fc1)
        
        fc2 = 1.0/np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-fc2,fc2)
        self.fc2.bias.data.uniform_(-fc2,fc2)
        
        fc3 = 0.003
        self.fc3.weight.data.uniform_(-fc3,fc3)
        self.fc3.bias.data.uniform_(-fc3,fc3)
        
        self.optimizer = optim.Adam(params = self.parameters() ,lr = lr)
        self.to(self.device)
        
    def forward(self, states):
        states = self.fc1(states)
        states = self.ln1(states)
        states = F.relu(states)
        
        states = self.fc2(states)
        states = self.ln2(states)
        states = F.relu(states)
        
        states = self.fc3(states)
        out = torch.tanh(states)*2.0 #multiply by max/min env space, tanh because of -1 1 min maxing
    
        return out   

In [17]:
class Critic_network(nn.Module):
    def __init__(self, lr, input_size, fc1, fc2, linear_out):
        super(Critic_network, self).__init__()
        self.input_size = input_size
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc1 = fc1
        self.n_hidden_fc2 = fc2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1   = nn.Linear(self.input_size,self.n_hidden_fc1)
        self.fc2  = nn.Linear(self.n_hidden_fc1,self.n_hidden_fc2)
        
        self.action_values = nn.Linear(self.linear_out,self.n_hidden_fc2)
        
        self.state_q_value = nn.Linear(self.n_hidden_fc2, 1)
        
        #normalize data cause we are sampling from so many diff environments
        self.ln1 = nn.LayerNorm(self.n_hidden_fc1) 
        self.ln2 = nn.LayerNorm(self.n_hidden_fc2)
        
        #initialize weights and biases according to number of neurons per laer involved
        fc1 = 1.0/np.sqrt(self.fc1.weight.data.size()[0])
        self.fc1.weight.data.uniform_(-fc1,fc1)
        self.fc1.bias.data.uniform_(-fc1,fc1)
        
        fc2 = 1.0/np.sqrt(self.fc2.weight.data.size()[0])
        self.fc2.weight.data.uniform_(-fc2,fc2)
        self.fc2.bias.data.uniform_(-fc2,fc2)
        
        fc3 = 1.0/np.sqrt(self.action_values.weight.data.size()[0])
        self.action_values.weight.data.uniform_(-fc3,fc3)
        self.action_values.bias.data.uniform_(-fc3,fc3)
        
        #final output layer initialize by 0.003 as desired by paper
        self.state_q_value.weight.data.uniform_(-0.003,0.003)
        self.state_q_value.bias.data.uniform_(-0.003,0.003)
        
        self.optimizer = optim.Adam(params = self.parameters() ,lr = lr, weight_decay = 0.01)#weight decay?
        self.to(self.device)
        
        
    def forward(self, states, actions):
        states = self.fc1(states).to(self.device)
        states = self.ln1(states)
        states = F.relu(states)
        
        states = self.fc2(states)
        states = self.ln2(states)
        #states = F.relu(states)
        
        action_v = self.action_values(actions)#action nn has same output size as states
        state_action_add = F.relu(torch.add(action_v,states))
        
        final_q = self.state_q_value(state_action_add)
         
        return final_q  

In [37]:
class Agent():
    def __init__(self,alpha,beta,input_dim,tau,n_actions,max_replay_mem,fc1_,fc2_,batch_size = 250,gamma = 0.99):
        self.gamma = gamma
        self.tau = tau
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        
        self.memory = ReplayMemory(max_replay_mem)
        
        self.noise = OU_noise(mu = np.zeros(n_actions))
        
        self.actor = Actor_network(alpha,input_dim,fc1_,fc2_,linear_out=n_actions)
        
        self.t_actor= Actor_network(alpha,input_dim,fc1_,fc2_,linear_out=n_actions)
        
        self.critic= Critic_network(beta,input_dim,fc1_,fc2_,linear_out=n_actions)
        
        self.t_critic=Critic_network(beta,input_dim,fc1_,fc2_,linear_out=n_actions)
        
        self.update_network_parameters(tau=1)
        
    def choose_action(self, state):
        self.actor.eval()
        state = state.flatten()
        mu = self.actor(state).to(self.actor.device)
        mu_noise = mu + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_noise.cpu().detach().numpy()[0]
    
    def store_memory(self, state, action, reward, next_state):
        state = state.flatten()
        next_state = next_state.flatten()
        self.memory.push(Experience(state,action,reward,next_state))
        
    def learn(self):
        if self.memory.can_provide_sample(self.batch_size):
            states,actions,rewards,next_states   = extract_tensors(self.memory.sample(self.batch_size))
            states = torch.tensor(states, dtype  = torch.float).to(self.actor.device)
            actions = torch.tensor(actions, dtype= torch.float).to(self.actor.device)
            rewards = torch.tensor(rewards, dtype= torch.float).to(self.actor.device)
            next_states = torch.tensor(next_states, dtype=torch.float).to(self.actor.device)
            
            target_actions_network = self.t_actor(next_states)
            target_critic_network = self.t_critic(next_states,target_actions_network)
            #print('states crit: ', states.shape,'actions: ', actions.shape)
            critic_network = self.critic(states,actions)
            #print('critic: ',critic_network.shape)
            #print('critic_t: ',target_critic_network.shape)
            target_critic_network = target_critic_network.view(-1)
            rewards = rewards.view(-1)
            #print('critic_t_: ',target_critic_network.shape)
            #print('rewards: ',rewards.shape)
            target = rewards+self.gamma*target_critic_network
            
            #print('target: ', target.shape)
            target = target.view(self.batch_size,1)
            #print('target prime: ', target.shape)
            self.critic.optimizer.zero_grad()
            #target and critic must be same size
            #print('critic: ',critic_network.shape)
            critic_loss = F.mse_loss(target,critic_network)
            critic_loss.backward()
            self.critic.optimizer.step()
            
            #actor loss
            self.actor.optimizer.zero_grad()
            actor_loss = -self.critic(states,self.actor(states))
            actor_loss = torch.mean(actor_loss)
            actor_loss.backward()
            self.actor.optimizer.step()
            
            self.update_network_parameters()
            
    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        
        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.t_actor.named_parameters()
        target_critic_params = self.t_critic.named_parameters()
        
        critic_dict = dict(critic_params)
        actor_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)
        
        for weight in critic_dict:
            critic_dict[weight] = tau*critic_dict[weight].clone() + (1-tau)*\
                target_critic_dict[weight].clone()
        
        for weight in actor_dict:
            actor_dict[weight] = tau*actor_dict[weight].clone() + (1-tau)*\
                target_actor_dict[weight].clone()
        
        self.t_critic.load_state_dict(critic_dict)
        self.t_actor.load_state_dict(actor_dict)

In [41]:
batch_size = 250
gamma = 0.7
eps_start = 1
eps_end = 0.001
eps_decay = 0.0001
target_update = 10
memory_size = 10000
lr = 0.001
num_episodes = 10000#for more episodes for better results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
em = EnvManager(device,'Pendulum-v0')
#(self,alpha,beta,input_dim,tau,n_actions,max_replay_mem,fc1_,fc2_,batch_size = 250,gamma = 0.99)
agent = Agent(alpha=0.0001,beta = 0.001,input_dim = em.get_screen_width()*em.get_screen_height(),tau = 0.001,n_actions = 1,max_replay_mem = 100000,fc1_=1000,fc2_=1000)
print(em.get_screen_width(),em.get_screen_height())
n_games = 1000
scores = []
for episode in range(n_games):
    em.reset()
    state = em.get_state()
    agent.noise.reset()
    score = 0
    for timestep in count():
        action_idx = agent.choose_action(state)
        reward     = em.take_action(action_idx)
        score+=reward
        next_state = em.get_state()
        action_idx = torch.tensor([action_idx], device=device)
        reward = torch.tensor([reward], device=device)
        agent.store_memory(state,action_idx,reward, next_state)
        agent.learn()
        state = next_state
        if em.done:
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('episode: ', episode, 'score%.2f '% score, 'avg_scr %.2f'%avg_score)
            break

90 40
episode:  0 score-1494.03  avg_scr -1494.03


  states = torch.tensor(states, dtype  = torch.float).to(self.actor.device)
  actions = torch.tensor(actions, dtype= torch.float).to(self.actor.device)
  rewards = torch.tensor(rewards, dtype= torch.float).to(self.actor.device)
  next_states = torch.tensor(next_states, dtype=torch.float).to(self.actor.device)


episode:  1 score-1608.67  avg_scr -1551.35
episode:  2 score-1515.85  avg_scr -1539.52
episode:  3 score-1625.27  avg_scr -1560.95
episode:  4 score-1654.32  avg_scr -1579.63
episode:  5 score-1540.66  avg_scr -1573.13
episode:  6 score-1180.43  avg_scr -1517.03
episode:  7 score-1338.10  avg_scr -1494.67
episode:  8 score-974.88  avg_scr -1436.91
episode:  9 score-1374.39  avg_scr -1430.66
episode:  10 score-1340.82  avg_scr -1422.49
episode:  11 score-1296.65  avg_scr -1412.01
episode:  12 score-1488.12  avg_scr -1417.86
episode:  13 score-1385.02  avg_scr -1415.51
episode:  14 score-1619.29  avg_scr -1429.10
episode:  15 score-1281.63  avg_scr -1419.88
episode:  16 score-1379.19  avg_scr -1417.49
episode:  17 score-994.02  avg_scr -1393.96
episode:  18 score-1595.69  avg_scr -1404.58
episode:  19 score-1437.53  avg_scr -1406.23
episode:  20 score-1301.30  avg_scr -1401.23
episode:  21 score-1437.72  avg_scr -1402.89
episode:  22 score-1373.89  avg_scr -1401.63
episode:  23 score-13

In [40]:
em.close()