In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
seed(1)

In [2]:
class EnvManager():
    def __init__(self, device,environment):
        self.device = device
        #self.env = gym.make(environment).unwrapped
        self.env = gym.make(environment)
        self.env.reset()
        self.current_screen = None
        self.done = False
    
    def reset(self):
        self.env.reset()
        self.current_screen = None
        
    def close(self):
        self.env.close()
        
    def render(self, mode='human'):
        return self.env.render(mode)
        
    def num_actions_available(self):
        return self.env.action_space
        
    def take_action(self, action):   
        _, reward, self.done, _ = self.env.step([action])
        return reward
    
    def just_starting(self):
        return self.current_screen is None
    
    def get_state(self):
        if self.just_starting() or self.done:
            self.current_screen = self.get_processed_screen()
            black_screen = torch.zeros_like(self.current_screen)
            return black_screen
        else:
            s1 = self.current_screen
            s2 = self.get_processed_screen()
            self.current_screen = s2
            return s2 - s1
    
    def get_screen_height(self):
        screen = self.get_processed_screen()
        return screen.shape[2]
    
    def get_screen_width(self):
        screen = self.get_processed_screen()
        return screen.shape[3]
       
    def get_processed_screen(self):
        screen = em.render('rgb_array')
        rgb_weights = [0.2989, 0.5870, 0.1140]
        grayscale_image = np.dot(screen[...,:3], rgb_weights) 
        screen = grayscale_image.transpose((0, 1)) # PyTorch expects CHW
        #print(type(screen)) # numpy
        screen = self.crop_screen(screen)
        return self.transform_screen_data(screen)
    
    def crop_screen(self, screen):
        screen_height = screen.shape[0]
        screen_width  = screen.shape[1]
        #print('screen height(top/bottom): ',screen_height)
        #print('screen height(left/right): ',screen_width)
        # Strip off top and bottom
        top = int(screen_height * 0.2)
        #print('top: ',top)
        bottom = int(screen_height * 0.8)
        #print('bottom: ',bottom)
        
        
        #strip off left/right
        left  = int(screen_width * 0.2)
        #print('left: ',left)
        right = int(screen_width * 0.8)
        #print('right: ',right)
        
        screen = screen[top:bottom, left:right]
        return screen
    
    
    def transform_screen_data(self, screen):       
        # Convert to float, rescale, convert to tensor
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        
        # Use torchvision package to compose image transforms
        resize = T.Compose([
            T.ToPILImage()
            ,T.Resize((40,90))
            ,T.ToTensor()
        ])
        
        return resize(screen).unsqueeze(0).to(self.device) # add a batch dimension (BCHW)

In [3]:
class ReplayMemory():
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.push_count = 0
        
    def push(self, experience):
        if len(self.memory) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[self.push_count % self.capacity] = experience
        self.push_count += 1

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def can_provide_sample(self, batch_size):
        return len(self.memory) >= batch_size

In [4]:
def extract_tensors(experiences):
    # Convert batch of Experiences to Experience of batches
    batch = Experience(*zip(*experiences))

    t1 = torch.stack(batch.state)
    t2 = torch.stack(batch.action)
    t3 = torch.stack(batch.reward)
    t4 = torch.stack(batch.next_state)

    return (t1,t2,t3,t4)

Experience = namedtuple(
    'Experience',
    ('state', 'action', 'reward', 'next_state')
)

In [5]:
class Actor_network(nn.Module):
    def __init__(self, lr, input_size, fc1, fc2, linear_out):
        super(Actor_network, self).__init__()
        self.input_size = input_size
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc1 = fc1
        self.n_hidden_fc2 = fc2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1  = nn.Linear(self.input_size,self.n_hidden_fc1)
        self.fc2  = nn.Linear(self.n_hidden_fc1,self.n_hidden_fc2)
        self.fc3  = nn.Linear(self.n_hidden_fc2,self.linear_out)
        
        self.optimizer = optim.Adam(params = self.parameters() ,lr = lr)
        self.to(self.device)
        
    def forward(self, states):
        states = self.fc1(states)
        states = F.relu(states)
        
        states = self.fc2(states)
        states = F.relu(states)
        
        states = self.fc3(states)
        out = torch.tanh(states)*2.0 #multiply by max/min env space, tanh because of -1 1 min maxing
    
        return out 

In [15]:
class Critic_network(nn.Module):
    def __init__(self, lr, input_size, fc1, fc2, linear_out):
        super(Critic_network, self).__init__()
        self.input_size_cat = input_size + linear_out
        self.linear_out = linear_out
        self.lr = lr
        self.n_hidden_fc1 = fc1
        self.n_hidden_fc2 = fc2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1   = nn.Linear(self.input_size_cat,self.n_hidden_fc1)
        self.fc2  = nn.Linear(self.n_hidden_fc1,self.n_hidden_fc2)
        self.state_q_value = nn.Linear(self.n_hidden_fc2, 1)
        
        self.optimizer = optim.Adam(params = self.parameters() ,lr = lr)#weight decay?
        self.to(self.device)
        
        
    def forward(self, states, actions):
        #print(self.input_size_cat)
        #print(states.shape, actions.shape)
        states = torch.cat((states,actions),dim = 1)
        #print(states.shape)
        states = self.fc1(states).to(self.device)
        states = F.relu(states)
        states = self.fc2(states)
        states = F.relu(states)
        final_q = self.state_q_value(states)
         
        return final_q  

In [27]:
class Agent():
    def __init__(self,alpha,beta,noise,input_dims,output_dims,tau,max_replay_mem,fc_1,fc_2,update_actor_counter = 5,pre_train_duration = 1000,batch_size = 250,gamma = 0.99):
        self.input_dims = input_dims
        self.n_actions = output_dims
        self.hidden_1 = fc_1
        self.hidden_2 = fc_2
        self.alpha = alpha
        self.beta = beta
        self.noise = noise
        self.gamma = gamma
        self.tau = tau
        self.pre_train_duration = pre_train_duration
        self.time_step_counter = 0
        self.update_actor_counter = update_actor_counter #delayed ddpg remember, we only been updating the critics
        self.actor_learn_counter = 0
        self.batch_size = batch_size
        self.memory = ReplayMemory(max_replay_mem)
        
        
        self.actor = Actor_network(self.alpha,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        self.critic_1= Critic_network(self.beta,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        self.critic_2= Critic_network(self.beta,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        
        self.t_actor = Actor_network(self.alpha,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        self.t_critic_1= Critic_network(self.beta,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        self.t_critic_2= Critic_network(self.beta,self.input_dims,self.hidden_1,self.hidden_2,self.n_actions)
        
        self.update_network_parameters(tau=1)
        
    def select_action(self,state):
        if self.time_step_counter < self.pre_train_duration:
            mu = torch.tensor(np.random.normal(loc = 0, scale = self.noise, size = self.n_actions)).to(self.actor.device)
        else:
            mu_noise = np.random.normal(loc = 0, scale = self.noise, size = self.n_actions)
            state = state.flatten()
            mu = self.actor(state)+torch.tensor(mu_noise).to(self.actor.device)
        mu = torch.clamp(mu,-2,2)#set bounds of output to match min-max torque
        self.time_step_counter+=1
        return mu.cpu().detach().numpy()[0]
    
    def store_memory(self,state,action,reward,next_state):
        state = state.flatten()
        next_state = next_state.flatten()
        action = torch.tensor([action], device=device)
        reward =torch.tensor([reward], device=device)
        self.memory.push(Experience(state,action,reward,next_state))
        
    def learn(self):
        if self.memory.can_provide_sample(self.batch_size):
            states,actions,rewards,next_states   = extract_tensors(self.memory.sample(self.batch_size))
            states = torch.tensor(states, dtype  = torch.float).to(self.actor.device)
            actions = torch.tensor(actions, dtype= torch.float).to(self.actor.device)
            rewards = torch.tensor(rewards, dtype= torch.float).to(self.actor.device)
            next_states = torch.tensor(next_states, dtype=torch.float).to(self.actor.device)
            
            t_actor_values = self.t_actor(next_states)
            t_noise = torch.clamp(torch.tensor(np.random.normal(loc = 0, scale = 0.15)),-0.5,0.5)#between 0 - 2
            t_actor_values = t_actor_values + t_noise
            
            t_actor_values = torch.clamp(t_actor_values,-2,2)#set bounds of output to match min-max torque
        
            q1_ = self.t_critic_1(next_states,t_actor_values)
            q2_ = self.t_critic_2(next_states,t_actor_values)
            
            q1 = self.critic_1(states,actions)
            q2 = self.critic_2(states,actions)
            
            q_prime_select = torch.min(q1_,q2_)
            #take min then use it as update
            target = rewards + self.gamma*(q_prime_select)
            
            self.critic_1.optimizer.zero_grad()
            self.critic_2.optimizer.zero_grad()
            #the target network is ALWAYS used to update the 'current' network
            #the target network is 'updated' using tau. PLEASE REMEMBER THIS
            critic1_loss = F.mse_loss(target,q1)
            critic2_loss = F.mse_loss(target,q2)
            
            #backpropagate on 2 network always problematic so we add
            critic_loss_ovr = critic1_loss + critic2_loss
            critic_loss_ovr.backward()
            self.critic_1.optimizer.step()
            self.critic_2.optimizer.step()
            
            self.actor_learn_counter +=1 
            
            if self.actor_learn_counter % self.update_actor_counter==0:
                self.actor.optimizer.zero_grad()
                main_actor_loss=self.critic_1(states, self.actor(states))
                main_actor_loss=-torch.mean(main_actor_loss)
                main_actor_loss.backward()
                self.actor.optimizer.step()
                
                self.update_network_parameters()
                
    def update_network_parameters(self,tau=None):
        if tau is None:
            tau = self.tau
        #mains
        actor_params = self.actor.named_parameters()
        critic_1_params = self.critic_1.named_parameters()
        critic_2_params = self.critic_2.named_parameters()
        
        #targets
        target_actor_params = self.t_actor.named_parameters()
        target_critic_1_params = self.t_critic_1.named_parameters()
        target_critic_2_params = self.t_critic_2.named_parameters()
        
        #main dicts
        actor_dict = dict(actor_params)
        critic_1_dict = dict(critic_1_params)
        critic_2_dict = dict(critic_2_params)
        
        #target_dicts
        t_actor_dict = dict(target_actor_params)
        t_critic_1_dict = dict(target_critic_1_params)
        t_critic_2_dict = dict(target_critic_2_params)
        
        
        
        for weight in critic_1_dict:
            critic_1_dict[weight] = tau*critic_1_dict[weight].clone() + (1-tau)*\
                t_critic_1_dict[weight].clone()
        
        for weight in critic_2_dict:
            critic_2_dict[weight] = tau*critic_2_dict[weight].clone() + (1-tau)*\
                t_critic_2_dict[weight].clone()
        
        for weight in actor_dict:
            actor_dict[weight] = tau*actor_dict[weight].clone() + (1-tau)*\
                t_actor_dict[weight].clone()
        
        self.t_critic_1.load_state_dict(critic_1_dict)
        self.t_critic_2.load_state_dict(critic_2_dict)
        self.t_actor.load_state_dict(actor_dict)
            

In [28]:
batch_size = 250
gamma = 0.7
eps_start = 1
eps_end = 0.001
eps_decay = 0.0001
target_update = 10
memory_size = 10000
lr = 0.001
num_episodes = 10000#for more episodes for better results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
em = EnvManager(device,'Pendulum-v0')
#(self,alpha,beta,noise,input_dims,output_dims,tau,max_replay_mem,fc_1,fc_2,update_actor_counter = 5,pre_train_duration = 10000,batch_size = 250,gamma = 0.99)
agent = Agent(noise = 0.1,alpha=0.001,beta = 0.001,input_dims = em.get_screen_width()*em.get_screen_height(),tau = 0.005,output_dims = 1,max_replay_mem = 100000,fc_1=1000,fc_2=1000)
print(em.get_screen_width(),em.get_screen_height())
n_games = 1000
scores = []
for episode in range(n_games):
    em.reset()
    state = em.get_state()
    score = 0
    for timestep in count():
        action_idx = agent.select_action(state)
        reward     = em.take_action(action_idx)
        score+=reward
        next_state = em.get_state()
        agent.store_memory(state,action_idx,reward, next_state)
        agent.learn()
        state = next_state
        if em.done:
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            print('episode: ', episode, 'score%.2f '% score, 'avg_scr %.2f'%avg_score)
            break

90 40
episode:  0 score-973.09  avg_scr -973.09


  states = torch.tensor(states, dtype  = torch.float).to(self.actor.device)
  actions = torch.tensor(actions, dtype= torch.float).to(self.actor.device)
  rewards = torch.tensor(rewards, dtype= torch.float).to(self.actor.device)
  next_states = torch.tensor(next_states, dtype=torch.float).to(self.actor.device)


episode:  1 score-1362.66  avg_scr -1167.88
episode:  2 score-1126.54  avg_scr -1154.10
episode:  3 score-1263.90  avg_scr -1181.55
episode:  4 score-1727.79  avg_scr -1290.80
episode:  5 score-1504.51  avg_scr -1326.42
episode:  6 score-1503.40  avg_scr -1351.70
episode:  7 score-1654.61  avg_scr -1389.56
episode:  8 score-1255.75  avg_scr -1374.69
episode:  9 score-1712.44  avg_scr -1408.47
episode:  10 score-1655.77  avg_scr -1430.95
episode:  11 score-1533.51  avg_scr -1439.50
episode:  12 score-1562.13  avg_scr -1448.93
episode:  13 score-1208.69  avg_scr -1431.77
episode:  14 score-1620.96  avg_scr -1444.38
episode:  15 score-1332.80  avg_scr -1437.41
episode:  16 score-1453.19  avg_scr -1438.34
episode:  17 score-1455.75  avg_scr -1439.30
episode:  18 score-1833.31  avg_scr -1460.04
episode:  19 score-1924.77  avg_scr -1483.28
episode:  20 score-1562.45  avg_scr -1487.05
episode:  21 score-1907.21  avg_scr -1506.15
episode:  22 score-1884.64  avg_scr -1522.60
episode:  23 score-

episode:  185 score-728.67  avg_scr -841.14
episode:  186 score-739.41  avg_scr -840.93
episode:  187 score-821.34  avg_scr -839.00
episode:  188 score-619.65  avg_scr -835.55
episode:  189 score-777.45  avg_scr -832.51
episode:  190 score-623.86  avg_scr -831.25
episode:  191 score-1774.68  avg_scr -840.38
episode:  192 score-627.48  avg_scr -835.85
episode:  193 score-750.11  avg_scr -832.91
episode:  194 score-628.75  avg_scr -830.45
episode:  195 score-579.65  avg_scr -826.83
episode:  196 score-654.70  avg_scr -821.97
episode:  197 score-708.54  avg_scr -819.38
episode:  198 score-641.82  avg_scr -816.05
episode:  199 score-840.18  avg_scr -813.02
episode:  200 score-630.53  avg_scr -808.09
episode:  201 score-618.65  avg_scr -805.66
episode:  202 score-625.84  avg_scr -803.28
episode:  203 score-506.64  avg_scr -799.62
episode:  204 score-868.14  avg_scr -798.76
episode:  205 score-525.42  avg_scr -796.20
episode:  206 score-626.17  avg_scr -794.89
episode:  207 score-741.66  avg

episode:  371 score-502.83  avg_scr -637.00
episode:  372 score-724.16  avg_scr -638.02
episode:  373 score-501.94  avg_scr -636.08
episode:  374 score-934.61  avg_scr -640.38
episode:  375 score-504.71  avg_scr -640.40
episode:  376 score-610.73  avg_scr -640.29
episode:  377 score-502.09  avg_scr -640.27
episode:  378 score-616.20  avg_scr -640.19
episode:  379 score-619.99  avg_scr -641.33
episode:  380 score-624.05  avg_scr -642.51
episode:  381 score-510.64  avg_scr -637.96
episode:  382 score-506.14  avg_scr -631.25
episode:  383 score-583.93  avg_scr -632.07
episode:  384 score-503.64  avg_scr -632.02
episode:  385 score-575.34  avg_scr -632.70
episode:  386 score-624.18  avg_scr -631.73
episode:  387 score-626.14  avg_scr -629.47
episode:  388 score-505.24  avg_scr -628.30
episode:  389 score-505.32  avg_scr -627.15
episode:  390 score-504.68  avg_scr -627.19
episode:  391 score-504.95  avg_scr -626.79
episode:  392 score-514.56  avg_scr -621.45
episode:  393 score-970.96  avg_

KeyboardInterrupt: 

In [30]:
em.close()