# LunarLander: Ein kleiner Einblick in Reinforcement Learning

Das Ziel von Reinforcement Learning ist es, einen Agenten rein durch Beobachtung und Interaktion mit einer meist simulierten Umgebung so zu trainieren, dass er ein Problem in dieser lösen kann. Hierbei versucht der Agent ständig, seinen Reward zu erhöhen.

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy as np
%matplotlib inline
from matplotlib import pyplot
import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace
from IPython.display import clear_output
import copy
import gym

In [28]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# every later `.to(device)` call also works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [29]:
class ReplayBuffer(object):
    """Fixed-capacity ring buffer of transitions backed by NumPy arrays.

    Once ``max_size`` transitions have been written, each new transition
    overwrites the oldest slot.
    """

    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        """Allocate zero-filled storage for ``max_size`` transitions."""
        self.max_size = max_size
        self.ptr = 0   # slot the next call to `add` writes to
        self.size = 0  # number of slots that currently hold a transition

        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        """Write one transition into the current slot and advance the pointer.

        ``done`` is stored inverted (``1 - done``) in ``not_done``.
        """
        slot = self.ptr
        self.state[slot] = state
        self.action[slot] = action
        self.next_state[slot] = next_state
        self.reward[slot] = reward
        self.not_done[slot] = 1. - done

        # Wrap around at capacity; the fill level saturates at max_size.
        self.ptr = (slot + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn uniformly with replacement.

        The result is a 5-tuple ``(state, action, next_state, reward,
        not_done)`` of float tensors moved to the module-level ``device``.
        """
        picks = np.random.randint(0, self.size, size=batch_size)
        columns = (
            self.state,
            self.action,
            self.next_state,
            self.reward,
            self.not_done,
        )
        return tuple(
            torch.FloatTensor(column[picks]).to(device) for column in columns
        )

In [30]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to bounded actions.

    Two ReLU hidden layers of width 256 feed a tanh output layer whose result
    is multiplied by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        """Create the three linear layers and remember the action scale."""
        super().__init__()
        hidden_width = 256

        self.l1 = nn.Linear(state_dim, hidden_width)
        self.l2 = nn.Linear(hidden_width, hidden_width)
        self.l3 = nn.Linear(hidden_width, action_dim)

        self.max_action = max_action

    def forward(self, state):
        """Return ``max_action * tanh(...)`` of the network output for ``state``."""
        hidden = F.relu(self.l1(state))
        hidden = F.relu(self.l2(hidden))
        squashed = torch.tanh(self.l3(hidden))
        return self.max_action * squashed

In [31]:
class Critic(nn.Module):
    """Two independent Q-networks evaluated on concatenated (state, action) batches.

    Layers ``l1``-``l3`` form the first network, ``l4``-``l6`` the second.
    Each has two ReLU hidden layers of width 256 and a single linear output.
    """

    def __init__(self, state_dim, action_dim):
        """Create both three-layer networks."""
        super().__init__()
        input_width = state_dim + action_dim
        hidden_width = 256

        # First Q-network.
        self.l1 = nn.Linear(input_width, hidden_width)
        self.l2 = nn.Linear(hidden_width, hidden_width)
        self.l3 = nn.Linear(hidden_width, 1)

        # Second Q-network.
        self.l4 = nn.Linear(input_width, hidden_width)
        self.l5 = nn.Linear(hidden_width, hidden_width)
        self.l6 = nn.Linear(hidden_width, 1)

    @staticmethod
    def _run_head(sa, first, second, last):
        """Apply ``first`` and ``second`` with ReLU, then ``last`` without activation."""
        return last(F.relu(second(F.relu(first(sa)))))

    def forward(self, state, action):
        """Return the outputs ``(q1, q2)`` of both networks."""
        sa = torch.cat((state, action), 1)
        q1 = self._run_head(sa, self.l1, self.l2, self.l3)
        q2 = self._run_head(sa, self.l4, self.l5, self.l6)
        return q1, q2

    def Q1(self, state, action):
        """Return only the first network's output."""
        sa = torch.cat((state, action), 1)
        return self._run_head(sa, self.l1, self.l2, self.l3)

In [32]:
class TD3(object):
    """Actor-critic agent with twin critics, target networks and delayed actor updates.

    Holds an ``Actor`` and a ``Critic`` plus a deep-copied target version of
    each, and one Adam optimizer (lr 3e-4) per online network.
    """

    def __init__(
        self,
        state_dimension,
        action_dimension,
        max_action,
        discount=0.99,
        tau=0.005,
        policy_noise=0.2,
        noise_clip=0.5,
        policy_frequency=2):
        """Build the networks, their targets and optimizers on ``device``.

        Args:
            state_dimension: Size of a single state vector.
            action_dimension: Size of a single action vector.
            max_action: Bound used to scale and clamp actions.
            discount: Factor applied to the bootstrapped target value.
            tau: Interpolation weight of the online parameters in the
                target-network update.
            policy_noise: Scale of the Gaussian noise added to target actions.
            noise_clip: Bound the target-action noise is clamped to.
            policy_frequency: The actor and both target networks are updated
                every ``policy_frequency``-th call to ``train``.
        """
        # Hyper-parameters.
        self.max_action = max_action
        self.discount = discount
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_frequency = policy_frequency

        # Policy network, its target copy and optimizer.
        self.actor = Actor(state_dimension, action_dimension, max_action).to(device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4)

        # Value networks, their target copy and optimizer.
        self.critic = Critic(state_dimension, action_dimension).to(device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4)

        # Number of completed `train` calls.
        self.total_it = 0

    def select_action(self, state):
        """Return the actor's action for a single ``state`` as a flat NumPy array."""
        state_batch = torch.FloatTensor(state.reshape(1, -1)).to(device)
        action_batch = self.actor(state_batch)
        return action_batch.cpu().data.numpy().flatten()

    def _bellman_target(self, action, next_state, reward, not_done):
        """Compute the critic regression target without tracking gradients."""
        with torch.no_grad():
            # Clipped Gaussian noise with the same shape as the sampled actions.
            smoothing = torch.randn_like(action) * self.policy_noise
            smoothing = smoothing.clamp(-self.noise_clip, self.noise_clip)

            # Noisy target action, kept inside the valid action range.
            next_action = self.actor_target(next_state) + smoothing
            next_action = next_action.clamp(-self.max_action, self.max_action)

            # Use the smaller of the two target estimates.
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)
            bootstrap = torch.min(target_Q1, target_Q2)
            return reward + not_done * self.discount * bootstrap

    def _soft_update(self, online, target):
        """Move each ``target`` parameter a fraction ``tau`` towards ``online``."""
        for param, target_param in zip(online.parameters(), target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def train(self, replayBuffer, batch_size=200):
        """Run one training iteration on a batch sampled from ``replayBuffer``.

        The critic is updated on every call; the actor and both target
        networks only when ``total_it`` is a multiple of ``policy_frequency``.
        """
        self.total_it += 1

        state, action, next_state, reward, not_done = replayBuffer.sample(batch_size)
        target_Q = self._bellman_target(action, next_state, reward, not_done)

        # Critic step: both networks regress onto the same target.
        current_Q1, current_Q2 = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        if self.total_it % self.policy_frequency != 0:
            return

        # Actor step: maximise the first critic's value of the actor's actions.
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._soft_update(self.critic, self.critic_target)
        self._soft_update(self.actor, self.actor_target)

In [33]:
def eval_policy(policy, env_name, eval_episodes=10):
    """Return the average total episode reward of ``policy``.

    A new environment is created for the evaluation and closed afterwards, so
    an environment used elsewhere (e.g. for training) is not stepped or reset.

    Args:
        policy: Object exposing ``select_action(state)``.
        env_name: Gym environment id as a string. For backward compatibility
            an environment object is also accepted; a new instance is then
            created from its ``spec.id``, falling back to
            "LunarLanderContinuous-v2" when no id can be determined.
        eval_episodes: Number of episodes to run.

    Returns:
        Sum of all collected rewards divided by ``eval_episodes``.
    """
    if isinstance(env_name, str):
        env_id = env_name
    else:
        spec = getattr(env_name, "spec", None)
        env_id = getattr(spec, "id", None) or "LunarLanderContinuous-v2"

    eval_env = gym.make(env_id)
    total_reward = 0.
    try:
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = policy.select_action(np.array(state))
                state, reward, done, _ = eval_env.step(action)
                total_reward += reward
    finally:
        # Release the environment's resources even if an episode raises.
        eval_env.close()

    return total_reward / eval_episodes

In [None]:
# Training configuration. Named once so that values used in several places
# (e.g. the warm-up length) cannot drift apart.
ENV_NAME = "LunarLanderContinuous-v2"
MAX_TIMESTEPS = int(1e6)     # total number of environment steps
START_TIMESTEPS = int(25e3)  # steps with random actions and no training
EVAL_FREQ = int(5e3)         # evaluate after every EVAL_FREQ steps
EXPLORATION_NOISE = 0.1      # std of the action noise, as a fraction of max_action
BATCH_SIZE = 256

env = gym.make(ENV_NAME)
state_dimension = env.observation_space.shape[0]
action_dimension = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

B = ReplayBuffer(state_dimension, action_dimension)
policy = TD3(state_dimension, action_dimension, max_action)

state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0
episode_num = 0

# Set to True to render every training step.
render = False

# Evaluate the untrained policy once as a baseline. The environment id is
# passed, matching eval_policy's `env_name` parameter.
evaluations = [eval_policy(policy, ENV_NAME)]

for t in range(MAX_TIMESTEPS):
    episode_timesteps += 1

    # Random actions during warm-up, afterwards the policy's action plus
    # Gaussian noise, clipped to the valid action range.
    if t < START_TIMESTEPS:
        action = env.action_space.sample()
    else:
        action = (
            policy.select_action(np.array(state))
            + np.random.normal(0, max_action * EXPLORATION_NOISE, size=action_dimension)
        ).clip(-max_action, max_action)

    # Perform action
    next_state, reward, done, _ = env.step(action)
    # Once the episode has reached env._max_episode_steps the transition is
    # stored with done = 0, i.e. as non-terminal.
    done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

    # Store data in replay buffer
    B.add(state, action, next_state, reward, done_bool)
    state = next_state
    episode_reward += reward

    if render:
        env.render()

    # Train agent after collecting sufficient data
    if t >= START_TIMESTEPS:
        policy.train(B, BATCH_SIZE)

    if done:
        # Reset environment and the per-episode counters
        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    # Periodic evaluation and live plot of all evaluation results so far
    if (t + 1) % EVAL_FREQ == 0:
        avg_reward = eval_policy(policy, ENV_NAME)
        evaluations.append(avg_reward)
        clear_output(wait=True)
        plt.plot(evaluations)
        plt.title('Reward over episodes times ten')
        plt.show()