# Install Dependencies

In [None]:
! bash install.sh

In [1]:
import gym
from collections import namedtuple
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Neural Network

In [2]:
class Net(nn.Module):
    """Two-layer fully connected network mapping observations to action scores.

    The output of ``forward`` is the raw (un-normalised) score per action;
    no softmax is applied here.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        """Create the layers.

        Args:
            obs_size: number of input features per observation.
            hidden_size: width of the single hidden layer.
            n_actions: number of output scores (one per action).
        """
        super().__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        """Return action scores for the batch of observations ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        scores = self.fc2(activated)
        return scores


# Generate Sessions

In [3]:
def generate_batch(env, batch_size, t_max=5000, policy=None):
    """Play ``batch_size`` episodes, sampling actions from the policy network.

    Args:
        env: environment exposing ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        batch_size: number of episodes to play.
        t_max: maximum number of steps per episode.
        policy: callable mapping a ``(1, obs_size)`` float tensor to a
            ``(1, n_actions)`` tensor of action scores. When ``None`` the
            notebook-level ``net`` is used.

    Returns:
        ``(batch_states, batch_actions, batch_rewards)``: per-episode lists of
        visited states, per-episode lists of chosen actions, and the total
        reward of each episode. Every list has exactly ``batch_size`` entries.
    """
    if policy is None:
        # Fall back to the module-level network defined in the training cell.
        policy = net

    activation = nn.Softmax(dim=1)
    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0
        s = env.reset()

        for _ in range(t_max):
            # Convert through a single ndarray: building a tensor from a list
            # of ndarrays is slow.
            s_v = torch.as_tensor(np.asarray(s), dtype=torch.float32).unsqueeze(0)

            # Pure inference: no autograd graph is needed while sampling.
            with torch.no_grad():
                act_probs = activation(policy(s_v)).numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, info = env.step(a)

            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                break

        # Record the episode even when it was cut off by t_max, so the batch
        # always contains batch_size episodes instead of silently losing some.
        batch_states.append(states)
        batch_actions.append(actions)
        batch_rewards.append(total_reward)

    return batch_states, batch_actions, batch_rewards

# Filter Elite Episodes

In [4]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile=50):
    """Select the states and actions of the elite episodes.

    An episode is elite when its total reward is at least the given
    percentile of ``rewards_batch``. The comparison is inclusive so that the
    best episode is always kept, even for ``percentile=100`` or when every
    episode has the same reward.

    Args:
        states_batch: list of episodes, each a list of states.
        actions_batch: list of episodes, each a list of actions.
        rewards_batch: total reward of each episode.
        percentile: percentile (0-100) used as the reward threshold.

    Returns:
        ``(elite_states, elite_actions)``: flat lists with the steps of all
        elite episodes concatenated in episode order.
    """
    elite_states = []
    elite_actions = []

    # Nothing to rank: avoid calling np.percentile on an empty sequence.
    if len(rewards_batch) == 0:
        return elite_states, elite_actions

    reward_threshold = np.percentile(rewards_batch, percentile)

    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions
    

In [15]:
def filter_batch_pos_negative(states_batch, actions_batch, rewards_batch, percentile=50):
    """Flatten all episodes and label every step as elite (+1) or not (-1).

    Unlike ``filter_batch`` no step is discarded: each step receives the
    label of its episode. An episode is elite when its total reward is at
    least the given percentile of ``rewards_batch`` (inclusive, so the best
    episode is always labelled +1).

    Args:
        states_batch: list of episodes, each a list of states.
        actions_batch: list of episodes, each a list of actions.
        rewards_batch: total reward of each episode.
        percentile: percentile (0-100) used as the reward threshold.

    Returns:
        ``(states, actions, labels)``: three flat lists of equal length with
        the steps of all episodes in order and a +1 / -1 label per step.
    """
    all_states = []
    all_actions = []
    rewards_list = []

    # Nothing to rank: avoid calling np.percentile on an empty sequence.
    if len(rewards_batch) == 0:
        return all_states, all_actions, rewards_list

    reward_threshold = np.percentile(rewards_batch, percentile)

    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        label = 1 if reward >= reward_threshold else -1
        all_states.extend(states)
        all_actions.extend(actions)
        # One label per recorded step of this episode.
        rewards_list.extend([label] * len(states))

    return all_states, all_actions, rewards_list

In [31]:
def custom_cross_entropy(outputs, labels, reward):
    """Cross-entropy over the chosen actions, weighted per sample by ``reward``.

    Args:
        outputs: raw scores, one row per sample and one column per class.
        labels: index of the chosen class for each sample.
        reward: per-sample weight multiplied into the log-probability.

    Returns:
        Scalar tensor: the negated sum of weighted log-probabilities divided
        by the number of samples.
    """
    n_samples = outputs.size(0)
    log_probs = F.log_softmax(outputs, dim=1)
    # Log-probability of the class actually chosen in each row.
    chosen_log_probs = log_probs[range(n_samples), labels]
    weighted = chosen_log_probs * reward
    return -torch.sum(weighted) / n_samples

# Carry Out Training

In [None]:
import pdb

# Hyper-parameters.
batch_size = 100        # episodes played per training iteration
session_size = 500      # maximum number of training iterations
percentile = 80         # reward percentile separating +1 from -1 episodes
hidden_size = 200       # width of the network's hidden layer
completion_score = 200  # mean batch reward above which training stops
learning_rate = 0.01

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

# neural network
net = Net(n_states, hidden_size, n_actions)
# loss function: reward-weighted cross-entropy
# (plain alternative: objective = nn.CrossEntropyLoss() on elite steps from filter_batch)
objective = custom_cross_entropy
# optimisation function
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    # generate new sessions with the current policy
    batch_states, batch_actions, batch_rewards = generate_batch(env, batch_size, t_max=5000)

    # keep every step, labelled +1 (elite episode) or -1 (non-elite episode)
    all_states, all_actions, rewards_list = filter_batch_pos_negative(
        batch_states, batch_actions, batch_rewards, percentile)

    optimizer.zero_grad()
    # Go through a single ndarray: building a tensor from a list of ndarrays is slow.
    tensor_states = torch.as_tensor(np.asarray(all_states), dtype=torch.float32)
    tensor_actions = torch.LongTensor(all_actions)
    tensor_rewards_list = torch.FloatTensor(rewards_list)

    action_scores_v = net(tensor_states)
    loss_v = objective(action_scores_v, tensor_actions, tensor_rewards_list)
    loss_v.backward()
    optimizer.step()

    # show results
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f" % (
            i, loss_v.item(), mean_reward, threshold))

    # stop as soon as the mean reward of the batch exceeds the completion score
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")
        break

0: loss=-0.888, reward_mean=-209.8, reward_threshold=-96.2
1: loss=-0.850, reward_mean=-245.6, reward_threshold=-110.9
2: loss=-0.819, reward_mean=-277.5, reward_threshold=-117.9
3: loss=-0.810, reward_mean=-233.2, reward_threshold=-100.1
4: loss=-0.870, reward_mean=-159.5, reward_threshold=-77.9
5: loss=-0.839, reward_mean=-137.3, reward_threshold=-79.1
6: loss=-0.775, reward_mean=-162.8, reward_threshold=-78.8
7: loss=-0.685, reward_mean=-162.3, reward_threshold=-85.5
8: loss=-0.692, reward_mean=-147.6, reward_threshold=-83.0
9: loss=-0.810, reward_mean=-138.3, reward_threshold=-78.6
10: loss=-0.863, reward_mean=-113.9, reward_threshold=-67.2
11: loss=-0.825, reward_mean=-110.5, reward_threshold=-64.5
12: loss=-0.628, reward_mean=-104.0, reward_threshold=-58.8
13: loss=-0.605, reward_mean=-92.4, reward_threshold=-49.1
14: loss=-0.557, reward_mean=-96.6, reward_threshold=-54.2
15: loss=-0.796, reward_mean=-103.6, reward_threshold=-37.0
16: loss=-0.588, reward_mean=-105.4, reward_thres

# Results

In [None]:
# Record one episode played by the current policy; the Monitor wrapper writes
# the video files into the "videos" directory (force=True clears old recordings).
import gym.wrappers
env = gym.wrappers.Monitor(gym.make("LunarLander-v2"), directory="videos", force=True)
try:
    generate_batch(env, 1, t_max=5000)
finally:
    # Always close the monitor so the recording is finalised and the
    # environment is released even if the rollout raises.
    env.close()

# Save our model

In [None]:
# Persist the whole trained network object (not just its state_dict) to disk.
torch.save(obj=net, f='model_best.pth.tar')