In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import numpy as np

# Environment instance that the training loop in this notebook resets and steps.
env = gym.make('LunarLander-v2')


In [2]:
class LunarLanderNet(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        dim_in: number of input features per sample.
        hidden_dim: width of the single hidden layer (also stored on the
            instance as ``hidden_dim``).
        dim_out: number of output values per sample.

    The forward pass returns the raw output of the second linear layer; no
    softmax or other normalisation is applied inside the module.
    """

    def __init__(self, dim_in, hidden_dim, dim_out):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Input features -> hidden activations.
        self.fc1 = nn.Linear(dim_in, hidden_dim)
        # Hidden activations -> raw outputs.
        self.fc2 = nn.Linear(hidden_dim, dim_out)

    def forward(self, input):
        """Map a tensor whose last dimension is ``dim_in`` to raw outputs."""
        hidden = self.fc1(input)
        hidden = F.relu(hidden)
        return self.fc2(hidden)

In [3]:
# Use the GPU when one is available and fall back to the CPU otherwise, so this
# cell does not raise on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Policy network with dim_in=8, hidden_dim=200 and dim_out=4.
model = LunarLanderNet(8, 200, 4).to(device)
# CrossEntropyLoss is fed the network's raw outputs together with integer
# class targets (see train), so the model itself applies no softmax.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [4]:
def train(obsList, actionList):
    """Run one optimisation step of the global ``model`` on a batch.

    Args:
        obsList: sequence of observations, one per sample. Presumably each is
            a numeric vector matching the model's input size -- confirm
            against the caller.
        actionList: sequence of integer action indices, one per observation,
            used as classification targets.

    Returns:
        None. The update is applied in place through the global ``optimizer``.
        An empty batch is skipped without touching the model.
    """
    if len(obsList) == 0:
        # No samples to learn from: a forward pass on an empty batch would
        # fail, so leave the model unchanged.
        return

    # Follow whatever device the model lives on instead of hard-coding CUDA.
    device = next(model.parameters()).device

    # Stack into a single contiguous array before handing it to torch; this
    # avoids the slow element-by-element conversion of a list of arrays and
    # the float64 -> float32 round trip.
    inputTensor = torch.as_tensor(np.asarray(obsList, dtype=np.float32), device=device)
    target = torch.as_tensor(np.asarray(actionList, dtype=np.int64), device=device)

    optimizer.zero_grad()
    output = model(inputTensor)
    loss = loss_function(output, target)
    loss.backward()
    optimizer.step()

In [5]:
def takeAction(actionArray):
    """Sample an action index from a 1-D tensor of unnormalised scores.

    The scores are turned into probabilities with a softmax over dimension 0
    and one index is drawn with Python's ``random`` module, so ``random.seed``
    controls the draw.

    Args:
        actionArray: 1-D tensor with one score per action. Any length is
            accepted; the indices ``0 .. len - 1`` form the population.

    Returns:
        int: the sampled action index.
    """
    # detach(): sampling needs no autograd graph.
    # tolist(): hand random.choices plain Python floats rather than tensor
    # elements, which would otherwise be compared one by one during sampling.
    probabilities = F.softmax(actionArray.detach(), dim=0).tolist()
    return random.choices(range(len(probabilities)), weights=probabilities)[0]

In [6]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile):
    """Keep the (state, action) pairs of the best-rewarded episodes.

    Args:
        states_batch: one sequence of states per episode.
        actions_batch: one sequence of actions per episode; assumed to be
            aligned with ``states_batch`` step by step.
        rewards_batch: one total reward per episode.
        percentile: percentile (0-100) of ``rewards_batch`` used as the
            cut-off; episodes at or above it are kept.

    Returns:
        (elite_states, elite_actions): two flat lists containing the states
        and actions of every kept episode, concatenated in episode order.
        Both are empty when ``rewards_batch`` is empty.
    """
    elite_states = []
    elite_actions = []

    if len(rewards_batch) == 0:
        # np.percentile is undefined on an empty input.
        return elite_states, elite_actions

    reward_threshold = np.percentile(rewards_batch, percentile)

    for states, actions, episode_reward in zip(states_batch, actions_batch, rewards_batch):
        # ">=" rather than ">": when rewards tie at the threshold (e.g. every
        # episode scored the same) a strict comparison would select nothing.
        if episode_reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions

In [7]:
from collections import defaultdict

# Training loop: every iteration plays a batch of episodes with the current
# policy, keeps the best-rewarded ones via filter_batch, and fits the policy
# to the actions taken in those episodes via train.
meanReward = 0
iterations = 100
episodes_per_iteration = 100
max_steps_per_episode = 500
elite_percentile = 80
target_mean_reward = 100

# Build rollout inputs on whatever device the model lives on.
rollout_device = next(model.parameters()).device

for iteration in range(iterations):
    actionList = []
    rewardList = []
    observationList = []
    for i_episode in range(episodes_per_iteration):
        envObs = env.reset()
        episodeReward = 0
        actions, observations = [], []
        for t in range(max_steps_per_episode):
            # Rollouts only sample actions; no gradients are needed here.
            with torch.no_grad():
                actionArray = model(torch.tensor(envObs, dtype=torch.float, device=rollout_device))
            action = takeAction(actionArray)
            observation, reward, done, info = env.step(action)
            episodeReward += reward
            actions.append(action)
            # Store the observation the action was chosen from, not the one
            # that resulted from it.
            observations.append(envObs)
            envObs = observation
            if done:
                break
        # Record the episode whether it ended or ran into the step cap, so
        # capped episodes are not silently left out of the batch and of the
        # reported mean reward.
        rewardList.append(episodeReward)
        actionList.append(actions)
        observationList.append(observations)

    elite_states, elite_actions = filter_batch(observationList, actionList, rewardList, elite_percentile)
    train(elite_states, elite_actions)
    meanReward = np.mean(rewardList)
    print("\riteration: %d meanReward= %d " %(iteration,meanReward))
    # Stop early once the batch average clears the target.
    if meanReward > target_mean_reward:
        break
env.close()

iteration: 0 meanReward= -201 
iteration: 1 meanReward= -141 
iteration: 2 meanReward= -132 
iteration: 3 meanReward= -120 
iteration: 4 meanReward= -135 
iteration: 5 meanReward= -148 
iteration: 6 meanReward= -130 
iteration: 7 meanReward= -120 
iteration: 8 meanReward= -116 
iteration: 9 meanReward= -121 
iteration: 10 meanReward= -120 
iteration: 11 meanReward= -125 
iteration: 12 meanReward= -120 
iteration: 13 meanReward= -116 
iteration: 14 meanReward= -108 
iteration: 15 meanReward= -96 
iteration: 16 meanReward= -85 
iteration: 17 meanReward= -78 
iteration: 18 meanReward= -74 
iteration: 19 meanReward= -80 
iteration: 20 meanReward= -94 
iteration: 21 meanReward= -138 
iteration: 22 meanReward= -181 
iteration: 23 meanReward= -214 
iteration: 24 meanReward= -187 
iteration: 25 meanReward= -211 
iteration: 26 meanReward= -137 
iteration: 27 meanReward= -90 
iteration: 28 meanReward= -73 
iteration: 29 meanReward= -44 
iteration: 30 meanReward= -53 
iteration: 31 meanReward= -4

In [8]:
# Loss function? The NLL does not really improve.
# The reward stays the same at best, or rather gets worse.
# The reward gets worse when the action with the highest probability is taken instead of a "randomly" sampled one.
# Mean reward stays at roughly -125 to -135 and meanReward20 at -70 to -80, with upward outliers (-40).


#objective = nn.CrossEntropyLoss()
# no softmax

In [None]:
# NOTE(review): the recording cell's saved error output asks for the `ffmpeg` or
# `avconv` executable. ffmpeg-python is presumably only a Python wrapper and may
# not provide that binary -- confirm, and install ffmpeg through the system
# package manager if so.
!pip install ffmpeg-python

In [1]:
import gym
import random

# Play one episode with uniformly random actions while the Monitor wrapper
# records it into the "recording_lunar" directory (force=True is passed to the
# wrapper; presumably it overwrites earlier recordings -- confirm).
env = gym.make("LunarLander-v2")
env = gym.wrappers.Monitor(env, "recording_lunar", force=True)

no_of_actions = env.action_space.n
total_reward = 0

try:
    state = env.reset()
    done = False

    while not done:
        action = random.randrange(no_of_actions)  # choose a random action
        state, reward, done, _ = env.step(action)
        print(state)
        total_reward += reward
finally:
    # Always close the wrapped environment, even if the episode fails midway,
    # so it is not left open.
    env.close()


print("\ndone!")
print(f"Total reward: {total_reward}")

DependencyNotInstalled: Found neither the ffmpeg nor avconv executables. On OS X, you can install ffmpeg via `brew install ffmpeg`. On most Ubuntu variants, `sudo apt-get install ffmpeg` should do it. On Ubuntu 14.04, however, you'll need to install avconv with `sudo apt-get install libav-tools`.