In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random

# Create the LunarLander-v2 environment through gym's registry; `env` is the
# single environment instance used by the rollout loop further down.
env = gym.make('LunarLander-v2')


In [2]:
class LunarLanderNet(nn.Module):
    """Small feed-forward network: ``dim_in`` -> ``hidden_dim`` (ReLU) -> ``dim_out``.

    The forward pass returns the raw output of the second linear layer; no
    softmax or other normalisation is applied inside the module.

    Args:
        dim_in: Size of the input feature vector.
        hidden_dim: Number of units in the single hidden layer.
        dim_out: Number of output values.
    """

    def __init__(self, dim_in, hidden_dim, dim_out):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Input -> hidden projection, then hidden -> output head.
        self.fc1 = nn.Linear(in_features=dim_in, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=dim_out)

    def forward(self, input):
        """Map ``input`` (last dimension ``dim_in``) to unnormalised outputs."""
        hidden = torch.relu(self.fc1(input))
        return self.fc2(hidden)

In [3]:
# Policy network: 8 input features, one hidden layer of 500 units, 4 outputs
# (one score per action index sampled in takeAction).
model = LunarLanderNet(dim_in=8, hidden_dim=500, dim_out=4)

# Adam over all network parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Cross-entropy between the network's raw outputs and the action index that
# was actually taken; it consumes unnormalised scores directly.
loss_function = nn.CrossEntropyLoss()

In [4]:
def train(samples, obsList, actionList):
    """Fit the global ``model`` to the (observation, action) pairs of the given episodes.

    Every recorded step is used as one training example: the network output
    for the observation is scored with the global ``loss_function`` against
    the action that was taken, and the global ``optimizer`` performs one
    update per step.

    Args:
        samples: Iterable of episode keys to train on.
        obsList: Mapping from episode key to the list of observation tensors.
        actionList: Mapping from episode key to the list of action indices,
            aligned element-wise with ``obsList[episode]``.

    Returns:
        The mean per-step loss as a Python float, or ``0.0`` when the
        selected episodes contain no steps at all.
    """
    totalLoss = 0.0
    count = 0
    for episode in samples:
        for obs, action in zip(obsList[episode], actionList[episode]):
            model.zero_grad()
            # Add a batch dimension of 1; -1 avoids hard-coding the number of actions.
            output = model(obs).view(1, -1)
            target = torch.tensor([action], dtype=torch.long)
            loss = loss_function(output, target)
            loss.backward()
            optimizer.step()
            # Accumulate a plain float: summing the loss tensors themselves
            # would keep a reference to every step's autograd history.
            totalLoss += loss.item()
            count += 1

    # Nothing to average (no episodes or only empty ones): avoid dividing by zero.
    if count == 0:
        return 0.0
    return totalLoss / count

In [5]:
def takeAction(actionArray, greedy=False):
    """Choose an action index from a 1-D tensor of unnormalised action scores.

    Args:
        actionArray: 1-D tensor with one score per action (any number of actions).
        greedy: When True, return the index of the highest score instead of
            sampling. Defaults to False (sample from the softmax distribution).

    Returns:
        The chosen action index as a Python ``int``.
    """
    if greedy:
        return int(torch.argmax(actionArray).item())

    # Convert to plain floats so random.choices receives ordinary numeric
    # weights rather than relying on tensor arithmetic/comparison semantics.
    probabilities = F.softmax(actionArray, dim=0).detach().tolist()
    # The population is derived from the input, so it is not tied to 4 actions.
    return random.choices(range(len(probabilities)), weights=probabilities)[0]

In [6]:
from collections import defaultdict

EPISODES_PER_ITERATION = 100  # rollouts collected before each training pass
MAX_STEPS_PER_EPISODE = 500   # hard cap on environment steps per rollout
ELITE_EPISODES = 20           # best-scoring rollouts used as training data
TARGET_MEAN_REWARD = 100      # stop once the average episode reward reaches this

# Each iteration: collect rollouts with the current policy, keep the
# best-rewarded episodes, and train the network to imitate the actions taken
# in those episodes.
# NOTE(review): uses the gym API in which reset() returns only the observation
# and step() returns a 4-tuple, as in the original cell -- confirm gym version.
meanReward = 0
iteration = 0
while meanReward < TARGET_MEAN_REWARD:
    actionDict = defaultdict(list)
    rewardList = {}
    observationList = defaultdict(list)
    totalReward = 0
    for i_episode in range(EPISODES_PER_ITERATION):
        observation = env.reset()
        episodeReward = 0
        for _ in range(MAX_STEPS_PER_EPISODE):
            state = torch.tensor(observation, dtype=torch.float)
            with torch.no_grad():
                action = takeAction(model(state))
            # Store the observation the action was chosen FROM, so that
            # observationList and actionDict stay aligned step by step.
            observationList[i_episode].append(state)
            actionDict[i_episode].append(action)
            observation, reward, done, _ = env.step(action)
            episodeReward += reward
            if done:
                break
        rewardList[i_episode] = episodeReward
        totalReward += episodeReward

    # Compare the loop condition against the true mean, not the summed reward.
    meanReward = totalReward / EPISODES_PER_ITERATION

    # Episode keys ordered by reward, best first; keep only the top slice.
    samples = sorted(rewardList, key=rewardList.get, reverse=True)[:ELITE_EPISODES]
    bestMean = sum(rewardList[episode] for episode in samples) / len(samples)

    loss = train(samples, observationList, actionDict)
    # "\r" plus end="" rewrites a single status line; print() avoids needing
    # the sys module, which this notebook never imports.
    print("\riteration: %d meanReward= %d meanReward20 %d loss: %f" % (iteration, meanReward, bestMean, loss), end="", flush=True)
    iteration += 1
env.close()

iteration: 124 meanReward= -191 meanReward20 -85 loss: 1.269115

KeyboardInterrupt: 

In [None]:
# Loss function? The NLL does not really improve.
# The reward stays the same at best, and tends to get worse.
# The reward gets worse when the action with the highest probability is taken instead of a "randomly" sampled one.
# Mean reward stays at roughly -125 to -135 and meanReward20 at -70 to -80, with upward outliers (-40).


# objective = nn.CrossEntropyLoss()
# no softmax (nn.CrossEntropyLoss expects raw logits and applies log-softmax internally)