In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import random
import gym
from gym import envs

env = gym.make('LunarLander-v2') # Create environment

In [16]:
s = env.reset()
print(s.tolist())

[0.004814433865249157, 1.4131004810333252, 0.4876282811164856, 0.09689344465732574, -0.005571866873651743, -0.11045505851507187, 0.0, 0.0]


In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Agent(nn.Module): 
    
    def __init__(self, num_hidden = 128): 
        super(Agent, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(8, num_hidden),
            nn.ReLU(),
            nn.Linear(num_hidden, num_hidden), 
            nn.ReLU(),
            nn.Linear(num_hidden, num_hidden), 
            nn.ReLU(),
            nn.Linear(num_hidden, 4)
        )        
        
    def forward(self, data): 
        x = self.layers(data)
        return F.softmax(x, dim=1)
    
model = Agent()

In [44]:
def compute_returns(rewards, discount_factor):
    """Compute discounted returns."""
    returns = np.zeros(len(rewards))
    returns[-1] = rewards[-1]
    for t in reversed(range(len(rewards)-1)):
        returns[t] = rewards[t] + discount_factor * returns[t+1]
    return returns

In [45]:
env = gym.make('LunarLander-v2') 
def REINFORCE(episodes=5000): 
    
    def compute_loss(a_probs, returns):
        return -torch.mean(torch.matmul(torch.log(a_probs), torch.from_numpy(returns).float()))
    
    # First sample a lot of episode
    max_length_episode = 1001
    
    gamma = 0.9
    epsilon = 0.1
    
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.005)
    
    losses= list()
    total_ret = list()
    for i in range(episodes):
        #epsilon = epsilon * 0.95
        s = env.reset()
        sars = list()
        optimizer.zero_grad()
        for j in range(max_length_episode): 
            logs = model(torch.tensor([s], dtype=torch.float))
            #print(logs)
            r = random.uniform(0,1)
            a = torch.multinomial(logs, 1).item()
            if (r < epsilon):
#                 a = env.action_space.sample()
                s1, r, done, _ = env.step(0)
                s = s1
                continue
            s1, r, done, _ = env.step(a)
            sars.append((s.tolist(), a, r))
            #print(sars[-1])
            s = s1
            if done: 
                break
        #print(f"Episode: {i}: sars {sars}")
        sars = np.array(sars)
        states = np.vstack(sars[:, 0])
        actions = np.vstack(sars[:, 1])     
        rewards = np.array(sars[:,2], dtype=float)
        returns = compute_returns(rewards, gamma)
        a_probs = model(torch.from_numpy(states).float()).gather(1, torch.from_numpy(actions)).view(-1)
        loss = compute_loss(a_probs, returns)
        #print(loss)
        losses.append(loss.item())
        total_ret.append(returns[0])
        loss.backward()
        optimizer.step()
            
        if i % 100 == 0:
            #print(f"Epsilon: {epsilon}")
            print(f"Loss: {loss.item()}")
            print(f"Return: {returns[0]}")
    
    return total_ret, losses
        #print(actions)
        #print(model(torch.tensor(s, dtype=torch.float)).gather(1, torch.from_numpy(actions)).view(-1))
r, l = REINFORCE()



  sars = np.array(sars)


Loss: -3724.0009765625
Return: -10.450081700628214
Loss: -2842.703125
Return: -0.6234806969645783
Loss: -642.762939453125
Return: -12.352992574969363
Loss: -1775.335693359375
Return: -13.886263439497831
Loss: -0.30898886919021606
Return: -34.22972983993885
Loss: -0.3366062343120575
Return: -10.524194333347475
Loss: -0.009178798645734787
Return: -37.66151419775379
Loss: -0.1919192671775818
Return: -27.155809618051112
Loss: -0.2696805000305176
Return: -4.321965204451993
Loss: -0.0005316638271324337
Return: -28.94929764697186
Loss: -0.0002367492124903947
Return: -26.48529659773027


KeyboardInterrupt: 

In [46]:
s = env.reset()

try:
    for _ in range(50000):
        env.render()
        time.sleep(.01)
        a = model(torch.from_numpy(np.atleast_2d(s)).float()).argmax().item()
        s, r, done, _ = env.step(a)
        if done:
            s = env.reset()
except KeyboardInterrupt:
    env.close()
    pass