In [4]:
!pip install gym
!pip install numpy
!pip install torch
!pip install gym[box2d]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(obs_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, n_actions)
        
    def forward(self, x):
        x = self.fc1(x)
        return self.fc2(x)

In [5]:
def generate_batch(env,batch_size, t_max=1000):
    
    activation = nn.Softmax(dim=1)
    batch_actions,batch_states, batch_rewards = [],[],[]
    
    for b in range(batch_size):
        states,actions = [],[]
        total_reward = 0
        s = env.reset()
        for t in range(t_max):
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(net(s_v))
            act_probs = act_probs_v.data.numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)
            new_s, r, done, info = env.step(a)
            states.append(s)
            actions.append(a)
            total_reward += r
            s = new_s
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
    return batch_states, batch_actions, batch_rewards

In [6]:
def filter_batch(states_batch,actions_batch,rewards_batch,percentile=50):
    reward_threshold = np.percentile(rewards_batch, percentile)
 
    elite_states = []
    elite_actions = []
 
 
    for i in range(len(rewards_batch)):
      if rewards_batch[i] > reward_threshold:
        for j in range(len(states_batch[i])):
          elite_states.append(states_batch[i][j])
          elite_actions.append(actions_batch[i][j])
    
    return elite_states,elite_actions

In [11]:
batch_size = 100
session_size = 100
percentile = 80
hidden_size = 200
learning_rate = 0.0025
completion_score = -100
env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
#neural network
net = Net(n_states, hidden_size, n_actions)
#loss function
objective = nn.CrossEntropyLoss()
#optimisation function
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)
for i in range(session_size):
    #generate new sessions
    batch_states,batch_actions,batch_rewards = generate_batch(env, batch_size, t_max=5000) 
    elite_states, elite_actions = filter_batch(batch_states,batch_actions,batch_rewards,percentile)
    
    optimizer.zero_grad()
    tensor_states = torch.FloatTensor(elite_states)
    tensor_actions = torch.LongTensor(elite_actions)
    action_scores_v = net(tensor_states)
    loss_v = objective(action_scores_v, tensor_actions)
    loss_v.backward()
    optimizer.step()
    #show results
    mean_reward, threshold = np.mean(batch_rewards),np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f"  % (i, loss_v.item(), mean_reward, threshold))
    
    #check if 
    if np.mean(batch_rewards)> completion_score:
        print("Environment has been successfullly completed!")
        break



0: loss=1.377, reward_mean=-174.2, reward_threshold=-100.0
1: loss=1.374, reward_mean=-181.6, reward_threshold=-105.8
2: loss=1.366, reward_mean=-167.6, reward_threshold=-94.0
3: loss=1.366, reward_mean=-157.2, reward_threshold=-100.2
4: loss=1.365, reward_mean=-153.9, reward_threshold=-98.9
5: loss=1.359, reward_mean=-140.3, reward_threshold=-85.2
6: loss=1.357, reward_mean=-124.2, reward_threshold=-89.3
7: loss=1.368, reward_mean=-145.6, reward_threshold=-99.8
8: loss=1.352, reward_mean=-129.6, reward_threshold=-92.7
9: loss=1.360, reward_mean=-136.2, reward_threshold=-91.5
10: loss=1.340, reward_mean=-123.7, reward_threshold=-88.7
11: loss=1.360, reward_mean=-128.6, reward_threshold=-84.0
12: loss=1.344, reward_mean=-123.4, reward_threshold=-88.7
13: loss=1.336, reward_mean=-126.2, reward_threshold=-85.2
14: loss=1.355, reward_mean=-128.1, reward_threshold=-89.2
15: loss=1.328, reward_mean=-122.1, reward_threshold=-82.9
16: loss=1.337, reward_mean=-123.1, reward_threshold=-90.8
17: 

KeyboardInterrupt: ignored

In [16]:
!pip install gym.wrappers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gym.wrappers
  Downloading gym-wrappers-0.1.0.tar.gz (1.1 kB)
Building wheels for collected packages: gym.wrappers
  Building wheel for gym.wrappers (setup.py) ... [?25l[?25hdone
  Created wheel for gym.wrappers: filename=gym_wrappers-0.1.0-py3-none-any.whl size=1412 sha256=1066b35767d3596396bb4cacef04d83d352d36df3612d8856b7b46337c310e0c
  Stored in directory: /root/.cache/pip/wheels/b8/4c/cb/f0f1d03994064aae9968c9338d6e8f3ffe622635aab8600c80
Successfully built gym.wrappers
Installing collected packages: gym.wrappers
Successfully installed gym.wrappers-0.1.0


In [21]:

from gym.wrappers.record_video import RecordVideo
env= gym.make("LunarLander-v2", render_mode="rgb_array")
env = RecordVideo(env, './video',  episode_trigger = lambda episode_number: True)
generate_batch(env, 1, t_max=5000)
env.close()

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  f"Overwriting existing videos at {self.video_folder} folder "
