In [1]:
import gymnasium as gym
import os
import torch
from stable_baselines3 import SAC
import random
import gymnasium as gym
from PIL import Image
import time
from stable_baselines3.common.callbacks import CallbackList, CheckpointCallback, EvalCallback
from stable_baselines3.common.monitor import Monitor
from gymnasium import ObservationWrapper
from gymnasium.wrappers import PixelObservationWrapper, FrameStack
from gymnasium.spaces import Box, Discrete
import numpy as np
import yaml
from experiment import VAEXperiment
from models import *
import torch
from torchvision import transforms
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv


In [2]:
env = gym.make(
    "LunarLander-v2",
    continuous= True,
    gravity= -10.0,
    enable_wind= False,
    wind_power= 15.0,
    turbulence_power= 1.5,
    render_mode ='rgb_array'
)

In [31]:
action_space = env.action_space
low = action_space.low
high = action_space.high
middle_action = (max_action + min_action) / 2  # Calculate middle action
print("Box action space detected.")
print("Minimum action values:", min_action)
print("Maximum action values:", max_action)
print("Middle action values:", middle_action)
actions = np.vstack([low, high, [low[0], high[1]], [high[0], low[1]], [0, 0],
                     [0, high[1]], [0, low[1]], [low[0], 0], [high[0], 0]])
print(actions)

Box action space detected.
Minimum action values: [-1. -1.]
Maximum action values: [1. 1.]
Middle action values: [0. 0.]
[[-1. -1.]
 [ 1.  1.]
 [-1.  1.]
 [ 1. -1.]
 [ 0.  0.]
 [ 0.  1.]
 [ 0. -1.]
 [-1.  0.]
 [ 1.  0.]]


In [31]:
###DATA COLLECTION

# collect data with model or random start actions
# saves the images 
# returns num_of_images
# save location needs to be specified in VAE configs/hyperparams.yaml
# default path is Data/img_Data/img_Data_x.jpeg
def collect_imgs(num_of_episodes=2, num_of_previous_imgs =0,
                 data_dir="Data/",data_name = "lunar_Data",
                 model=None,add_all_action_max=False):
    
    env = gym.make("LunarLander-v2",
                    continuous= True,
                    gravity= -10.0,
                    enable_wind= False,
                    wind_power= 15.0,
                    turbulence_power= 1.5,
                    render_mode ='rgb_array')
    obs, info = env.reset()
    images = []
    #states = []
    #actions = []
    
    
    
    save_path = data_dir+data_name
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    i=num_of_previous_imgs#img number
    
    #Random actions
    if model == None:
        print('start collecting images')
        for episode in range(num_of_episodes):
            observation, info = env.reset()
            done = False
            while done == False: 
                action = env.action_space.sample()
                state, reward, terminated, truncated, info = env.step(action)
                if terminated:
                    done = True
                if truncated:
                    done = True
                current_frame = env.render()
                i+=1
                im = Image.fromarray(np.array(current_frame))
                im.save(save_path+'/'+data_name+'_'+str(i)+'.jpeg')
                #images.append(current_frame)
                #states.append(state)
                #actions.append(action)
            print(episode+1,'/',num_of_episodes)
    #Model predicts actions
    else:
        #env needs to be wrapped to the needs of the model
        env = PixelObservationWrapper(env)
        vae = get_vae(version='version_18')
        env = VAE_ENC(env, vae)
        env = FrameStack(env, num_stack=2)
        env = Monitor(env)
        #env = DummyVecEnv([lambda : env])
        print('start collecting images with the model')
        for episode in range(num_of_episodes):
            observation, info = env.reset()
            done = False
            while not done: 
                action, _states = model.predict(observation, deterministic=True)
                observation, reward, terminated, truncated, info = env.step(action)
                #state, reward, terminated, truncated, info = env.step(action)
                #print(terminated, truncated)
                if terminated:
                    done = True
                if truncated:
                    done = True
                current_frame = env.render()

                i+=1
                im = Image.fromarray(np.array(current_frame))
                im.save(save_path+'/'+data_name+'_'+str(i)+'.jpeg')

            print(episode+1,'/',num_of_episodes)
    #extreme actions for a whole episode each
    if add_all_action_max:
        action_space = env.action_space
        if isinstance(action_space, gym.spaces.Discrete):
            # Handle Discrete action space
            min_action = 0
            max_action = action_space.n - 1
            middle_action = (max_action + min_action) // 2  # Calculate middle action
            print("Discrete action space detected.")
            print("Minimum action value:", min_action)
            print("Maximum action value:", max_action)
            print("Middle action value:", middle_action)
        elif isinstance(action_space, gym.spaces.Box):
            # Handle Box action space
            low = action_space.low
            high = action_space.high            
            print("Box action space detected.")
            print("Minimum action values:", low)
            print("Maximum action values:", high)
            
            actions = np.vstack([low, high, [low[0], high[1]], [high[0], low[1]], [0, 0],
                                 [0, high[1]], [0, low[1]], [low[0], 0], [high[0], 0]])            
            print("total extreme actions", len(actions))
            
        else:
            # Handle other types of action space
            print("Unsupported action space type.")
            
        
        
        for episode in range(len(actions)):
            print('start collecting extreme action episode images')
            observation, info = env.reset()
            done = False
            while done == False: 
                action = actions[episode]
                state, reward, terminated, truncated, info = env.step(action)
                if terminated:
                    done = True
                if truncated:
                    done = True
                current_frame = env.render()

                i+=1
                im = Image.fromarray(np.array(current_frame))
                im.save(save_path+'/'+data_name+'_'+str(i)+'.jpeg')

            print(episode+1,'/',len(actions))
        

    return i


In [41]:
i=collect_imgs(num_of_episodes=50,add_all_action_max=True)
print(i)

start collecting images
1 / 50
2 / 50
3 / 50
4 / 50
5 / 50
6 / 50
7 / 50
8 / 50
9 / 50
10 / 50
11 / 50
12 / 50
13 / 50
14 / 50
15 / 50
16 / 50
17 / 50
18 / 50
19 / 50
20 / 50
21 / 50
22 / 50
23 / 50
24 / 50
25 / 50
26 / 50
27 / 50
28 / 50
29 / 50
30 / 50
31 / 50
32 / 50
33 / 50
34 / 50
35 / 50
36 / 50
37 / 50
38 / 50
39 / 50
40 / 50
41 / 50
42 / 50
43 / 50
44 / 50
45 / 50
46 / 50
47 / 50
48 / 50
49 / 50
50 / 50
Box action space detected.
Minimum action values: [-1. -1.]
Maximum action values: [1. 1.]
total extreme actions 9
start collecting extreme action episode images
1 / 3
start collecting extreme action episode images
2 / 3
start collecting extreme action episode images
3 / 3
start collecting extreme action episode images
4 / 3
start collecting extreme action episode images
5 / 3
start collecting extreme action episode images
6 / 3
start collecting extreme action episode images
7 / 3
start collecting extreme action episode images
8 / 3
start collecting extreme action episode images

### VAE TRAIN

# RL Training

## get the VAE


In [2]:
from gymnasium import ObservationWrapper
from gymnasium.wrappers import PixelObservationWrapper, FrameStack
from gymnasium.spaces import Box, Discrete
import numpy as np
import yaml
from experiment import VAEXperiment
from models import *
import torch
from torchvision import transforms
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

class AddGaussianNoise(object):
    def __init__(self, mean=0., std=0.1):
        self.std = std
        self.mean = mean
        
    def __call__(self, tensor):
        return tensor + torch.randn(tensor.size()) * self.std + self.mean
    
    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

#ideas
#maybe use wrapper to catch and save frames during RL model learning
#alternative is to use the model afterwards to generate frames during a test run
#
#??return mu, std or sample or just mu??
#
class VAE_ENC(ObservationWrapper):
    def __init__(self, env, vae,
                 mean=0,std=0.1,
                 size=(64,64)):
        super().__init__(env)
        #new obs space
        self.observation_space = Box(shape=(10,), low=-np.inf, high=np.inf)

        self.vae = vae
        #transforms
        self.mean = mean
        self.std = std
        self.size = size
        
        
        
    def observation(self, obs):
        #get frame
        #print(obs)
        frame = obs['pixels']#.to('cuda')
        #transform for VAE
        val_transforms = transforms.Compose([transforms.ToTensor(),
        #transforms.RandomHorizontalFlip(),
        AddGaussianNoise(self.mean, self.std),
        transforms.Resize(self.size),
        #transforms.Grayscale(),
        #transforms.Normalize(self.mean, self.std),
        ])
        frame = val_transforms(frame) #(c,h,w)
        frame = torch.unsqueeze(frame, 0)#.to(self.device) #make it (1,c,h,w)
        enc = self.vae.encode(frame)    
        enc = np.array([tensor.detach().cpu().numpy() for tensor in enc])
        enc = np.array(enc[0][0])#, enc[1][0]]) ## mu, std #  give only mu?
        return enc
    
def get_vae(version='version_0',log_directory='logs/BCE_sum_VAE/MSSIMVAE/'):

    model_path=log_directory+'/'+version+'/hparams.yaml'
    ckpt_path=log_directory+'/'+version+'/checkpoints/last.ckpt'

    config = yaml.safe_load(open(model_path))
    model = vae_models[config['model_params']['name']](**config['model_params'])
    ckpt = torch.load(ckpt_path)
    experiment = VAEXperiment(model, config['exp_params'])
    experiment.load_state_dict(ckpt['state_dict'])      
    vae = experiment.model
    return vae

In [3]:
vae = get_vae(version='version_24')

MSSIMVAE(
  (encoder): Sequential(
    (0): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (1): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (2): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (3): Sequential(
      (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): LeakyReLU(negative_slope=0.01)
    )
    (4): Sequ

### make the env


In [4]:
env = gym.make("LunarLander-v2",
                continuous= True,
                gravity= -10.0,
                enable_wind= False,
                wind_power= 15.0,
                turbulence_power= 1.5,
                render_mode ='rgb_array')
seed = 42
env.reset(seed=seed)
env = PixelObservationWrapper(env)
env = VAE_ENC(env, vae)
env = FrameStack(env, num_stack=2)
env = Monitor(env)

In [5]:
obs, info = env.reset()

In [6]:
env.observation_space


Box(-inf, inf, (2, 10), float32)

In [7]:
obs.shape

(2, 10)

In [8]:
obs[1]

array([ -5.5619473 ,  15.425905  ,  11.8485155 ,  -7.8212442 ,
        24.54262   ,   8.515593  , -11.751206  ,  -0.42572248,
        10.861605  , -32.383194  ], dtype=float32)

In [None]:
  n_timesteps=float 5e5,
  policy='MlpPolicy',
  batch_size= 256,
  learning_rate= 'lin_7.3e-4',
  buffer_size= 1000000,
  ent_coef= 'auto',
  gamma= 0.99,
  tau=0.01,
  train_freq= 1,
  gradient_steps= 1,
  learning_starts= 10000,
  policy_kwargs= dict(net_arch=[400, 300]),

In [9]:
from typing import Callable

def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        return progress_remaining * initial_value

    return func


models_dir = f"RLmodels/LunarLander-v2_first_run_l10/"
logdir = f"RLlogs/LunarLander-v2_first_run_l10/"
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
if not os.path.exists(logdir):
    os.makedirs(logdir)

    
total_timesteps =  500000  # Total number of timesteps for training
initial_lr = 7.3e-4  # Initial learning rate

# Define the learning rate schedule
lr_schedule = linear_schedule(initial_lr)
    
    
model = SAC(   
    policy='MlpPolicy',
    env=env,
    seed=seed,
    batch_size= 256,
    learning_rate= lr_schedule,
    buffer_size= 1000000,
    ent_coef= 'auto',
    gamma= 0.99,
    tau=0.01,
    train_freq= 1,
    gradient_steps= 1,
    learning_starts= 10000,
    policy_kwargs= dict(net_arch=[400, 300]),
    use_sde= True,
    tensorboard_log=logdir)

In [14]:
iteration=3
models_dir = f"RLmodels/LunarLander-v2/"+str(iteration)
if not os.path.exists(models_dir):
    os.makedirs(models_dir)
agent_name = "SAC"



In [10]:
iteration=3
agent_name = "lunarlander_thirdVAE_l10_SAC"
callback = CheckpointCallback(save_freq=10000, save_path=models_dir+str(iteration))   
#train the RL agent
model.learn(total_timesteps=total_timesteps,callback=callback,tb_log_name=agent_name)

<stable_baselines3.sac.sac.SAC at 0x7f31442e0100>

In [11]:
agent = model

## collect more images with RL agent

In [7]:
agent= SAC.load('RLmodels/LunarLander-v2_first_run_l10/1/rl_model_250000_steps')

In [15]:
save_path='BCEv24_agent3_data'
data_name='VAEruns_after_third_train'
num_of_episodes=100
i=7921
for episode in range(num_of_episodes):
    observation, info = env.reset()
    done = False
    while not done: 
        action, _states = agent.predict(observation, deterministic=True)
        observation, reward, terminated, truncated, info = env.step(action)
        #state, reward, terminated, truncated, info = env.step(action)
        #print(terminated, truncated)
        if terminated:
            done = True
        if truncated:
            done = True
        current_frame = env.render()

        i+=1
        im = Image.fromarray(np.array(current_frame))
        im.save(save_path+'/'+data_name+'_'+str(i)+'.jpeg')

In [24]:



images = []
obs = model.env.reset()
img = model.env.render(mode="rgb_array")
for i in range(450):
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, _ ,_ = model.env.step(action)
    img = model.env.render(mode="rgb_array")

imgs = [Image.fromarray(img) for img in images]
# duration is the number of milliseconds between frames; this is 40 frames per second
imgs[0].save("lunar_lander_sac_l10_v1.gif", save_all=True, append_images=imgs[1:], duration=50, loop=0)

In [77]:
model_folder = 'RLmodels/LunarLander-v2/11'
# Get a list of all files in the directory
files = os.listdir(model_folder)
# Sort the files based on modification time (newest first)
files.sort(key=lambda x: os.path.getmtime(os.path.join(model_folder, x)))


In [80]:
model0 = SAC.load(model_folder+'/'+files[0], env=env)

In [82]:
num_of_imgs = 6377

#collect new images with trained RL agent
#aprox double number of images to train the vae on

num_of_episodes = 2

target_num_of_imgs = num_of_imgs*2
#get ~ enough imgs, but its done via episodes.. so if the model becomes successful it would be way less imgs    
num_of_imgs += collect_imgs(num_of_episodes=6, num_of_previous_imgs = num_of_imgs,
            data_dir="Data/",data_name = "img_Data",
            model=model) 



model_folder = 'RLmodels/LunarLander-v2/11'
# Get a list of all files in the directory
files = os.listdir(model_folder)
files.sort(key=lambda x: os.path.getmtime(os.path.join(model_folder, x)))
for file in files:   
    model0 = SAC.load(model_folder+'/'+file, env=env)

    num_of_imgs += collect_imgs(num_of_episodes=num_of_episodes,num_of_previous_imgs = num_of_imgs,
            data_dir="Data/",data_name = "lunar_Data",
            model=model0) 

start collecting images with the model
1 / 6
2 / 6
3 / 6
4 / 6
5 / 6
6 / 6
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
start collecting images with the model
1 / 2
2 / 2
