# Collect data

In [None]:
import gymnasium as gym
import numpy as np
from tqdm import tqdm
import os

# --- Settings ---
TOTAL_STEPS = 20000
ENV_ID = 'LunarLander-v3'

print(f"'{ENV_ID}' 환경에서 총 {TOTAL_STEPS} 스텝의 데이터를 수집합니다.")

# --- Environment setup ---
# render_mode='rgb_array' makes env.render() return the current frame as an array.
env = gym.make(ENV_ID, render_mode='rgb_array')

# --- Data buffers (one entry per environment step) ---
obs_buffer = []
frame_buffer = []
action_buffer = []
reward_buffer = []
terminated_buffer = []
truncated_buffer = []

observation, info = env.reset()

try:
    for _ in tqdm(range(TOTAL_STEPS)):
        # Render BEFORE stepping so the stored frame shows the same state as
        # the stored observation, i.e. the state in which `action` is taken.
        # Rendering after env.step() would pair every action with the frame
        # that follows it.
        frame = env.render()

        action = env.action_space.sample()  # uniformly random policy
        next_observation, reward, terminated, truncated, info = env.step(action)

        # Store the transition (state vector, state image, action, outcome).
        obs_buffer.append(observation)
        frame_buffer.append(frame)
        action_buffer.append(action)
        reward_buffer.append(reward)
        terminated_buffer.append(terminated)
        truncated_buffer.append(truncated)

        # Start a new episode when the current one ended for either reason.
        if terminated or truncated:
            observation, info = env.reset()
        else:
            observation = next_observation
finally:
    # Release the environment even if collection is interrupted.
    env.close()

print(f"\n데이터 수집 완료! 총 {len(obs_buffer)}개의 경험을 저장했습니다.")

# --- Create the output directory ---
os.makedirs("data", exist_ok=True)

# --- Save ---
np.savez_compressed(
    f"data/{ENV_ID}_with_frames_1.npz",
    observations=np.array(obs_buffer, dtype=np.float32),  # (TOTAL_STEPS, 8)
    frames=np.array(frame_buffer, dtype=np.uint8),        # (TOTAL_STEPS, H, W, 3)
    actions=np.array(action_buffer, dtype=np.int8),
    rewards=np.array(reward_buffer, dtype=np.float32),
    terminateds=np.array(terminated_buffer, dtype=np.bool_),
    truncateds=np.array(truncated_buffer, dtype=np.bool_),
)

print(f"✅ '{ENV_ID}_with_frames_1.npz' 파일로 프레임 포함 데이터 저장 완료!")

In [None]:
import numpy as np
# Reopen the collected dataset and pull out the arrays the next cells use.
data = np.load('data/LunarLander-v3.npz')
observations_raw, actions, rewards = (
    data[key] for key in ('frames', 'actions', 'rewards')
)

# VAE train

In [None]:
import torch
from torch.utils.data import DataLoader
from torch import optim
from PIL import Image
from torchvision import transforms
from parts.VAE_CNN import VAE, vae_loss_function, CustomImageDataset
from parts.MDN_RNN import MDN_RNN, mdn_rnn_loss
from parts.controller import controller
from tqdm.notebook import tqdm

# Frames are stored channel-last; permute them to channel-first for the
# tensor-based pipeline. torch.from_numpy shares memory with the NumPy array
# instead of duplicating the whole frame buffer the way torch.tensor() does.
observations = torch.from_numpy(observations_raw).permute(0, 3, 1, 2)
vae = VAE(input_channel=3, latent_dim=1024).to('cuda:0')

# Per-frame preprocessing: tensor -> PIL image -> 64x64 -> float tensor.
resize = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),
    transforms.ToTensor()])

observations = CustomImageDataset(data=observations, transform=resize)

# Frames were recorded in temporal order, so shuffle to avoid batches made of
# consecutive, highly correlated frames.
dataloader = DataLoader(dataset=observations, batch_size=2048, shuffle=True)

optimizer = optim.AdamW(vae.parameters(), lr=5e-5)

def vae_train(vae, optimizer, dataloader, epochs=10, device=None):
    """Train ``vae`` on image batches and print the mean loss of every epoch.

    Args:
        vae: model whose forward pass returns ``(z, recon_image, mu, logvar)``.
        optimizer: optimiser built over ``vae.parameters()``.
        dataloader: iterable of image batches; must support ``len()``.
        epochs: number of passes over ``dataloader``.
        device: device the batches are moved to. Defaults to the device the
            model's parameters live on.
    """
    if device is None:
        device = next(vae.parameters()).device

    vae.train()
    # Guard the average against an empty loader.
    num_batches = max(len(dataloader), 1)

    for epoch in tqdm(range(epochs)):
        total_loss = 0.0
        for data in dataloader:
            data = data.to(device)
            _, recon_image, mu, logvar = vae(data)

            loss = vae_loss_function(recon_image, data, mu, logvar, beta=0.5)

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(vae.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch}, Everage loss: {total_loss/num_batches:.6f}')

#vae_train(vae, optimizer, dataloader, epochs=200)


In [None]:
# Write the current VAE weights to the working directory.
# NOTE(review): the later cells load VAE weights from 'model_weights/' under
# different file names — confirm this file is moved/renamed as intended.
torch.save(vae.state_dict(), 'vae-latent1024-epoch100-beta1.pth')

In [None]:
# Restore previously saved VAE weights into the existing `vae` model.
vae.load_state_dict(torch.load('model_weights/vae-latent1024-epoch100_beta0.01.pth'))

In [None]:
# Convert dataset item 555 to a PIL image; as the last expression of the cell
# the notebook displays it.
to_pil = transforms.ToPILImage()
to_pil(observations[555])

In [None]:
# Reconstruct dataset item 555 with the VAE and display the result.
vae.eval()
data = observations[555].unsqueeze(0).to("cuda:0")  # add a batch dimension
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    hidden_state, recon_image, _, _ = vae(data)
to_pil(recon_image[0])

# MDN_RNN train

In [None]:
import numpy as np
data = np.load('data/LunarLander-v3.npz')
observations_raw = data['frames']
actions = data['actions']
rewards = data['rewards']

# Episode boundaries as start indices: 0, the index after every terminal step,
# and finally the dataset length (added only if it is not already the last
# boundary, so a terminal final step does not create an empty episode).
# NOTE(review): only `terminateds` marks a boundary here although the
# collection loop also resets the environment on truncation — confirm.
terminateds = data['terminateds']
episodes = [0, *(np.flatnonzero(terminateds) + 1).tolist()]
if episodes[-1] != len(terminateds):
    episodes.append(len(terminateds))


In [None]:
from torchvision import transforms
import torch

import torch
from parts.VAE_CNN import VAE

# Keep the frames as a NumPy array: ToPILImage() expects [H, W, C] order for
# ndarrays, so no permute is needed here.
observations = observations_raw

# Per-frame preprocessing: ndarray -> PIL image -> 64x64 -> float tensor.
resize = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
    ]
)

# VAE with weights loaded from disk, switched to evaluation mode.
vae = VAE(input_channel=3, latent_dim=1024).to('cuda:0')
vae.load_state_dict(torch.load('model_weights/vae-latent1024-epoch100.pth'))
vae.eval()

In [None]:
import torch
from torch.utils.data import DataLoader
from torch import optim
from PIL import Image
from tqdm.notebook import tqdm
from parts.MDN_RNN import MDN_RNN, mdn_rnn_loss, SequenceDataset
import numpy as np

# One-hot encode the discrete actions by picking rows of an identity matrix.
action_size = 1 + int(actions.max())
action_onehot = np.eye(action_size)[actions]

mdn_rnn = MDN_RNN(
    input_size=1024,
    hidden_size=512,
    latent_space_size=1024,
    action_size=action_size,
).to('cuda:0')

seq_dataset = SequenceDataset(
    image_dataset=observations,
    transforms=resize,
    reward_dataset=rewards,
    action_dataset=action_onehot,
    episodes=episodes,
)
dataloader = DataLoader(dataset=seq_dataset, batch_size=32, num_workers=12)

optimizer = optim.AdamW(mdn_rnn.parameters(), lr=5e-5)

def rnn_train(model=mdn_rnn, dataloader=dataloader, optimizer=optimizer, epochs=200):
    """Train the MDN-RNN on latent sequences encoded by the module-level ``vae``.

    For every batch the frames are encoded by the VAE without gradients, the
    model is run on the latent/action sequences, and its outputs at step ``t``
    are scored against the latent, reward and done targets at step ``t + 1``.

    Args:
        model: network to optimise; batches are moved to the device of its
            parameters (the module-level ``vae`` must live on the same device).
        dataloader: yields ``(image, action, reward, seq_length, mask, t_done)``.
        optimizer: optimiser built over ``model``'s parameters.
        epochs: number of passes over ``dataloader``.
    """
    model.train()
    vae.eval()

    device = next(model.parameters()).device

    # Steps [length - R, length - 1) of every sequence get done-weight W_done.
    # With W_done == 1.0 this is currently a no-op; raise it to emphasise the
    # steps right before an episode ends.
    R = 4
    W_done = 1.0

    for epoch in tqdm(range(epochs)):
        total_loss = 0
        for batch_idx, (image, action, reward, seq_length, mask, t_done) in enumerate(dataloader):
            image = image.to(device)
            action = action.to(device)
            reward = reward.to(device)
            mask = mask.to(device)
            t_done = t_done.to(device)
            done_weights = torch.ones_like(t_done)

            with torch.no_grad():
                # The VAE takes (batch, channel, height, width), so fold the
                # sequence axis into the batch axis and unfold it afterwards.
                batch, sequence, C, H, W = image.size()
                flat_images = image.view(-1, C, H, W)
                z_flat, _, _, _ = vae(flat_images)
                z_vectors = z_flat.view(batch, sequence, -1)

            mu, sigma, phi, p_reward, p_done, _, (_, _) = model(z_vectors, action, length=seq_length)

            _, _, num_dist, latent_size = mu.size()

            # Done weights. Clamp the window so that sequences shorter than R
            # do not produce a negative start index, which would wrap around
            # and silently select the wrong (usually empty) slice.
            for b, length in enumerate(seq_length.tolist()):
                start = max(length - R, 0)
                stop = max(length - 1, 0)
                done_weights[b, start:stop] = W_done
            done_weights = done_weights[:, :-1].reshape(-1, 1)

            # Predictions: every step except the last one of each sequence.
            mu_pred = mu[:, :-1, :, :].reshape(-1, num_dist, latent_size)
            sigma_pred = sigma[:, :-1, :, :].reshape(-1, num_dist, latent_size)
            phi_pred = phi[:, :-1, :].reshape(-1, num_dist)
            p_reward_pred = p_reward[:, :-1].reshape(-1, 1)
            p_done_pred = p_done[:, :-1].reshape(-1, 1)

            # Targets: the same quantities shifted one step into the future.
            target_z = z_vectors[:, 1:, :].reshape(-1, latent_size)
            target_reward = reward[:, 1:].reshape(-1, 1)
            target_done = t_done[:, 1:].reshape(-1, 1)

            # Mask aligned with the prediction steps.
            reshape_mask = mask[:, :-1].reshape(-1, 1)

            loss, _ = mdn_rnn_loss(mu=mu_pred, sigma=sigma_pred, phi=phi_pred,
                                   target=target_z,
                                   p_reward=p_reward_pred, t_reward=target_reward,
                                   p_done=p_done_pred, t_done=target_done,
                                   done_weights=done_weights,
                                   mask=reshape_mask,
                                   likelihood_weight=2e-6)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            if (batch_idx + 1) % 100 == 0:
                print(f"loss {total_loss / (batch_idx + 1)}")

        print(f"epoch: {epoch+1}, Everage loss: {total_loss/len(dataloader):.6f}")

# Run MDN-RNN training with the model, loader, optimiser and epoch count that
# were bound as defaults when rnn_train was defined.
rnn_train()

In [None]:
# Save the MDN-RNN weights; the test and controller cells load this file.
torch.save(mdn_rnn.state_dict(), 'model_weights/mdnrnn_latent1024-epoch200.pth')

# RNN test

In [None]:
import numpy as np
data = np.load('data/LunarLander-v3.npz')
observations_raw = data['frames']
actions = data['actions']
rewards = data['rewards']

# Episode boundaries as start indices: 0, the index after every terminal step,
# and finally the dataset length (added only if it is not already the last
# boundary, so a terminal final step does not create an empty episode).
# NOTE(review): only `terminateds` marks a boundary here although the
# collection loop also resets the environment on truncation — confirm.
terminateds = data['terminateds']
episodes = [0, *(np.flatnonzero(terminateds) + 1).tolist()]
if episodes[-1] != len(terminateds):
    episodes.append(len(terminateds))

In [None]:
import torch
from parts.VAE_CNN import VAE, vae_loss_function, CustomImageDataset
from parts.MDN_RNN import MDN_RNN, mdn_rnn_loss, sampling, SequenceDataset
from parts.controller import controller

# One-hot encode the discrete actions by picking rows of an identity matrix.
action_size = 1 + int(actions.max())
action_onehot = np.eye(action_size)[actions]

# Rebuild both networks on the GPU and restore their saved weights.
vae = VAE(input_channel=3, latent_dim=1024).to('cuda:0')
vae.load_state_dict(torch.load('model_weights/vae-latent1024-epoch100.pth'))

mdn_rnn = MDN_RNN(
    input_size=1024,
    hidden_size=512,
    latent_space_size=1024,
    action_size=action_size,
).to('cuda:0')
mdn_rnn.load_state_dict(torch.load('model_weights/mdnrnn_latent1024-epoch200.pth'))


In [None]:
from torchvision import transforms

# Frames stay a NumPy array; the transform converts each one on access.
observations = observations_raw

# Per-frame preprocessing: ndarray -> PIL image -> 64x64 -> float tensor.
resize = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((64, 64)),
        transforms.ToTensor(),
    ]
)

seq_dataset = SequenceDataset(
    image_dataset=observations,
    transforms=resize,
    reward_dataset=rewards,
    action_dataset=action_onehot,
    episodes=episodes,
)

In [None]:
vae.eval()
mdn_rnn.eval()

# Fetch the sequence once so frames and actions come from the same dataset
# access (each access re-runs the image transforms).
sample = seq_dataset[100]

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    z_vectors, _, _, _ = vae(sample[0].to('cuda:0'))
    z_vectors = z_vectors.unsqueeze(0)                  # add a batch dimension
    action_first = sample[1].to('cuda:0').unsqueeze(0)  # add a batch dimension

    mu, sigma, phi, reward, done, h, cell = mdn_rnn(z_vectors, action_first)

In [None]:
to_pil = transforms.ToPILImage()
# Fetch the sequence once so the frames and the index derived from field 3
# come from the same dataset access.
sample = seq_dataset[2]
# Field 3 minus one indexes the frame to show (presumably the last valid frame
# of the sequence — confirm against SequenceDataset).
to_pil(sample[0][sample[3] - 1])

In [None]:
import torch.nn.functional as F
from tqdm import tqdm

# NOTE(review): done_pred is defined in the following cell, so that cell has
# to be executed before this one.
done, last_seq = done_pred(22)
# Done outputs for the whole sequence, mapped through a sigmoid.
print(F.sigmoid(done))
print(last_seq)
# Sigmoid of the done output at index last_seq - 1 (presumably the final valid
# step of the sequence — confirm against SequenceDataset).
print(F.sigmoid(done[:, last_seq-1]))

In [None]:
def done_pred(index):
    """Run the VAE and MDN-RNN on one dataset sequence and return its done output.

    Uses the module-level ``seq_dataset``, ``vae`` and ``mdn_rnn``.

    Args:
        index: position of the sequence in ``seq_dataset``.

    Returns:
        Tuple ``(done, last_seq)``: the done output of ``mdn_rnn`` for the
        sequence, and field 3 of the dataset item moved to ``cuda:0``
        (presumably the sequence length — confirm against SequenceDataset).
    """
    # Fetch the item once; every access re-runs the image transforms.
    sample = seq_dataset[index]
    last_seq = sample[3].to('cuda:0')

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        z_vectors, _, _, _ = vae(sample[0].to('cuda:0'))
        action = sample[1].to('cuda:0')
        # Add a batch dimension to both inputs before calling the RNN.
        _, _, _, _, done, _, _ = mdn_rnn(z_vectors.unsqueeze(0), action.unsqueeze(0))
    return done, last_seq

In [None]:
# Mixture parameters at index 0 of the second axis: weights, means, scales.
p, m, s = phi[:, 0, :], mu[:, 0, :, :], sigma[:, 0, :, :]

In [None]:
from torch import distributions
from torchvision import transforms

# Build a Gaussian mixture from the RNN outputs: `p` are the mixture weights,
# `m` / `s` the per-component means and scales.
mixture_distribution = distributions.Categorical(probs=p)
component_distribution = distributions.Normal(loc=m, scale=s)
# Reinterpret the last batch dimension as the event dimension, so that each
# component is a distribution over a whole vector.
component_dist = distributions.Independent(
    component_distribution,
    reinterpreted_batch_ndims=1
)
mixture_gaussian = distributions.MixtureSameFamily(mixture_distribution, component_dist)
vector = mixture_gaussian.sample()

# Decode the sampled vector with the VAE and display the first image.
to_pil = transforms.ToPILImage()
a = vae.decode(vector)
img = to_pil(a[0])
img

# Controller train

In [None]:
import numpy as np
from parts.MDN_RNN import MDN_RNN, mdn_rnn_loss, sampling, SequenceDataset
from parts.controller import controller, choice_control
from parts.VAE_CNN import VAE
from torchvision import transforms
import torch

data = np.load('data/LunarLander-v3.npz')

# One-hot encode the discrete actions by picking rows of an identity matrix.
actions = data['actions']
action_size = int(actions.max()) + 1
action_onehot = np.eye(action_size)[actions]

rewards = data['rewards']

images = data['frames']

# Episode boundaries as start indices: 0, the index after every terminal step,
# and finally the dataset length (added only if it is not already the last
# boundary, so a terminal final step does not create an empty episode).
# NOTE(review): only `terminateds` marks a boundary here although the
# collection loop also resets the environment on truncation — confirm.
terminateds = data['terminateds']
episodes = [0, *(np.flatnonzero(terminateds) + 1).tolist()]
if episodes[-1] != len(terminateds):
    episodes.append(len(terminateds))

# Per-frame preprocessing: ndarray -> PIL image -> 64x64 -> float tensor.
resize = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),
    transforms.ToTensor()])

seq_dataset = SequenceDataset(image_dataset=images, transforms=resize, reward_dataset=rewards, action_dataset=action_onehot, episodes=episodes)

# Restore the trained world model (MDN-RNN) and the VAE.
mdn_rnn = MDN_RNN(input_size=1024, hidden_size=512, latent_space_size=1024, action_size=action_size).to('cuda:0')
mdn_rnn.load_state_dict(torch.load('model_weights/mdnrnn_latent1024-epoch200.pth'))

vae = VAE(input_channel=3, latent_dim=1024).to('cuda:0')
vae.load_state_dict(torch.load('model_weights/vae-latent1024-epoch100-beta0.01.pth'))
vae.eval()

In [None]:
import torch 
import random

def sellect_random_scene(images):
    """Encode one randomly chosen frame and pair it with zeroed RNN states.

    Uses the module-level ``resize``, ``vae`` and ``mdn_rnn``.

    Args:
        images: indexable collection of frames in (H, W, C) layout.

    Returns:
        Tuple ``(z_vector, h_vector, (h_n, c_n))``: the VAE encoding of the
        frame, a zero tensor of shape ``(1, hidden_size)``, and zero LSTM
        states of shape ``(num_layers, 1, hidden_size)``, all on ``cuda:0``.
    """
    scene_number = random.randint(0, len(images) - 1)

    # (H, W, C) -> (C, H, W). as_tensor does not copy when it is not needed.
    image_tensor = torch.as_tensor(images[scene_number]).permute(2, 0, 1)

    # `resize` ends with ToTensor(), so its output already is a tensor;
    # wrapping it in torch.tensor() again only copies it and raises a warning.
    resized_image = resize(image_tensor).unsqueeze(0).to('cuda:0')

    with torch.no_grad():
        z_vector, _, _, _ = vae(resized_image)

    num_layers, hidden_size = mdn_rnn.lstm.num_layers, mdn_rnn.lstm.hidden_size

    # Allocate the zero states directly on the GPU.
    h_vector = torch.zeros(1, hidden_size, device='cuda:0')
    h_n = torch.zeros(num_layers, 1, hidden_size, device='cuda:0')
    c_n = torch.zeros(num_layers, 1, hidden_size, device='cuda:0')

    return z_vector, h_vector, (h_n, c_n)

def initial_scene(seq_dataset, rnn):
    """Build a rollout start state from the first two frames of a random sequence.

    The first frame is encoded and fed through ``rnn`` together with a random
    one-hot action to obtain the recurrent state; the encoding of the second
    frame is returned as the starting latent. Uses the module-level ``vae``.

    Args:
        seq_dataset: dataset whose items hold frames at field 0 and one-hot
            actions at field 1.
        rnn: recurrent model called as ``rnn(z, action)``.

    Returns:
        Tuple ``(z_vector1, h_vector, (h_n, c_n))``.
    """
    scene_number = random.randint(0, len(seq_dataset) - 1)

    # Fetch the item once; every access re-runs the image transforms.
    sample = seq_dataset[scene_number]
    frames, recorded_actions = sample[0], sample[1]

    tensor_image0 = frames[0].unsqueeze(0).to('cuda:0')
    tensor_image1 = frames[1].unsqueeze(0).to('cuda:0')

    # The recorded action is only used for its width (number of actions).
    # NOTE(review): the RNN state is built from a random action while the
    # second frame results from the recorded one — confirm this is intended.
    action_len = len(recorded_actions[0])
    random_action = random.randint(0, action_len - 1)
    action_onehot = torch.zeros(1, 1, action_len, device='cuda:0')
    action_onehot[0, 0, random_action] = 1.0

    # Nothing here is trained, so run both networks without autograd; this
    # also keeps the returned states from holding on to a graph.
    with torch.no_grad():
        z_vector0, _, _, _ = vae(tensor_image0)
        z_vector1, _, _, _ = vae(tensor_image1)
        _, _, _, _, _, h_vector, (h_n, c_n) = rnn(z_vector0.unsqueeze(0), action_onehot)

    return z_vector1, h_vector, (h_n, c_n)

In [None]:
from parts.MDN_RNN import MDN_RNN, sampling
from parts.controller import controller, choice_control
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Controller network; called as controller(z, h) it returns action
# probabilities and a value estimate (see controller_train).
C = controller(action_size=action_size, h_vector_size=512, z_vector_size=1024, hidden_size=512).to('cuda:0')

optimizer = torch.optim.AdamW(C.parameters(), lr=1e-4)

gamma = 0.99  # discount factor applied when accumulating returns
beta = 0.1    # weight of the entropy term in the controller loss

def controller_train(steps, controller, optimizer, initial_z, initial_h, cell):
    """Roll the controller out inside the MDN-RNN and apply one gradient update.

    The controller picks actions, the module-level ``mdn_rnn`` predicts the
    next latent, reward and done signal, and the collected rollout is used for
    a policy-gradient step with a value baseline and an entropy bonus. Uses
    the module-level ``mdn_rnn``, ``action_size``, ``gamma`` and ``beta``.

    Args:
        steps: maximum rollout length.
        controller: policy/value network returning ``(action_prob, value)``.
        optimizer: optimiser built over ``controller``'s parameters.
        initial_z: starting latent vector.
        initial_h: starting RNN hidden vector fed to the controller.
        cell: starting recurrent state passed to ``mdn_rnn``.

    Returns:
        Tuple of floats ``(loss, total_reward, policy_loss, entropy_loss)``.
    """
    controller.train()
    mdn_rnn.eval()

    eps = 1e-8

    rewards = []
    log_probs = []
    entropies = []
    values = []  # baseline values

    z = initial_z.to('cuda:0').detach()
    h = initial_h.to('cuda:0').detach()

    z_rnn = z.unsqueeze(0)

    for _ in range(steps):
        action_prob, value = controller(z, h)
        action = choice_control(action_prob)

        action_onehot_tensor = F.one_hot(action, num_classes=action_size).to(torch.float32)

        # The world model is fixed: advance it without tracking gradients.
        with torch.no_grad():
            mu, sigma, phi, reward, done, h, cell = mdn_rnn(z_rnn, action_onehot_tensor, cell)

            z_rnn = sampling(mu, sigma, phi)
            z = z_rnn.squeeze(0).detach()
            h = h.detach()

        # Clamp before the log so that a zero probability cannot yield
        # -inf (log-prob) or 0 * -inf = NaN (entropy).
        probs = action_prob[0]
        log_p = torch.log(probs.clamp_min(eps))

        entropies.append(-(probs * log_p).sum())
        log_probs.append(log_p[action])
        rewards.append(reward)
        values.append(value)

        # Stop once the world model is confident the episode has ended.
        if F.sigmoid(done) >= 0.96:
            break

    # Discounted return for every step, accumulated from the end backwards.
    returns = []
    future_return = 0.0
    for r in reversed(rewards):
        future_return = r + gamma * future_return
        returns.append(future_return)
    returns.reverse()

    # Flatten everything to shape (T,) so differently shaped per-step tensors
    # cannot broadcast against each other silently.
    returns = torch.stack(returns).reshape(-1)
    values = torch.stack(values).reshape(-1)
    log_probs = torch.stack(log_probs).reshape(-1)
    entropies = torch.stack(entropies)

    adv = returns - values.detach()
    # The standard deviation of a single element is NaN, which would poison
    # the loss and the weights; only normalise rollouts longer than one step.
    if adv.numel() > 1:
        adv = (adv - adv.mean()) / (adv.std() + eps)

    policy_loss = -(log_probs * adv.detach()).mean()  # policy loss
    entropy_loss = -(beta * entropies.mean())  # negated so that entropy is maximised
    value_loss = 0.5 * (returns - values).pow(2).mean()

    loss = policy_loss + entropy_loss + value_loss

    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(controller.parameters(), 0.5)
    optimizer.step()

    return loss.item(), torch.stack(rewards).sum().item(), policy_loss.item(), entropy_loss.item()

In [None]:
from tqdm.auto import tqdm

# Train the controller for 1000 imagined rollouts of at most 140 steps, each
# starting from a freshly sampled initial scene.
for _ in tqdm(range(1000)):
    initial_z, initial_h, (h_n, c_n) = initial_scene(seq_dataset, mdn_rnn)
    loss, reward, policy, entropy = controller_train(140, C, optimizer, initial_z, initial_h, (h_n, c_n))
    # Policy and entropy losses are scaled by 1000 for readability only.
    print(f"loss:{loss}, reward:{reward}, policy:{policy*1000}, entropy:{entropy*1000}")

In [None]:
# Save the trained controller weights to the working directory.
torch.save(C.state_dict(), 'controller-1')