In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pickle
from google.colab import drive
import cv2


# Mount Google Drive so the recorded rollouts are reachable from the Colab VM.
drive.mount('/content/drive')

# NOTE: unpickling runs arbitrary code embedded in the file -- only load
# rollout files that you generated yourself.
data_path = "/content/drive/MyDrive/data/car_racing_rollouts_5.pkl"
with open(data_path, "rb") as rollout_file:
    rollouts = pickle.load(rollout_file)
print("Dataset loaded successfully!")




Mounted at /content/drive
Dataset loaded successfully!


In [None]:
# Run on the GPU when the runtime exposes one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class VAE(nn.Module):
    """Convolutional variational autoencoder for single-channel 64x64 images.

    The encoder halves the spatial resolution four times (64 -> 4) and the
    decoder mirrors it with transposed convolutions, ending in a sigmoid so
    reconstructions lie in [0, 1].

    Args:
        latent_dim: size of the latent vector ``z``.
    """

    def __init__(self, latent_dim):
        super().__init__()

        # Encoder: Conv(k=4, s=2, p=1) + ReLU per stage.
        # Spatial size per stage: 64 -> 32 -> 16 -> 8 -> 4.
        enc_widths = (1, 32, 64, 128, 256)
        enc_layers = []
        for c_in, c_out in zip(enc_widths[:-1], enc_widths[1:]):
            enc_layers.append(nn.Conv2d(c_in, c_out, kernel_size=4, stride=2, padding=1))
            enc_layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*enc_layers)

        # Flattened encoder output: 256 channels on a 4x4 grid.
        flat_features = 256 * 4 * 4
        self.fc_mu = nn.Linear(flat_features, latent_dim)
        self.fc_logvar = nn.Linear(flat_features, latent_dim)

        # Decoder: linear projection back to [256, 4, 4], then four upsampling
        # stages (4 -> 8 -> 16 -> 32 -> 64); the final stage uses a sigmoid.
        self.fc_decode = nn.Linear(latent_dim, flat_features)
        dec_widths = (256, 128, 64, 32, 1)
        final_stage = len(dec_widths) - 2
        dec_layers = []
        for stage, (c_in, c_out) in enumerate(zip(dec_widths[:-1], dec_widths[1:])):
            dec_layers.append(nn.ConvTranspose2d(c_in, c_out, kernel_size=4, stride=2, padding=1))
            dec_layers.append(nn.Sigmoid() if stage == final_stage else nn.ReLU())
        self.decoder = nn.Sequential(*dec_layers)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for a batch of images."""
        features = self.encoder(x)
        features = features.view(len(features), -1)
        return self.fc_mu(features), self.fc_logvar(features)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + sigma * eps`` with ``eps ~ N(0, I)`` (reparameterization trick)."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + sigma * noise

    def decode(self, z):
        """Map latent vectors back to image space."""
        grid = self.fc_decode(z).view(-1, 256, 4, 4)
        return self.decoder(grid)

    def forward(self, x):
        """Encode, sample a latent, decode; returns ``(x_recon, mu, logvar)``."""
        mu, logvar = self.encode(x)
        x_recon = self.decode(self.reparameterize(mu, logvar))
        return x_recon, mu, logvar



In [None]:
# Rebuild the VAE with the latent size it was trained with and restore its weights.
latent_dim = 64
vae = VAE(latent_dim).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all a state_dict contains; it avoids executing arbitrary pickled code and
# silences torch.load's FutureWarning about the unsafe default.
vae.load_state_dict(
    torch.load(
        "/content/drive/MyDrive/vae_model_epoch_20.pth",
        map_location=device,
        weights_only=True,
    )
)
# Inference mode: the VAE is only used as a frozen encoder from here on.
vae.eval()
print("VAE loaded.")

  vae.load_state_dict(torch.load("/content/drive/MyDrive/vae_car_racing.pth", map_location=device))


VAE loaded.


In [None]:
class MDNRNN(nn.Module):
    """LSTM with a mixture-density head over latents and a scalar reward head.

    Args:
        latent_dim: dimensionality of one latent vector.
        action_dim: dimensionality of one action vector.
        hidden_dim: LSTM hidden-state size.
        n_gaussians: number of mixture components per time step.
    """

    def __init__(self, latent_dim=32, action_dim=3, hidden_dim=256, n_gaussians=5):
        super().__init__()
        self.input_dim = latent_dim + action_dim
        self.hidden_dim = hidden_dim
        self.n_gaussians = n_gaussians

        self.lstm = nn.LSTM(self.input_dim, hidden_dim, batch_first=True)

        # One mean / log-sigma per (component, latent dimension) pair.
        mixture_width = n_gaussians * latent_dim
        self.fc_pi = nn.Linear(hidden_dim, n_gaussians)
        self.fc_mu = nn.Linear(hidden_dim, mixture_width)
        self.fc_sigma = nn.Linear(hidden_dim, mixture_width)
        self.fc_reward = nn.Linear(hidden_dim, 1)  # reward prediction

    def forward(self, x, hidden=None):
        """Run the LSTM over ``x`` of shape (batch, seq_len, input_dim).

        Returns:
            pi:     (batch, seq_len, n_gaussians) mixture weights (softmax-normalised).
            mu:     (batch, seq_len, n_gaussians, latent_dim) component means.
            sigma:  (batch, seq_len, n_gaussians, latent_dim) component std-devs (> 0).
            reward: (batch, seq_len, 1) predicted reward.
            hidden: the LSTM's final ``(h, c)`` state.
        """
        features, hidden = self.lstm(x, hidden)
        mix_shape = (x.size(0), x.size(1), self.n_gaussians, -1)

        pi = torch.softmax(self.fc_pi(features), dim=-1)
        mu = self.fc_mu(features).view(*mix_shape)
        # The linear layer predicts log-sigma; exponentiate to keep sigma positive.
        sigma = self.fc_sigma(features).view(*mix_shape).exp()
        reward = self.fc_reward(features)
        return pi, mu, sigma, reward, hidden


In [None]:
# Build the MDN-RNN with the same latent size as the VAE and optimise it with Adam.
mdn_rnn = MDNRNN(latent_dim=latent_dim, action_dim=3, hidden_dim=256, n_gaussians=5)
mdn_rnn = mdn_rnn.to(device)
optimizer = optim.Adam(mdn_rnn.parameters(), lr=1e-3)
print("MDN-RNN model initialized.")


def mdn_loss(pi, mu, sigma, target):
    """Mean negative log-likelihood of ``target`` under a diagonal Gaussian mixture.

    Args:
        pi:     (..., K) mixture weights (expected to sum to 1 over K).
        mu:     (..., K, D) component means.
        sigma:  (..., K, D) component standard deviations (must be > 0).
        target: (..., D) observed vectors.

    Returns:
        Scalar tensor: the negative log-likelihood averaged over all leading
        (batch / time) positions.
    """
    # Insert the mixture axis just before the feature axis so the target
    # broadcasts against all K components, whatever the number of leading dims.
    target = target.unsqueeze(-2)
    latent_dim = target.size(-1)

    # log N(target | mu_k, diag(sigma_k^2)), summed over the D independent dims.
    z_score = (target - mu) / sigma
    log_norm = -torch.log(sigma).sum(dim=-1) - 0.5 * latent_dim * float(np.log(2 * np.pi))
    component_log_prob = log_norm - 0.5 * z_score.pow(2).sum(dim=-1)

    # 1e-8 guards log(0) for components whose weight has collapsed to zero.
    weighted_log_probs = torch.log(pi + 1e-8) + component_log_prob

    # torch.logsumexp is the numerically stable log(sum(exp(.))); unlike a
    # hand-rolled max-shift it does not turn an all -inf row into NaN.
    log_likelihood = torch.logsumexp(weighted_log_probs, dim=-1)
    return -log_likelihood.mean()

# Reward head objective: squared error averaged over every predicted element.
mse_loss = nn.MSELoss(reduction="mean")

# Number of consecutive transitions in each training window.
seq_len = 50
class MDNRNNDataset(Dataset):
    """Sliding-window dataset of (input, next-latent, reward) sequences.

    Every frame of every episode is encoded once with the VAE (posterior mean,
    no sampling). An episode of ``T`` transitions then contributes
    ``T - seq_len`` windows; the window starting at ``i`` holds, for
    ``t = i .. i + seq_len - 1``:

    * input:         ``concat(z_t, a_t)``
    * target latent: ``z_{t+1}``
    * target reward: ``r_t``

    Episodes with fewer than ``seq_len + 1`` transitions are skipped.

    ``self.sequences`` stores NumPy *views* into per-episode arrays rather than
    independent copies, so memory scales with the number of frames instead of
    ``frames * seq_len``. Treat those arrays as read-only; ``__getitem__``
    returns fresh tensors.

    Args:
        rollouts: iterable of episodes, each a sequence of
            ``(obs, action, reward, next_obs, done)`` tuples.
        vae: object exposing ``encode(x) -> (mu, logvar)``. Frames are passed
            as a float32 tensor of shape ``(N, 1, *obs.shape)`` on the device
            the VAE's parameters live on (CPU if it has none).
        seq_len: number of time steps per window.
        encode_batch_size: how many frames are pushed through the VAE per call.
            Batched encoding may differ from frame-by-frame encoding by
            floating-point round-off only.
    """

    def __init__(self, rollouts, vae, seq_len=50, encode_batch_size=256):
        self.sequences = []
        self.seq_len = seq_len
        self.vae = vae
        vae_device = self._device_of(vae)

        for episode in rollouts:
            n_steps = len(episode)
            if n_steps < seq_len + 1:
                continue

            frames = np.stack([np.asarray(step[0], dtype=np.float32) for step in episode])
            actions = np.stack([np.array(step[1]).flatten() for step in episode])
            rewards = np.array([step[2] for step in episode])

            latents = self._encode_frames(vae, frames, vae_device, encode_batch_size)
            inputs = np.concatenate([latents, actions], axis=1)

            # Basic slices along axis 0 are views: no per-window copy is made.
            for start in range(n_steps - seq_len):
                stop = start + seq_len
                self.sequences.append(
                    (inputs[start:stop], latents[start + 1:stop + 1], rewards[start:stop])
                )

    @staticmethod
    def _device_of(module):
        """Return the device of ``module``'s first parameter, or CPU if it has none."""
        parameters = getattr(module, "parameters", None)
        if callable(parameters):
            first = next(iter(parameters()), None)
            if first is not None:
                return first.device
        return torch.device("cpu")

    @staticmethod
    def _encode_frames(vae, frames, device, batch_size):
        """Encode ``frames`` (N, *obs.shape) to posterior means, ``batch_size`` at a time."""
        step = max(1, int(batch_size))
        chunks = []
        with torch.no_grad():
            for start in range(0, len(frames), step):
                # Add the channel axis the encoder expects: (n, 1, *obs.shape).
                batch = torch.from_numpy(frames[start:start + step]).unsqueeze(1).to(device)
                mu, _ = vae.encode(batch)
                chunks.append(mu.cpu().numpy())
        return np.concatenate(chunks, axis=0)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        inp, target_z, target_reward = self.sequences[idx]
        # torch.tensor copies, so callers never alias the shared episode arrays.
        return (torch.tensor(inp, dtype=torch.float32),
                torch.tensor(target_z, dtype=torch.float32),
                torch.tensor(target_reward, dtype=torch.float32))

# Encode every rollout into training windows and serve them in shuffled mini-batches.
dataset = MDNRNNDataset(rollouts, vae, seq_len)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)
print("MDN-RNN dataset prepared. Total sequences:", len(dataset))


MDN-RNN model initialized.
MDN-RNN dataset prepared. Total sequences: 171765


In [None]:
# Train the MDN-RNN on the joint objective: mixture NLL of the next latent plus
# MSE of the predicted reward, then save the final weights to Drive.
epochs = 50
mdn_rnn.train()
for epoch in range(1, epochs+1):
    # Running sums are plain Python floats so no autograd history is retained.
    total_loss = 0.0
    reward_loss = 0.0
    for inp, target_z, target_reward in dataloader:
        inp = inp.to(device)           # (batch, seq_len, latent_dim + action_dim)
        target_z = target_z.to(device) # (batch, seq_len, latent_dim)

        target_reward = target_reward.to(device).unsqueeze(-1)  # (batch, seq_len, 1)
        optimizer.zero_grad()
        # The hidden state is not carried across batches: every window starts from zeros.
        pi, mu, sigma, reward_pred, _ = mdn_rnn(inp)
        loss_z = mdn_loss(pi, mu, sigma, target_z)
        loss_reward = mse_loss(reward_pred, target_reward)
        loss = loss_z + loss_reward
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # .item() detaches the value; summing the tensor itself would chain every
        # batch's loss into one ever-growing graph-attached accumulator.
        reward_loss += loss_reward.item()
    avg_loss = total_loss / len(dataloader)
    avg_reward = reward_loss / len(dataloader)
    print(f"Epoch {epoch}, Average Loss: {avg_loss:.4f}")
    print(f"Epoch {epoch}, reward Loss: {avg_reward:.4f}")

torch.save(mdn_rnn.state_dict(), "/content/drive/MyDrive/mdn_rnn_carla.pth")
print("MDN-RNN model saved!")

Epoch 1, Average Loss: -72.0277
Epoch 1, reward Loss: 0.1936
Epoch 2, Average Loss: -93.1820
Epoch 2, reward Loss: 0.1902
Epoch 3, Average Loss: -98.5166
Epoch 3, reward Loss: 0.1895
Epoch 4, Average Loss: -107.5676
Epoch 4, reward Loss: 0.1892
Epoch 5, Average Loss: -112.6986
Epoch 5, reward Loss: 0.1886
Epoch 6, Average Loss: -115.0753
Epoch 6, reward Loss: 0.1880
Epoch 7, Average Loss: -118.3909
Epoch 7, reward Loss: 0.1876
Epoch 8, Average Loss: -123.1056
Epoch 8, reward Loss: 0.1874
Epoch 9, Average Loss: -124.0644
Epoch 9, reward Loss: 0.1872
Epoch 10, Average Loss: -123.5554
Epoch 10, reward Loss: 0.1871
Epoch 11, Average Loss: -126.2563
Epoch 11, reward Loss: 0.1869
Epoch 12, Average Loss: -127.0567
Epoch 12, reward Loss: 0.1868
Epoch 13, Average Loss: -129.7704
Epoch 13, reward Loss: 0.1866
Epoch 14, Average Loss: -128.6895
Epoch 14, reward Loss: 0.1865
Epoch 15, Average Loss: -130.4500
Epoch 15, reward Loss: 0.1864
Epoch 16, Average Loss: -132.8651
Epoch 16, reward Loss: 0.18

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>