In this homework, we will be using the Half Cheetah environment from the MuJoCo benchmark, where the goal is to control the cheetah so that it runs forward as fast and as steadily as possible!


In [None]:
#@title mount your Google Drive
# @markdown Your work will be stored in a folder called `Imitation_Learning` by default to prevent Colab instance timeouts from deleting your edits.

# `os` is imported here because the following cell uses it for path-existence checks.
import os
from google.colab import drive
# Mount Google Drive under /content/gdrive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted, so re-running this cell is safe — confirm against Colab docs.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
#@title set up mount symlink

DRIVE_PATH = '/content/gdrive/My\ Drive/Imitation_Learning'
DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
if not os.path.exists(DRIVE_PYTHON_PATH):
  %mkdir $DRIVE_PATH

## the space in `My Drive` causes some issues,
## make a symlink to avoid this
SYM_PATH = '/content/Imitation_Learning'
if not os.path.exists(SYM_PATH):
  !ln -s $DRIVE_PATH $SYM_PATH

%cd $SYM_PATH

## save the HalfCheetah_expert_data.pkl file in this directory

/content/gdrive/My Drive/Imitation_Learning


# Install the MuJoCo library:

In [None]:
# For tips on running notebooks in Google Colab, see
# https://pytorch.org/tutorials/beginner/colab
%matplotlib inline
!pip install "gymnasium[mujoco]"

...
Successfully installed glfw-2.8.0 mujoco-3.3.0


# Loading Expert Data

In [None]:
import gymnasium as gym
from typing import Optional, Tuple, Union
from gymnasium import logger, spaces

import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


# Loading data
# The path is relative, so it is resolved against the current working directory
# (an earlier cell `%cd`s into the Imitation_Learning folder).
file_path = "HalfCheetah_expert_data.pkl"
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load a .pkl obtained from a trusted source.
with open(file_path, "rb") as f:
    # Only element [0] of the unpickled object is kept.
    # NOTE(review): presumably the file holds a sequence of trajectory dicts and
    # just the first one is used — confirm against the dataset description.
    expert_data = pickle.load(f)[0]

# Quick sanity check of what was loaded: available fields and sample count.
print(expert_data.keys())
print("number of data:", len(expert_data['observation']))

# Extract expert states and actions
# Both are converted to float32 tensors; later cells read `.shape[1]` from each
# to size the policy network, so they are expected to be 2-D.
states = torch.tensor(expert_data["observation"], dtype=torch.float32)
actions = torch.tensor(expert_data["action"], dtype=torch.float32)

dict_keys(['observation', 'action', 'reward', 'next_observation', 'terminal'])
number of data: 1000


# Initializing the network, environment and evaluation function.

In [None]:
# Small fully-connected policy network used for imitation learning
class PolicyNet(nn.Module):
    """MLP with two 64-unit ReLU hidden layers mapping a state to an action.

    The output layer is linear (no activation), so predicted actions are
    unbounded real values.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 64
        # Attribute names double as state_dict keys, so fc1/fc2/fc3 must stay
        # as they are for saved checkpoints to remain loadable.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the action prediction for input batch ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Create the simulator and size the BC policy from the expert data:
# column count of `states` -> network input, column count of `actions` -> output.
env = gym.make("HalfCheetah-v5")
state_dim, action_dim = states.size(1), actions.size(1)
policy_bc = PolicyNet(state_dim=state_dim, action_dim=action_dim)

# Define the evaluate_policy function
def evaluate_policy(policy, env, episodes=10, max_steps=1000):
    """Roll out ``policy`` in ``env`` and report the undiscounted episode return.

    Args:
        policy: Callable (e.g. an ``nn.Module``) mapping a float32 tensor of
            shape (1, state_dim) to a tensor of shape (1, action_dim).
        env: Environment following the Gymnasium API, i.e. ``reset()`` returns
            ``(obs, info)`` and ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        episodes: Number of evaluation episodes to run.
        max_steps: Hard cap on the number of steps per episode, used as a
            safety net when the environment never signals an end.

    Returns:
        Tuple ``(mean_reward, std_reward)`` over the evaluated episodes.
    """
    total_rewards = []
    for _ in range(episodes):
        state, _ = env.reset()
        episode_reward = 0.0
        for _step in range(max_steps):
            state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
            # Pure inference: no autograd graph is needed for evaluation.
            with torch.no_grad():
                action = policy(state_tensor).numpy()[0]
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            # An episode ends on a terminal state OR on truncation (time limit);
            # looking only at `terminated` would keep stepping a finished episode.
            if terminated or truncated:
                break
        total_rewards.append(episode_reward)

    mean_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    print(f"Evaluation Results: Mean Reward = {mean_reward:.2f}, Std Reward = {std_reward:.2f}")
    return mean_reward, std_reward

# Baseline: score the freshly initialised (still untrained) BC policy so there is
# a reference point to compare against after training.
mean_reward, std_reward = evaluate_policy(policy_bc, env)


Evaluation Results: Mean Reward = -5.32, Std Reward = 0.50


In [None]:
# Behavior Cloning: plain supervised regression from expert states to expert actions.
from torch.utils.data import DataLoader, TensorDataset

# Serve the expert (state, action) pairs as shuffled mini-batches of 64.
dataset = TensorDataset(states, actions)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

# Objective: mean-squared error between predicted and expert actions, minimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(policy_bc.parameters(), lr=1e-3)

# Number of full passes over the expert dataset
num_epochs = 1000

for epoch in range(num_epochs):
    # Sum of per-sample losses accumulated over this epoch
    total_loss = 0.0
    for batch_states, batch_actions in dataloader:
        # Forward pass and loss
        predicted_actions = policy_bc(batch_states)
        loss = criterion(predicted_actions, batch_actions)

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Scale the batch-mean loss by the batch size so a smaller final
        # batch is weighted correctly in the epoch average.
        total_loss += batch_states.size(0) * loss.item()

    # Per-sample average loss for the epoch
    avg_loss = total_loss / len(dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Score the trained BC policy, then persist its weights.
mean_reward, std_reward = evaluate_policy(policy_bc, env)
policy_path = "bc_policy.pth"
torch.save(policy_bc.state_dict(), policy_path)
print(f"Trained policy saved at {policy_path}")

Epoch [1/1000], Loss: 0.4486
Epoch [2/1000], Loss: 0.2097
Epoch [3/1000], Loss: 0.1373
Epoch [4/1000], Loss: 0.1129
Epoch [5/1000], Loss: 0.0984
Epoch [6/1000], Loss: 0.0896
Epoch [7/1000], Loss: 0.0811
Epoch [8/1000], Loss: 0.0756
Epoch [9/1000], Loss: 0.0708
Epoch [10/1000], Loss: 0.0667
...
Epoch [991/1000], Loss: 0.0013
Epoch [992/1000], Loss: 0.0013
Epoch [993/1000], Loss: 0.0012
Epoch [994/1000], Loss: 0.0013
Epoch [995/1000], Loss: 0.0012
Epoch [996/1000], Loss: 0.0012
Epoch [997/1000], Loss: 0.0012
Epoch [998/1000], Loss: 0.0013
Epoch [999/1000], Loss: 0.0012
Epoch [1000/1000], Loss: 0.0013
Evaluation Results: Mean Reward = 4082.30, Std Reward = 100.31
Trained policy saved at bc_policy.pth


In [None]:
# Initialize policy
policy_dagger = PolicyNet(state_dim, action_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(policy_dagger.parameters(), lr=1e-3)

# DAgger (Dataset Aggregation): repeatedly
#   1. fit the policy on everything collected so far,
#   2. roll out the *learner* policy in the environment,
#   3. label the states it visited with expert actions and add them to the dataset.

# DAgger parameters
dagger_iterations = 5
epochs_per_iteration = 100
rollout_episodes = 10
max_steps = 2000  # Safety cap on rollout length in case the env never signals an end

# Initialize aggregated dataset with the original expert demonstrations
aggregated_states = states.clone()
aggregated_actions = actions.clone()


for dagger_iter in range(dagger_iterations):
    print(f"\nDAgger Iteration {dagger_iter+1}/{dagger_iterations}")
    # Train on current aggregated dataset
    dataset = TensorDataset(aggregated_states, aggregated_actions)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    for epoch in range(epochs_per_iteration):
        total_loss = 0.0
        for batch_states, batch_actions in dataloader:
            optimizer.zero_grad()
            pred_actions = policy_dagger(batch_states)
            loss = criterion(pred_actions, batch_actions)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size so the epoch figure is a
            # true per-sample average.
            total_loss += loss.item() * batch_states.size(0)

        # Print epoch statistics every 5 epochs
        if (epoch+1) % 5 == 0:
            # Report the epoch average, not the last batch's loss divided by
            # the number of batches.
            avg_loss = total_loss / len(dataset)
            print(f"Epoch {epoch+1}/{epochs_per_iteration} | Loss: {avg_loss:.4f}")

    print("Collecting rollouts...")
    # Collect the states visited by the current learner policy
    new_states = []
    for _ in range(rollout_episodes):
        state, _ = env.reset()
        done = False
        steps = 0  # step counter
        while not done and steps < max_steps:
            with torch.no_grad():
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                action = policy_dagger(state_tensor).squeeze(0).numpy()

            # Record the state the learner was in before acting
            new_states.append(state)

            # Environment step. An episode ends on termination OR truncation
            # (time limit); ignoring `truncated` keeps stepping a finished episode.
            state, _, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            steps += 1

    # Label the visited states with expert actions.
    # The label must depend on the visited state; drawing a random expert action
    # (as was done before) adds pure noise to the dataset and destroys the policy.
    # NOTE(review): this notebook only has a fixed expert dataset, not a queryable
    # expert policy, so the expert is approximated by nearest-neighbour lookup:
    # each visited state receives the action of the closest expert state
    # (Euclidean distance). If a real expert policy is available, call it here.
    new_states_tensor = torch.tensor(np.array(new_states), dtype=torch.float32)
    nearest_expert_idx = torch.cdist(new_states_tensor, states).argmin(dim=1)
    new_actions_tensor = actions[nearest_expert_idx]

    # Aggregate new data
    # NOTE: data gathered in the final iteration is aggregated but not trained
    # on, because the loop ends right after this step.
    aggregated_states = torch.cat([aggregated_states, new_states_tensor])
    aggregated_actions = torch.cat([aggregated_actions, new_actions_tensor])

# Evaluate and save the trained DAgger policy
mean_reward, std_reward = evaluate_policy(policy_dagger, env)
policy_path = "dagger_policy.pth"
torch.save(policy_dagger.state_dict(), policy_path)
print(f"Trained policy saved at {policy_path}")



DAgger Iteration 1/5
Epoch 5/100 | Loss: 0.0064
Epoch 10/100 | Loss: 0.0043
...
Epoch 95/100 | Loss: 0.0005
Epoch 100/100 | Loss: 0.0005
Collecting rollouts...

DAgger Iteration 2/5
Epoch 5/100 | Loss: 0.0021
Epoch 10/100 | Loss: 0.0018
...
Epoch 95/100 | Loss: 0.0021
Epoch 100/100 | Loss: 0.0011
Collecting rollouts...

DAgger Iteration 3/5
Epoch 5/100 | Loss: 0.0009
Epoch 10/100 | Loss: 0.0008
...
Epoch 95/100 | Loss: 0.0008
Epoch 100/100 | Loss: 0.0008
Collecting rollouts...

DAgger Iteration 4/5
Epoch 5/100 | Loss: 0.0004
Epoch 10/100 | Loss: 0.0005
...
Epoch 95/100 | Loss: 0.0005
Epoch 100/100 | Loss: 0.0006
Collecting rollouts...

DAgger Iteration 5/5
Epoch 5/100 | Loss: 0.0004
Epoch 10/100 | Loss: 0.0004
...
Epoch 95/100 | Loss: 0.0004
Epoch 100/100 | Loss: 0.0004
Collecting rollouts...
Evaluation Results: Mean Reward = -26.10, Std Reward = 1.14
Trained policy saved at dagger_policy.pth
