In [157]:
# !pip install seaborn

In [158]:
import os
import numpy as np
import torch
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
from torchvision import transforms
import torch.nn.functional as F
from transformers import BertConfig, BertTokenizer
from transformers import logging

def get_vlnbert_models(config=None):
    """Build the PREVALENT-initialised VLNBert model used by this notebook.

    A ``bert-base-uncased`` ``BertConfig`` is loaded and extended with the
    visual/language settings below, then ``VLNBert.from_pretrained`` is called
    on the local weight file with ``ignore_mismatched_sizes=True``.

    Args:
        config: Accepted for interface compatibility; this function does not
            read it.

    Returns:
        The model instance returned by ``VLNBert.from_pretrained``.
    """
    # Project-local import kept inside the function, as in the original cell.
    from src.vlnbert.vlnbert_PREVALENT import VLNBert

    pretrained_weights = '/home/prj21/fyp/Recurrent-VLN-BERT-Isaac/pretrained_weight/pytorch_model.bin'

    vis_config = BertConfig.from_pretrained('bert-base-uncased')
    # img_feature_dim was 2176 for the original model; this notebook uses 4096.
    extra_settings = (
        ("img_feature_dim", 4096),
        ("img_feature_type", ""),
        ("vl_layers", 4),
        ("la_layers", 9),
    )
    for attr_name, attr_value in extra_settings:
        setattr(vis_config, attr_name, attr_value)

    # Silence transformers warnings (e.g. about size-mismatched weights).
    logging.set_verbosity_error()

    # Per the original author's note: the mismatched visn_fc.weight is randomly
    # initialised here and then fine-tuned to adapt to the 4096-dim features.
    return VLNBert.from_pretrained(
        pretrained_weights,
        config=vis_config,
        ignore_mismatched_sizes=True,
    )
# from src.param import args

def pad_instr_tokens(instr_tokens, maxlength=20):
    """Frame a token list with [CLS]/[SEP] and pad it to ``maxlength``.

    Args:
        instr_tokens: List of word-piece tokens (strings). Not modified.
        maxlength: Total length of the returned sequence, special tokens
            included.

    Returns:
        ``None`` when ``instr_tokens`` has two tokens or fewer; otherwise a
        tuple ``(tokens, num_words)`` where ``tokens`` has exactly
        ``maxlength`` entries and ``num_words`` counts the entries before
        padding ([CLS] and [SEP] included).
    """
    # Instructions of two tokens or fewer are rejected.
    if len(instr_tokens) <= 2:
        return None

    # Reserve two slots for [CLS] and [SEP]; slicing is a no-op (copy) when the
    # instruction already fits.
    content = instr_tokens[:maxlength - 2]

    framed = ['[CLS]'] + content + ['[SEP]']
    num_words = len(framed)  # includes [CLS] and [SEP]

    padded = framed + ['[PAD]'] * (maxlength - num_words)
    assert len(padded) == maxlength

    return padded, num_words
# RGB transform: resize to 224x224, convert to a tensor, then normalise each
# channel with the mean/std values listed below.
rgb_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Depth transform
def depth_transform(depth, mean=1.94, std=1.43, size=(224, 224), fill_value=None):
    """Clean, standardise and resize a depth map.

    Non-finite entries (NaN, +inf, -inf) are replaced by ``fill_value``, the
    array is converted to a float32 tensor, standardised as
    ``(depth - mean) / std`` and bilinearly resized to ``size``.

    Args:
        depth: NumPy array holding the depth map. A 2-D ``(H, W)`` array is
            expected, since two leading dimensions are added before the
            bilinear resize.
        mean: Value subtracted during standardisation (default 1.94, the
            constant used by the original notebook).
        std: Divisor used during standardisation (default 1.43).
        size: Output ``(height, width)`` passed to ``F.interpolate``.
        fill_value: Replacement for non-finite entries. ``None`` (default)
            means "use ``mean``", which reproduces the original behaviour of
            filling with 1.94 so that such pixels become 0 after
            standardisation.

    Returns:
        A float32 tensor; for a 2-D input its shape is
        ``(1, size[0], size[1])``.
    """
    if fill_value is None:
        fill_value = mean

    # Replace NaN/inf before any arithmetic so they cannot spread through the
    # interpolation.
    depth = np.where(np.isfinite(depth), depth, fill_value)
    depth = torch.from_numpy(depth).float()
    depth = (depth - mean) / std

    # (H, W) -> (1, 1, H, W) for the resize, then drop the leading dimension.
    depth = depth.unsqueeze(0).unsqueeze(0)
    depth = F.interpolate(depth, size=size, mode='bilinear', align_corners=False).squeeze(0)
    return depth
# Set device: use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Tokenizer for the 'bert-base-uncased' vocabulary, used to encode instructions.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Using device: cuda




Model loaded successfully.


In [None]:
import random

In [None]:
import os
import random
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define checkpoint steps
checkpoint_steps = [1000, 2000, 3000, 4000, 6000, 8000, 10000]

# Define feature directory
feature_dir = "../VLN-Go2-Matterport"

# List available episodes. os.listdir() returns entries in arbitrary order, so
# the list is sorted: the episode index is used to name the report folder
# below and must refer to the same episode on every run.
episode_dir = os.path.join(feature_dir, "training_data")
episodes = sorted(
    name for name in os.listdir(episode_dir)
    if os.path.isdir(os.path.join(episode_dir, name))
)
if not episodes:
    # Without this, random.randint(0, -1) fails with an unrelated-looking error.
    raise FileNotFoundError(f"No episode folders found in {episode_dir}")
print(f"Found {len(episodes)} episodes. Selecting one for analysis.")

# Select one episode for consistency across all checkpoints.
# The choice is random; call random.seed(...) beforehand to pin it.
epidx = random.randint(0, len(episodes) - 1)
episode_name = episodes[epidx]
episode_path = os.path.join(episode_dir, episode_name)
print(f"Selected episode: {episode_name}")

# Load episode data
with open(os.path.join(episode_path, "instructions.txt"), encoding="utf-8") as f:
    instruction = f.read().strip()

rgb_dir = os.path.join(episode_path, "rgbs")
depth_dir = os.path.join(episode_path, "depths")
# NOTE(review): frames are ordered by plain string sort of the file names;
# this is only chronological if the names are zero-padded or otherwise sort
# lexicographically — confirm against the dataset.
rgb_paths = sorted(os.path.join(rgb_dir, name) for name in os.listdir(rgb_dir) if name.endswith(".png"))
depth_paths = sorted(os.path.join(depth_dir, name) for name in os.listdir(depth_dir) if name.endswith(".npy"))
if len(depth_paths) < len(rgb_paths):
    # The replay indexes depth_paths by RGB frame index; fail here with a clear
    # message rather than with an IndexError after a model has been loaded.
    raise ValueError(
        f"Episode {episode_name} has {len(rgb_paths)} RGB frames but only "
        f"{len(depth_paths)} depth frames"
    )

with open(os.path.join(episode_path, "actions.txt"), encoding="utf-8") as f:
    gt_actions = [line.strip() for line in f]

print(f"Instruction: {instruction}")
print(f"Number of steps: {len(rgb_paths)}")
print(f"Ground truth actions: {gt_actions}")

# Create the report folder for this episode
report_dir = f"report/report_{epidx}"
os.makedirs(report_dir, exist_ok=True)

# Tokenize and encode instruction (truncated/padded to 20 tokens)
instr_tokens = tokenizer.tokenize(instruction)
padded = pad_instr_tokens(instr_tokens, 20)
if padded is None:
    # pad_instr_tokens returns None for instructions of two tokens or fewer;
    # unpacking that directly would raise an unhelpful TypeError.
    raise ValueError(
        f"Instruction of episode {episode_name} is too short to encode: {instruction!r}"
    )
padded_tokens, _ = padded
instr_encoding = tokenizer.convert_tokens_to_ids(padded_tokens)
instr_encoding = torch.tensor(instr_encoding, dtype=torch.long).unsqueeze(0).to(device)
# 1.0 for real tokens, 0.0 for [PAD] positions
lang_mask = (instr_encoding != tokenizer.pad_token_id).float().to(device)

print(f"Instruction tokens: {padded_tokens[:10]}... (total length: {len(padded_tokens)})")

# Loop over checkpoint steps.
# For every checkpoint: build the model, load the checkpoint weights, replay the
# selected episode frame by frame while recording attention weights through
# forward hooks, then save three figures (attention heatmap, action
# probabilities, state-to-visual attention) into report_dir.
for step in checkpoint_steps:
    print(f"Processing checkpoint step {step}")
    
    # Load model with checkpoint
    # NOTE(review): the model is rebuilt from the pretrained weights on every
    # iteration before load_state_dict() overwrites them with the checkpoint.
    checkpoint_path = f"/home/prj21/fyp/Recurrent-VLN-BERT-Isaac/checkpoints/navigation_PREVALENT/2025-03-07_20-02-28/checkpoint_{step}.pt"
    model = get_vlnbert_models().to(device)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    # print(f"Model loaded successfully for checkpoint {step}.")
    
    # Reset lists for attention scores and action probabilities
    # (fresh lists per checkpoint; the hooks below append to them)
    cross_attention_scores_list = []
    vis_self_attention_scores_list = []
    action_probs_list = []
    
    # Hook functions
    # Each hook stores element [1] of the hooked module's output on the CPU.
    # Assumes that element is the attention-probability tensor — TODO confirm
    # against the attention module implementation.
    def cross_hook(module, input, output):
        cross_attention_scores_list.append(output[1].detach().cpu())
    
    def vis_self_hook(module, input, output):
        vis_self_attention_scores_list.append(output[1].detach().cpu())
    
    # Register hooks on the last LXRTXLayer
    cross_handle = model.addlayer[-1].visual_attention.register_forward_hook(cross_hook)
    vis_self_handle = model.addlayer[-1].visn_self_att.register_forward_hook(vis_self_hook)
    # print("Hooks registered for attention scores.")
    
    # Process the episode
    # The instruction is encoded once per checkpoint ("language" mode) ...
    with torch.no_grad():
        pooled_output, lang_output = model("language", input_ids=instr_encoding, lang_mask=lang_mask)
    
    # ... then each frame is fed in "visual" mode. The language output returned
    # by one step is passed as input to the next step.
    for t in range(len(rgb_paths)):
        rgb = rgb_transform(Image.open(rgb_paths[t])).unsqueeze(0).to(device)
        depth = depth_transform(np.load(depth_paths[t])).unsqueeze(0).to(device)
        # All-ones mask of shape (1, 1): presumably one unmasked visual token
        # per step — confirm against the model's "visual" branch.
        vis_mask = torch.ones(1, 1).to(device)
        
        with torch.no_grad():
            pooled_output, action_logits, next_lang_output = model("visual", lang_output, lang_mask=lang_mask, vis_mask=vis_mask, rgb=rgb, depth=depth)
        
        action_probs = F.softmax(action_logits, dim=-1).squeeze(0).cpu().numpy()
        action_probs_list.append(action_probs)
        
        lang_output = next_lang_output
    
    # Remove hooks
    # NOTE(review): not reached if a step above raises; the hooks then stay
    # attached to this iteration's model object.
    cross_handle.remove()
    vis_self_handle.remove()
    # print(f"Processed {len(rgb_paths)} steps for checkpoint {step}.")
    
    # Process cross-attention scores (state to instruction)
    # The plots below assume one captured tensor per frame, i.e. each hook
    # fires exactly once per "visual" call — TODO confirm.
    seq_len = instr_encoding.size(1)
    T = len(rgb_paths)
    state_to_instr_attention = []
    for scores in cross_attention_scores_list:
        attn = scores.mean(dim=1).squeeze(0)  # Average over heads: (2, seq_len) per the original note
        state_to_instr_attention.append(attn[0].numpy())  # Row 0 is taken as the state token: (seq_len,)
    state_to_instr_attention = np.stack(state_to_instr_attention)  # (T, seq_len)
    
    # Process visual self-attention scores
    vis_self_attention = []
    for scores in vis_self_attention_scores_list:
        attn = scores.mean(dim=1).squeeze(0)  # (2, 2) per the original note
        vis_self_attention.append(attn[0, 1].numpy())  # State to visual attention
    vis_self_attention = np.array(vis_self_attention)  # (T,)
    
    # Plot and save attention heatmap (rows: time steps, columns: instruction tokens)
    plt.figure(figsize=(15, 8))
    sns.heatmap(state_to_instr_attention, cmap="YlGnBu", 
                xticklabels=tokenizer.convert_ids_to_tokens(instr_encoding[0].cpu().numpy()),
                yticklabels=[f"Step {i}" for i in range(T)],
                cbar_kws={'label': 'Attention Weight'})
    plt.title(f"State-to-Instruction Attention Over Time (Step {step})", fontsize=16)
    plt.xlabel("Instruction Tokens", fontsize=12)
    plt.ylabel("Time Steps", fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(os.path.join(report_dir, f"attention_heatmap_step{step}.png"))
    plt.close()
    
    # Stack action probabilities and plot
    # Column i of action_probs is plotted under the label actions[i].
    action_probs = np.stack(action_probs_list)  # (T, 3)
    actions = ["Move forward", "Turn left", "Turn right"]
    plt.figure(figsize=(10, 6))
    for i, action in enumerate(actions):
        plt.plot(action_probs[:, i], label=action, marker='o')
    plt.title(f"Action Probabilities Over Time (Step {step})", fontsize=16)
    plt.xlabel("Time Steps", fontsize=12)
    plt.ylabel("Probability", fontsize=12)
    plt.legend(title="Actions")
    plt.grid(True, linestyle='--', alpha=0.7)
    # Disabled: annotate each time step with its ground-truth action.
    # if gt_actions:
    #     for t in range(min(len(gt_actions), len(rgb_paths))):
    #         plt.text(t, 0.04, gt_actions[t], rotation=45, ha='center', color='black')
    plt.tight_layout()
    plt.savefig(os.path.join(report_dir, f"action_probs_step{step}.png"))
    plt.close()
    
    # Plot and save state-to-visual attention
    plt.figure(figsize=(10, 6))
    plt.plot(vis_self_attention, label="State to Visual Attention", marker='o', color='purple')
    plt.title(f"State-to-Visual Self-Attention Over Time (Step {step})", fontsize=16)
    plt.xlabel("Time Steps", fontsize=12)
    plt.ylabel("Attention Weight", fontsize=12)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(os.path.join(report_dir, f"vis_self_attention_step{step}.png"))
    plt.close()

# Generate markdown report.
# The report lists the episode details, then one section per checkpoint with
# the three figures saved for that checkpoint side by side. Image paths are
# relative, so report.md must stay in the same folder as the PNG files.
report_parts = [
    "# Data-Time Report\n\n",
    "## Episode Details\n\n",
    f"- **Episode:** {episode_name}\n",
    f"- **Instruction:** {instruction}\n",
    f"- **Ground Truth Actions:** {', '.join(gt_actions)}\n\n",
]

# (file-name prefix, alt text) for each figure, in column order.
figure_columns = (
    ("attention_heatmap", "Attention Heatmap"),
    ("action_probs", "Action Probabilities"),
    ("vis_self_attention", "Visual Self-Attention"),
)

for step in checkpoint_steps:
    report_parts.append(f"## Checkpoint {step}\n\n")
    report_parts.append('<table style="table-layout: fixed; width: 100%;">\n')
    report_parts.append("  <tr>\n")
    for file_prefix, alt_text in figure_columns:
        report_parts.append(
            f'    <td><img src="{file_prefix}_step{step}.png" alt="{alt_text}" style="width:100%;"/></td>\n'
        )
    report_parts.append("  </tr>\n")
    report_parts.append("</table>\n\n")

md_content = "".join(report_parts)

# Write with an explicit encoding: the instruction text is free-form and the
# platform default encoding may not be able to represent every character.
with open(os.path.join(report_dir, "report.md"), "w", encoding="utf-8") as f:
    f.write(md_content)

print("Report generated successfully.")

Found 869 episodes. Selecting one for analysis.
Selected episode: 2025-03-04_11-57-47_scene_QUCTc6BB5sX_episode_662
Instruction: Enter the bedroom, and then exit on the far side of the bedroom. Walk across the hall and enter the adjacent bedroom.
Number of steps: 42
Ground truth actions: ['Turn right', 'Turn right', 'Turn right', 'Move forward', 'Move forward', 'Turn right', 'Move forward', 'Move forward', 'Move forward', 'Turn right', 'Move forward', 'Turn right', 'Turn right', 'Turn right', 'Move forward', 'Turn right', 'Turn right', 'Move forward', 'Move forward', 'Turn right', 'Move forward', 'Turn right', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Turn right', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Move forward', 'Turn left', 'Move forward', 'Move forward']
Instruction tokens: ['[CLS]', 'enter', 'the', 'bedroom', ',', 'and'



Processing checkpoint step 2000




Processing checkpoint step 3000




Processing checkpoint step 4000




Processing checkpoint step 6000




Processing checkpoint step 8000




Processing checkpoint step 10000




Report generated successfully.
