In [1]:
# Now you can import modules from this directory
import torch
import os
from src.plotter import EigenvectorPlotter
from src.heist import load_model
import imageio
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
from src.helpers import action_space, evaluate_model
from collections import Counter
import random
from procgen import ProcgenEnv
from src.vec_env import VecExtractDictObs, VecMonitor, VecNormalize
from src.bilinear_impala_simplified import BimpalaCNN, TopKBimpalaCNN
import matplotlib.pyplot as plt
from src.heist import create_venv as create_venv_simple
import einops
from src.utils import *
from src.helpers import ModelActivations, get_model_layer_names
import multiprocessing
from multiprocessing import Pool, set_start_method
from tqdm import tqdm
from functools import partial
import pickle
import src.ppo

In [2]:

# Load the trained policy checkpoint and print its module tree.
model_path = "/mnt/ssd-1/mechinterp/narmeen/bilinear_experiments_official/bilinear_experiments/bilinear_models/bimpala_maze_simplified.pt"
# NOTE(review): the second argument (7) is presumably the conv kernel size - confirm against src.heist.load_model.
model = load_model(model_path, 7)
print(model)

# Take the parameter snapshot once: it is listed key by key here and reused
# below for weight lookups.
state_dict = model.state_dict()
for k in state_dict:
    print(k)

Model loaded from /mnt/ssd-1/mechinterp/narmeen/bilinear_experiments_official/bilinear_experiments/bilinear_models/bimpala_maze_simplified.pt
BimpalaCNN(
  (conv): Conv2d(3, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3))
  (conv_seqs): ModuleList(
    (0-2): 3 x ConvSequence(
      (max_pool2d): MaxPool2d(kernel_size=7, stride=2, padding=3, dilation=1, ceil_mode=False)
      (res_block0): ResidualBlock(
        (conv0): Conv2d(32, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
        (conv1): Conv2d(32, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
      )
      (res_block1): ResidualBlock(
        (conv0): Conv2d(32, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
        (conv1): Conv2d(32, 32, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), bias=False)
      )
    )
  )
  (hidden_fc1): Linear(in_features=2048, out_features=256, bias=False)
  (hidden_fc2): Linear(in_features=2048, out_features=256, bias=False)
  

  state_dict = torch.load(model_path)


In [3]:
from src.ppo import Config, PPO, create_venv,create_gif
# Pick the GPU named in the PPO config when CUDA is available, otherwise the CPU.
env_config = Config()
if torch.cuda.is_available():
    device = torch.device(f'cuda:{env_config.gpu}')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Move the loaded model to that device and wrap it in the PPO agent.
agent = PPO(model.to(device), device)

Using device: cuda:0


In [4]:
from src.ppo import rollout_episode_obs
def create_dataset_one_episode(model, device, num_levels=0, progress=20, env_name="maze"):
    """Roll out one episode on a randomly chosen start level.

    Args:
        model: Policy network handed to ``PPO``.
        device: Device handed to ``PPO``.
        num_levels: Forwarded to ``create_venv_simple``.
        progress: Forwarded to ``rollout_episode_obs``.
        env_name: Environment name forwarded to ``create_venv_simple``.

    Returns:
        Whatever ``rollout_episode_obs`` returns for the episode (callers in
        this notebook extend a list with it).
    """
    # Draw the start level first so every call sees a different level.
    start_level = random.randint(0, 1000000000)
    venv = create_venv_simple(
        num_envs=1,
        start_level=start_level,
        num_levels=num_levels,
        env_name=env_name,
    )
    policy = PPO(model, device)
    return rollout_episode_obs(policy, venv, progress=progress)
def create_dataset_multiple_episodes(model, device, num_levels=0):
    """Return a one-element list holding the reset observation of a fresh maze env.

    ``model`` and ``device`` are accepted but not used by this function.

    Args:
        model: Unused.
        device: Unused.
        num_levels: Forwarded to ``create_venv_simple``.

    Returns:
        ``[env.reset()]`` for a single-env "maze" venv on a random start level.
    """
    start_level = random.randint(0, 1000000000)
    venv = create_venv_simple(
        num_envs=1,
        start_level=start_level,
        num_levels=num_levels,
        env_name="maze",
    )
    first_observation = venv.reset()
    return [first_observation]

def get_activations(modelactivations: ModelActivations, observation, layers, device):
    """Run one observation through the model and return the cached activations.

    Args:
        modelactivations: Wrapper exposing ``run_with_cache(observation, layers)``.
        observation: Array-like observation; converted to a float32 tensor on ``device``.
        layers: Layer names forwarded to ``run_with_cache``.
        device: Device on which the observation tensor is created.

    Returns:
        The second element of the pair returned by ``run_with_cache``.
    """
    obs_tensor = torch.tensor(observation, dtype=torch.float32, device=device)
    _model_output, cache = modelactivations.run_with_cache(obs_tensor, layers)
    return cache

def get_activations_for_dataset(modelactivations, dataset, layers, device):
    """Pair every observation in ``dataset`` with its activation at the first requested layer.

    Args:
        modelactivations: Object exposing ``run_with_cache(observation, layers)``.
        dataset: Iterable of array-like observations.
        layers: Layer names; only ``layers[0]`` is read back from the cache,
            looked up with dots replaced by underscores.
        device: Device on which each observation tensor is created.

    Returns:
        List of ``(observation_tensor, activation)`` tuples, where ``activation``
        is element 0 of the cached entry for the first layer.
    """

    def _with_activation(observation):
        obs_tensor = torch.tensor(observation, dtype=torch.float32, device=device)
        _, cache = modelactivations.run_with_cache(obs_tensor, layers)
        cache_key = layers[0].replace('.', '_')
        # Index 0 drops the leading dimension of the cached entry.
        return obs_tensor, cache[cache_key][0]

    # tqdm renders a progress bar while the dataset is consumed.
    return [_with_activation(obs) for obs in tqdm(dataset, desc="Computing activations")]

def find_top_observations_by_eigenvector(dataset, eigenvalues, eigenvectors, topk=15, topk_images=50, device=None):
    """Rank observations by how strongly they excite the leading eigenvectors.

    The ``topk`` eigenpairs with the largest ``|eigenvalue|`` are selected.
    For each observation the score against eigenvector ``v`` with eigenvalue
    ``lam`` is ``lam * (activation . v) ** 2``.

    Args:
        dataset: Iterable of ``(observation, activation)`` pairs; each
            activation is flattened to 1-D before scoring.
        eigenvalues: 1-D tensor of eigenvalues.
        eigenvectors: 2-D tensor whose columns are the matching eigenvectors.
        topk: Number of eigenpairs to keep.
        topk_images: Number of highest-|score| observations kept per eigenvector.
        device: Optional device to move each activation to before scoring.
            When ``None`` (default) the activation stays on its own device and
            the eigenpairs are moved to it, so the function no longer depends
            on a notebook-global ``device``.

    Returns:
        ``(results, top_k_eigenvalues)`` where ``results[i]`` is a dict with
        ``'top_observations'`` (list of ``(observation, score)`` sorted by
        descending ``|score|``) and ``'max_score'`` (largest ``|score|`` seen,
        or ``-inf`` if the dataset was empty).
    """
    # Select the eigenpairs with the largest absolute eigenvalue.
    order = torch.argsort(torch.abs(eigenvalues), descending=True)
    topk_indices = order[:topk]
    top_k_eigenvalues = eigenvalues[topk_indices]
    top_k_eigenvectors = eigenvectors[:, topk_indices]

    results = {
        i: {'top_observations': [], 'max_score': float('-inf')}
        for i in range(top_k_eigenvectors.size(-1))
    }

    for observation, activation in tqdm(dataset, desc="Processing Observations"):
        activation_flat = activation.reshape(-1)
        if device is not None:
            activation_flat = activation_flat.to(device)

        # `.to` is a no-op when the tensors already live on the target device.
        vectors = top_k_eigenvectors.to(activation_flat.device)
        values = top_k_eigenvalues.to(activation_flat.device)
        sims = values * (activation_flat @ vectors) ** 2

        # One host transfer for all scores instead of one `.item()` per score.
        for i, value in enumerate(sims.tolist()):
            entry = results[i]
            # Track the magnitude: the original compared |value| against a
            # stored *signed* value, which gave order-dependent results for
            # negative scores.
            entry['max_score'] = max(entry['max_score'], abs(value))
            entry['top_observations'].append((observation, value))

    # Keep only the strongest `topk_images` observations per eigenvector.
    for entry in results.values():
        ranked = sorted(entry['top_observations'], key=lambda pair: abs(pair[1]), reverse=True)
        entry['top_observations'] = ranked[:topk_images]

    return results, top_k_eigenvalues
def generate_dataset(directory, progress=20, env_name="maze", num_batches=5, episodes_per_batch=2000):
    """Roll out episodes and pickle the collected observations in batches.

    One file per batch is written to
    ``<directory>/<env_name>_dataset_progress_<progress>_batch<j>.pickle``.

    Note: the rollouts use the notebook-level ``model`` and ``device``
    variables, so those must be defined before calling this function.

    Args:
        directory: Output directory; created if it does not exist.
        progress: Forwarded to ``create_dataset_one_episode`` and embedded in
            the file name.
        env_name: Environment name, forwarded and embedded in the file name.
        num_batches: Number of pickle files to write (default 5, the value
            that used to be hard-coded).
        episodes_per_batch: Episodes rolled out per file (default 2000, the
            value that used to be hard-coded).
    """
    os.makedirs(directory, exist_ok=True)
    for j in range(num_batches):
        dataset = []
        # tqdm shows per-batch progress over the episode rollouts.
        for _ in tqdm(range(episodes_per_batch), desc=f'Generating Dataset for Batch {j}'):
            dataset.extend(create_dataset_one_episode(model, device, progress=progress, env_name=env_name))
        output_path = os.path.join(directory, f'{env_name}_dataset_progress_{progress}_batch{j}.pickle')
        with open(output_path, 'wb') as file:
            pickle.dump(dataset, file)
def load_and_combine_datasets(num_files, dataset_folder="", progress=20, env_name="maze"):
    """Load the first ``num_files`` dataset pickles and concatenate their contents.

    Files are read from
    ``<dataset_folder>/datasets/<env_name>_dataset_progress_<progress>_batch<i>.pickle``
    for ``i`` in ``range(num_files)``.

    Args:
        num_files: Number of batch files to read, starting at batch 0.
        dataset_folder: Parent folder of the ``datasets`` directory.
        progress: Value embedded in the file names.
        env_name: Environment name embedded in the file names.

    Returns:
        A single list with the items of every loaded file, in batch order.
    """
    all_observations = []
    for i in range(num_files):
        relative_name = f'datasets/{env_name}_dataset_progress_{progress}_batch{i}.pickle'
        # SECURITY: unpickling executes arbitrary code - only load files you produced yourself.
        with open(os.path.join(dataset_folder, relative_name), 'rb') as handle:
            all_observations += pickle.load(handle)
    return all_observations

In [5]:
# One-off dataset generation, left disabled. Note it targets env_name='heist',
# while the load below uses the default env_name="maze" file names.
#generate_dataset(directory="datasets", env_name= 'heist')
# Read the 5 batch pickles (default progress=20, env_name="maze") into one list.
dataset = load_and_combine_datasets(5)
# Wrap the model so forward passes can return cached layer activations.
modelactivations = ModelActivations(model)
print(get_model_layer_names(model))
# Pair each observation with its activation at the last residual conv layer.
enhanced_dataset = get_activations_for_dataset(modelactivations, dataset,['conv_seqs.2.res_block1.conv1'],device)

# Sanity check: one (observation, activation) pair per loaded observation.
print(len(enhanced_dataset))

['conv', 'conv_seqs', 'conv_seqs.0', 'conv_seqs.0.max_pool2d', 'conv_seqs.0.res_block0', 'conv_seqs.0.res_block0.conv0', 'conv_seqs.0.res_block0.conv1', 'conv_seqs.0.res_block1', 'conv_seqs.0.res_block1.conv0', 'conv_seqs.0.res_block1.conv1', 'conv_seqs.1', 'conv_seqs.1.max_pool2d', 'conv_seqs.1.res_block0', 'conv_seqs.1.res_block0.conv0', 'conv_seqs.1.res_block0.conv1', 'conv_seqs.1.res_block1', 'conv_seqs.1.res_block1.conv0', 'conv_seqs.1.res_block1.conv1', 'conv_seqs.2', 'conv_seqs.2.max_pool2d', 'conv_seqs.2.res_block0', 'conv_seqs.2.res_block0.conv0', 'conv_seqs.2.res_block0.conv1', 'conv_seqs.2.res_block1', 'conv_seqs.2.res_block1.conv0', 'conv_seqs.2.res_block1.conv1', 'hidden_fc1', 'hidden_fc2', 'logits_fc', 'value_fc']


Computing activations: 100%|██████████| 24509/24509 [00:39<00:00, 626.06it/s]

24509





In [6]:
from src.heist import create_venv as create_env
def compute_B_sym(W, V, proj_out=None, type="mlp"):
    """Build the symmetrized bilinear interaction tensor of two weight tensors.

    For every output unit ``o`` the bilinear form is the outer product of the
    flattened weights, ``B[o] = outer(W[o], V[o])``, symmetrized as
    ``0.5 * (B[o] + B[o].T)`` so that ``x @ B[o] @ x == (W[o] . x) * (V[o] . x)``.

    Args:
        W: First weight tensor. ``(out, in)`` for ``type == "mlp"``; otherwise
            a conv weight whose leading axis is the output channel, e.g.
            ``(c_out, c_in, k, k)``.
        V: Second weight tensor with the same shape as ``W``.
        proj_out: Only used for ``type == "mlp"``. Optional ``(classes, out)``
            matrix that mixes the per-unit forms into per-class forms. When
            ``None`` the unprojected per-unit forms are returned (this used to
            raise).
        type: ``"mlp"`` selects the linear-layer path; any other value selects
            the conv path. (The name shadows the builtin but is kept for
            caller compatibility.)

    Returns:
        mlp: ``(classes, in, in)`` if ``proj_out`` is given, else ``(out, in, in)``.
        conv: ``(c_out, n, n)`` where ``n`` is the number of weights per
        output channel (``c_in * k * k``), flattened in ``(c_in, k, k)`` order.
    """
    if type == "mlp":
        # Per-unit outer products; the input width is taken from W, not hard-coded.
        B = torch.einsum("oi,oj->oij", W, V)
        B = 0.5 * B + 0.5 * B.transpose(-2, -1)
        if proj_out is None:
            return B
        B_proj = torch.einsum("ch,hij->cij", proj_out, B)
        # Re-symmetrize: the contraction can introduce tiny float asymmetries.
        return 0.5 * B_proj + 0.5 * B_proj.transpose(-2, -1)

    # Conv path. Flattening each output channel in (c_in, k, k) order and taking
    # one outer product yields exactly the block matrix the original nested
    # loops assembled, block (i, j) = outer(W[o, i].flat, V[o, j].flat).
    c_out = W.shape[0]
    W_flat = W.reshape(c_out, -1)
    V_flat = V.reshape(c_out, -1)
    B = torch.einsum("oi,oj->oij", W_flat, V_flat)
    return 0.5 * (B + B.transpose(-2, -1))

# Alternative (disabled): analyse the fully connected bilinear head instead.
#W, V, proj_out =model.hidden_fc1.weight, model.hidden_fc2.weight, model.logits_fc.weight
# Weights of the two convs in the last residual block (conv_seqs.2.res_block1).
W = state_dict['conv_seqs.2.res_block1.conv0.weight']
V = state_dict['conv_seqs.2.res_block1.conv1.weight']
# Per-output-channel symmetrized interaction matrices for that conv pair.
# This value is also captured as the default `B_sym` argument of
# initialize_models when that function is defined below.
B_sym = compute_B_sym(W=W, V=V, type="conv")

def initialize_models(obs_space, num_outputs, kernel_size, topk=2048, B_sym=B_sym):
    """Load the trained model and build its top-k variant.

    Args:
        obs_space: Observation space passed to ``TopKBimpalaCNN``.
        num_outputs: Number of outputs passed to ``TopKBimpalaCNN``.
        kernel_size: Kernel size passed to ``TopKBimpalaCNN``.
        topk: ``topk`` argument of ``TopKBimpalaCNN``.
        B_sym: Interaction tensor passed as ``B``. The default is the
            notebook-level ``B_sym`` as it existed when this function was
            defined, not at call time.

    Returns:
        ``(original_model, modified_model)``: the checkpoint loaded from disk
        and the ``TopKBimpalaCNN`` that received its parameters through
        ``transfer_params_from``.
    """
    # The original first constructed a throw-away BimpalaCNN that was
    # immediately overwritten by the loaded checkpoint; that dead work is gone.
    model_path = "/mnt/ssd-1/mechinterp/narmeen/bilinear_experiments_official/bilinear_experiments/bilinear_models/bimpala_maze_simplified.pt"
    # NOTE(review): load_model is called with a literal 7 rather than
    # `kernel_size` - presumably the same quantity; confirm before unifying.
    original_model = load_model(model_path, 7)

    # NOTE(review): "conv_seq_2" differs from the module name "conv_seqs.2"
    # printed by the model - confirm this is the key TopKBimpalaCNN expects.
    modified_model = TopKBimpalaCNN(
        obs_space,
        num_outputs,
        kernel_size=kernel_size,
        topk=topk,
        B=B_sym,
        replacement_layers=["conv_seq_2"],
    )
    modified_model.transfer_params_from(original_model)

    return original_model, modified_model

In [7]:
from collections import defaultdict
from src.ppo import rollout_episode,create_gif,PPO
def evaluate_model_multi_env(model, create_env, num_episodes=100,gif= None, device = torch.device(f'cuda:3' if torch.cuda.is_available() else 'cpu')):
    """Evaluate a model over several episodes, each in a freshly created environment.

    Args:
        model: Policy network; wrapped in a ``PPO`` agent together with ``device``.
        create_env: Zero-argument callable returning a new environment. It is
            called once per episode.
        num_episodes: Number of episodes to run; must be positive.
        gif: Optional file-name prefix. When truthy, the frames of episode
            ``e`` are written with ``create_gif`` to ``f"{gif}_{e}.gif"``.
        device: Device handed to ``PPO``. The default is evaluated once, when
            the function is defined, and points at GPU index 3 when CUDA is
            available. NOTE(review): other cells select the device from
            ``Config().gpu`` - pass ``device`` explicitly if they differ.

    Returns:
        Dict with:
        - ``mean_reward`` / ``std_reward``: mean and std of ``info['r']``
          reported by ``rollout_episode``;
        - ``min_reward`` / ``max_reward``: min and max of the reward returned
          by ``rollout_episode``;
        - ``percentage_success``: fraction (0..1) of episodes with reward > 0.

    Raises:
        ValueError: If ``num_episodes`` is not positive.
    """
    if num_episodes <= 0:
        raise ValueError("num_episodes must be a positive integer")

    agent = PPO(model=model, device=device)

    total_rewards = []
    episode_returns = []
    success_count = 0

    for episode in range(num_episodes):
        env = create_env()
        try:
            env.reset()
            frames, reward, info = rollout_episode(agent, env, return_info=True)
        finally:
            # Best-effort release of the per-episode environment so repeated
            # evaluations do not accumulate live environments.
            close_env = getattr(env, "close", None)
            if callable(close_env):
                close_env()

        total_rewards.append(reward)
        episode_returns.append(info['r'])
        if reward > 0:
            success_count += 1

        if gif:
            create_gif(frames, f"{gif}_{episode}.gif")

    return {
        "mean_reward": np.mean(episode_returns),
        "std_reward": np.std(episode_returns),
        "min_reward": np.min(total_rewards),
        "max_reward": np.max(total_rewards),
        "percentage_success": success_count / num_episodes,
    }

In [8]:
def plot_topk_results(results_list):
    """Plot mean reward (with std-dev error bars) and success rate against top-k.

    Args:
        results_list: Sequence of dicts, each providing ``'topk'``,
            ``'mean_reward'``, ``'std_reward'`` and ``'percentage_success'``.

    The x-axis is logarithmic, so a top-k of 0 is drawn at x = 0.1 while its
    tick label is built from the original value.
    """
    raw_topk = [entry['topk'] for entry in results_list]
    # Zero has no position on a log axis; park it at 0.1 instead.
    x_positions = [0.1 if entry['topk'] == 0 else entry['topk'] for entry in results_list]

    reward_means = [entry['mean_reward'] for entry in results_list]
    reward_stds = [entry['std_reward'] for entry in results_list]
    success = [entry['percentage_success'] for entry in results_list]

    _, ax1 = plt.subplots(figsize=(12, 6))

    # Left axis: mean reward with standard-deviation error bars.
    ax1.set_xlabel('Top-k Model')
    ax1.set_ylabel('Mean Reward', color='tab:blue')
    ax1.errorbar(
        x_positions,
        reward_means,
        yerr=reward_stds,
        fmt='o-',
        color='tab:blue',
        label='Mean Reward',
        capsize=5,
        capthick=2,
        ecolor='lightblue',
    )
    ax1.tick_params(axis='y', labelcolor='tab:blue')
    ax1.set_xscale('log')

    # Right axis: success rate, sharing the same x positions.
    ax2 = ax1.twinx()
    ax2.set_ylabel('Success Rate (%) out of 100 trials', color='tab:green')
    ax2.plot(x_positions, success, color='tab:green', marker='x', label='Success Rate')
    ax2.tick_params(axis='y', labelcolor='tab:green')

    # One tick per evaluated top-k, labelled with the original value.
    tick_labels = ['0' if value == 0.1 else str(value) for value in raw_topk]
    ax1.set_xticks(x_positions)
    ax1.set_xticklabels(tick_labels)

    plt.title('Mean Reward (with Std Dev) and Success Rate Over Different Top-k Values')

    ax1.legend(loc='upper left')
    ax2.legend(loc='upper right')

    # Avoid clipped tick labels, then render.
    plt.tight_layout()
    plt.show()

In [9]:
# Recompute the conv interaction tensor for conv_seqs.2.res_block1.
# These three statements are identical to the ones run right after
# compute_B_sym was defined, so this cell yields the same W, V and B_sym.
W = state_dict['conv_seqs.2.res_block1.conv0.weight']
V = state_dict['conv_seqs.2.res_block1.conv1.weight']
B_sym = compute_B_sym(W=W, V=V, type="conv")


In [10]:
# Sweep top-k by repeated halving: 1568, 784, ..., 3, 1, 0, 0 (13 settings).
# NOTE(review): the saved output of this cell ends in
# "UnboundLocalError: local variable 'top_k_eigenfilters' referenced before
# assignment". That name is not defined in this notebook, so the error
# presumably originates inside TopKBimpalaCNN - confirm and fix there.
topk = 1568
results = [None] * 13

# The observation/action spaces are the same for every setting, so create one
# probe environment up front instead of a new one per iteration.
env = create_env()
obs_space = env.observation_space
num_actions = env.action_space.n

for i in range(13):
    usual_model, modified_model = initialize_models(obs_space, num_actions, kernel_size=7, topk=topk, B_sym=B_sym)
    # Give every setting its own gif prefix; with a shared prefix each
    # iteration overwrote the previous "gif_0.gif".
    results[i] = evaluate_model_multi_env(
        model=modified_model,
        create_env=create_venv_simple,
        num_episodes=1,
        gif=f"gif_topk{topk}_run{i}",
    )
    results[i]['topk'] = topk
    topk = topk // 2

Model loaded from /mnt/ssd-1/mechinterp/narmeen/bilinear_experiments_official/bilinear_experiments/bilinear_models/bimpala_maze_simplified.pt
Parameters transferred and modified successfully


UnboundLocalError: local variable 'top_k_eigenfilters' referenced before assignment