In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import pickle

In [None]:
# Runtime configuration shared by the cells below.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MODEL_PATH = "meta-llama/Llama-2-7b-chat-hf"

# The reconstruction step later in this notebook starts from torch.randn
# embeddings; seeding the torch RNG here makes those random starting points
# repeatable when the notebook is re-run from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

print(f"Using device: {DEVICE}")

In [None]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Padding to a fixed length needs a pad token; reuse EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Request half precision only when a GPU is available. On the CPU fallback
# path float16 kernels are slow and, depending on the PyTorch build, not
# implemented for every op, so use float32 there instead.
on_gpu = DEVICE.type == 'cuda'
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if on_gpu else torch.float32,
    device_map='auto' if on_gpu else None
)
# Switch to evaluation mode; the trailing semicolon stops the notebook from
# echoing the full module repr as the cell output.
model.eval();


In [None]:
# Helper function to capture activations
def get_activation(name, activations_dict):
    """Build a forward hook that stores a module's output under ``name``.

    Args:
        name: Key used when writing into ``activations_dict``.
        activations_dict: Mutable mapping that receives the captured tensor.

    Returns:
        A ``hook(module, input, output)`` callable. When ``output`` is a
        tuple only its first element is kept. The stored tensor is a
        detached clone, so it does not alias the module's output.
    """
    def hook(module, input, output):
        # Tuple outputs: keep only the first element
        captured = output[0] if isinstance(output, tuple) else output
        activations_dict[name] = captured.detach().clone()
    return hook

In [None]:
# Generate sample inputs for analysis
def generate_sample_inputs(tokenizer, n_samples=100, seq_length=32, device=None):
    """Tokenize a small pool of prompt stems into fixed-length id tensors.

    The 15 base prompts are used verbatim for the first pass. Every later
    sample appends an ``" in <year> with"`` suffix, where the year is
    ``2020 + (i % 10)``, so that repeats of a stem differ from the first pass.

    Args:
        tokenizer: Callable tokenizer; invoked with ``return_tensors="pt"``,
            ``padding='max_length'``, ``truncation=True`` and
            ``max_length=seq_length``. Its result must expose ``input_ids``.
        n_samples: Number of prompts to produce.
        seq_length: Length every prompt is padded/truncated to.
        device: Device the id tensors are moved to. When ``None`` the
            notebook-level ``model.device`` is used, as before.

    Returns:
        A list of ``n_samples`` ``input_ids`` tensors (one prompt each).
        Only the ids are returned; the tokenizer's attention mask is dropped.
    """
    sample_texts = [
        "The future of artificial intelligence",
        "Climate change affects global weather",
        "Machine learning algorithms can",
        "Deep neural networks are",
        "Natural language processing enables",
        "Computer vision systems detect",
        "Quantum computing will revolutionize",
        "Blockchain technology provides",
        "Renewable energy sources include",
        "Medical research has shown",
        "Space exploration reveals",
        "Economic policies influence",
        "Educational systems should",
        "Transportation networks connect",
        "Communication technologies enable"
    ]

    # Resolve the target device once instead of on every iteration.
    target_device = model.device if device is None else device

    inputs = []
    for i in range(n_samples):
        # Cycle through the base prompts
        base_text = sample_texts[i % len(sample_texts)]

        # Vary every repeat of a stem. The comparison must be `>=`: with `>`
        # the first repeated index (i == len(sample_texts)) would duplicate
        # sample 0 exactly.
        if i >= len(sample_texts):
            base_text = base_text + f" in {2020 + (i % 10)} with"

        tokenized = tokenizer(
            base_text,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=seq_length
        )
        inputs.append(tokenized.input_ids.to(target_device))

    return inputs

In [None]:
# Generate activation differences for Llama-2 layers
def _forward_with_capture(model, layer_names, *args, **kwargs):
    """Run one no-grad forward pass; return (logits, {layer_name: output})."""
    captured = {}
    handles = []
    try:
        for layer_name in layer_names:
            handles.append(
                model.get_submodule(layer_name).register_forward_hook(
                    get_activation(layer_name, captured)
                )
            )
        with torch.no_grad():
            logits = model(*args, **kwargs).logits
    finally:
        # Detach the hooks even if the forward pass raised; otherwise they
        # would keep firing (and cloning tensors) on every later model call.
        for handle in handles:
            handle.remove()
    return logits, captured


def _optimize_embeddings(model, target_logits, seq_length, n_iterations, lr,
                         sample_idx, recon_idx):
    """Fit random input embeddings so the model's logits match ``target_logits``."""
    # Optimise continuous embeddings rather than token ids so gradients exist.
    # The leaf is kept in float32 and cast to the model dtype per forward pass.
    reconstructed_embeddings = torch.randn(
        1, seq_length, model.config.hidden_size,
        device=model.device,
        dtype=torch.float32,
        requires_grad=True
    )
    optimizer = optim.Adam([reconstructed_embeddings], lr=lr)
    target = target_logits.float()

    for iteration in range(n_iterations):
        optimizer.zero_grad()

        output = model(inputs_embeds=reconstructed_embeddings.to(model.dtype)).logits

        # Match the original logits, with a small L2 penalty on the embeddings
        loss = nn.functional.mse_loss(output.float(), target)
        reg_loss = 0.001 * torch.mean(reconstructed_embeddings ** 2)
        total_loss = loss + reg_loss

        total_loss.backward()
        optimizer.step()

        if iteration % 100 == 0:
            # tqdm.write keeps the outer progress bar intact
            tqdm.write(f"Sample {sample_idx}, Recon {recon_idx}, Iter {iteration}, Loss: {total_loss.item():.6f}")

    return reconstructed_embeddings.detach()


def generate_activation_differences_llama(model, X_data, n_samples=50, n_reconstructions=3,
                                          layer_names=None, n_iterations=500, lr=0.01):
    """Compare layer activations of real inputs with logit-matched reconstructions.

    For each of the first ``min(n_samples, len(X_data))`` inputs, the original
    logits and the outputs of the selected layers are recorded. Then, for each
    of ``n_reconstructions`` attempts, random input embeddings are optimised
    with Adam so that the model's logits match the original ones, and the
    selected layers' outputs for those embeddings are compared element-wise
    with the originals.

    Model parameters are frozen for the duration of the call and their
    ``requires_grad`` flags are restored afterwards. Only the candidate
    embeddings need gradients; leaving the weights trainable would allocate
    and accumulate a gradient for every parameter on each backward pass.

    Args:
        model: Causal LM exposing ``config.hidden_size``, ``device``, ``dtype``
            and accepting ``inputs_embeds``.
        X_data: Sequence of ``input_ids`` tensors; dimension 1 is used as the
            sequence length.
        n_samples: Maximum number of inputs from ``X_data`` to process.
        n_reconstructions: Independent reconstruction attempts per input.
        layer_names: Dotted submodule paths to hook. Defaults to the first
            three transformer layers. Result columns are keyed by the last
            path component, so names should differ in that component.
        n_iterations: Optimisation steps per reconstruction attempt.
        lr: Adam learning rate.

    Returns:
        ``pandas.DataFrame`` with one row per (sample, reconstruction). Its
        columns are ``sample_idx``, ``reconstruction_idx``, then
        ``layer_<k>_{min,mean,max}_abs_diff`` per layer, and finally
        ``all_layers_max_diff`` / ``all_layers_min_of_max`` (largest and
        smallest of the per-layer maxima).
    """
    if layer_names is None:
        layer_names = [
            'model.layers.0',  # First transformer layer
            'model.layers.1',  # Second transformer layer
            'model.layers.2',  # Third transformer layer
        ]

    results = []

    # Remember each parameter's flag so the model is handed back unchanged.
    grad_flags = [(param, param.requires_grad) for param in model.parameters()]
    for param, _ in grad_flags:
        param.requires_grad_(False)

    try:
        for sample_idx in tqdm(range(min(n_samples, len(X_data))), desc="Processing samples"):
            original_input = X_data[sample_idx]
            seq_length = original_input.shape[1]

            original_output, original_activations = _forward_with_capture(
                model, layer_names, original_input
            )

            for recon_idx in range(n_reconstructions):
                reconstructed_embeddings = _optimize_embeddings(
                    model, original_output, seq_length, n_iterations, lr,
                    sample_idx, recon_idx
                )

                _, reconstructed_activations = _forward_with_capture(
                    model, layer_names,
                    inputs_embeds=reconstructed_embeddings.to(model.dtype)
                )

                row = {'sample_idx': sample_idx, 'reconstruction_idx': recon_idx}
                all_layer_max_diffs = []

                for layer_name in layer_names:
                    if (layer_name not in original_activations
                            or layer_name not in reconstructed_activations):
                        continue

                    abs_diff = torch.abs(
                        original_activations[layer_name].flatten().float()
                        - reconstructed_activations[layer_name].flatten().float()
                    )
                    layer_max = abs_diff.max().item()

                    layer_short = layer_name.split('.')[-1]  # Get layer number
                    row[f'layer_{layer_short}_min_abs_diff'] = abs_diff.min().item()
                    row[f'layer_{layer_short}_mean_abs_diff'] = abs_diff.mean().item()
                    row[f'layer_{layer_short}_max_abs_diff'] = layer_max

                    all_layer_max_diffs.append(layer_max)

                # Worst and best per-layer maximum across the hooked layers
                if all_layer_max_diffs:
                    row['all_layers_max_diff'] = max(all_layer_max_diffs)
                    row['all_layers_min_of_max'] = min(all_layer_max_diffs)

                results.append(row)
    finally:
        for param, flag in grad_flags:
            param.requires_grad_(flag)

    return pd.DataFrame(results)

In [None]:
# Build the tokenized prompts used as reconstruction targets
print("Generating sample inputs...")
X_data = generate_sample_inputs(tokenizer, seq_length=16, n_samples=20)
print(f"Generated {len(X_data)} samples")

In [None]:
# Run the reconstruction experiment: first 10 prompts, 3 attempts each
print("Generating activation differences for Llama-2 layers...")
results = generate_activation_differences_llama(
    model,
    X_data,
    n_samples=10,
    n_reconstructions=3,
)

In [None]:
# Persist the per-reconstruction metrics, then show a short preview
results.to_csv('llama2_activation_diff_results.csv', index=False)
print(f"Results saved. Shape: {results.shape}")
print("\nFirst few rows:", results.head(), sep="\n")

In [None]:
# Thresholds swept along the x-axis of every panel
thresholds = np.logspace(-6, 0, 100)  # From 1e-6 to 1

layers = ['0', '1', '2']  # Layer numbers
metrics = ['min_abs_diff', 'mean_abs_diff', 'max_abs_diff']
colors = {'min_abs_diff': 'blue', 'mean_abs_diff': 'green', 'max_abs_diff': 'red'}


def _pass_curve(column_values):
    """Percentage of entries that are <= each value in ``thresholds``."""
    return [np.sum(column_values <= t) / len(column_values) * 100 for t in thresholds]


def _decorate_panel(ax, title, **legend_kwargs):
    """Add the reference lines, labels, legend and grid shared by all panels."""
    ax.axvline(x=0.007, color='black', linestyle='--', alpha=0.5, label='0.007 threshold')
    for level in (90, 95, 99):
        ax.axhline(y=level, color='gray', linestyle=':', alpha=0.5)

    ax.set_xlabel('Threshold Value', fontsize=12)
    ax.set_ylabel('Percentage Passing (%)', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(**legend_kwargs)
    ax.grid(True, alpha=0.3)
    ax.set_ylim(0, 105)


# 2x2 grid: three per-layer panels plus one combined panel
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Llama-2 Activation Reconstruction Analysis', fontsize=16)

# Per-layer panels fill top-left, top-right, bottom-left (at most three)
for layer, ax in zip(layers, axes.ravel()[:3]):
    for metric in metrics:
        column = f'layer_{layer}_{metric}'
        if column not in results.columns:
            continue
        # Cumulative share of reconstructions within each threshold
        ax.semilogx(thresholds, _pass_curve(results[column].values),
                    label=f'Layer {layer} - {metric.replace("_abs_diff", "").capitalize()}',
                    color=colors[metric], linewidth=2, alpha=0.7)

    _decorate_panel(ax, f'Transformer Layer {layer}', fontsize=8)

# Bottom-right panel: every layer must be within the threshold at once
ax = axes[1, 1]

if 'all_layers_max_diff' in results.columns:
    ax.semilogx(thresholds, _pass_curve(results['all_layers_max_diff'].values),
                label='ALL Layers Must Pass (Worst Case)',
                color='darkred', linewidth=3)

    # Per-layer maxima drawn faintly for comparison
    for layer in layers:
        column = f'layer_{layer}_max_abs_diff'
        if column in results.columns:
            ax.semilogx(thresholds, _pass_curve(results[column].values),
                        label=f'Layer {layer} only',
                        linewidth=1, alpha=0.5, linestyle='--')

_decorate_panel(ax, 'ALL Layers Simultaneous Pass Rate')

plt.tight_layout()
plt.savefig('llama2_activation_reconstruction_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# %%
# Tabulate pass rates at a handful of fixed thresholds
threshold_values = [0.001, 0.005, 0.007, 0.01, 0.05, 0.1]
analysis_results = []

# Per-layer rows: one row per (layer, threshold), one cell per available metric
for layer in layers:
    available = [
        (metric, f'layer_{layer}_{metric}')
        for metric in metrics
        if f'layer_{layer}_{metric}' in results.columns
    ]
    for threshold in threshold_values:
        row = {'Layer': f'Layer {layer}', 'Threshold': threshold}
        for metric, column in available:
            values = results[column].values
            passing_percentage = np.sum(values <= threshold) / len(values) * 100
            cell_name = f'{metric.replace("_abs_diff", "").capitalize()} Pass %'
            row[cell_name] = f'{passing_percentage:.1f}%'
        analysis_results.append(row)

# Combined rows: only the worst-case (max) column is meaningful here
if 'all_layers_max_diff' in results.columns:
    values = results['all_layers_max_diff'].values
    for threshold in threshold_values:
        passing_percentage = np.sum(values <= threshold) / len(values) * 100
        analysis_results.append({
            'Layer': 'ALL LAYERS',
            'Threshold': threshold,
            'Min Pass %': '-',
            'Mean Pass %': '-',
            'Max Pass %': f'{passing_percentage:.1f}%',
        })

# Assemble, print and save the table
threshold_df = pd.DataFrame(analysis_results)
print("\nLlama-2 Threshold Analysis Table:")
print("="*80)
print(threshold_df.to_string(index=False))

threshold_df.to_csv('llama2_threshold_analysis.csv', index=False)

# %%
# Text summary of the combined (all-layers) worst-case differences
print("\n" + "="*80)
print("LLAMA-2 ACTIVATION RECONSTRUCTION ANALYSIS SUMMARY")
print("="*80)

if 'all_layers_max_diff' in results.columns:
    values = results['all_layers_max_diff'].values

    print(f"\nDataset size: {len(results)} reconstruction attempts")
    print(f"Layers analyzed: {layers}")

    # Threshold that would let a given share of reconstructions pass
    pass_rates = [90, 95, 99]
    print("\nThreshold needed for target pass rates (ALL LAYERS):")

    if len(values) > 0:
        for rate in pass_rates:
            threshold_for_rate = np.percentile(values, rate)
            print(f"  {rate}% pass rate: {threshold_for_rate:.6f}")

    # Share of reconstructions passing at a few fixed thresholds
    print("\nPass rates at specific thresholds (ALL LAYERS):")
    for threshold in (0.001, 0.007, 0.01, 0.1):
        within = np.sum(values <= threshold)
        pass_rate = within / len(values) * 100
        print(f"  Threshold {threshold}: {pass_rate:.1f}% pass rate")

print("\nAnalysis completed! Check the generated plots and CSV files for detailed results.")