# Model Fingerprinting Exploration

This notebook tests the hypothesis that models have unique "fingerprints" when queried with out-of-distribution inputs.

The idea: Different models trained with different hyperparameters, data ordering, and architectures should respond uniquely to unusual queries like:
- Nonsense token sequences: `"8fs234ks2"`
- Unusual phrases: `"purple cat people like"`
- Random character combinations

We'll query models and capture exactly 5 tokens to see if these responses are:
1. **Unique** - Different models produce different outputs
2. **Robust** - Same model produces consistent outputs

In [None]:
import gc
import sys
from typing import List, Dict, Any

import pandas as pd
import torch

# Make the project-local `utils` package importable from the notebook's directory
sys.path.append('..')
from utils.hf_models import HFModel

## Define Fingerprinting Function

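Create a function that queries a model and returns its first `n_tokens` generated tokens (5 by default). The output is only truncated to `n_tokens`, never padded, so a fingerprint can be shorter if the model returns fewer tokens.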

In [None]:
def get_model_fingerprint(model: HFModel,
                          prompt: str,
                          n_tokens: int = 5,
                          temperature: float = 0.0,
                          seed: int = 42) -> Dict[str, Any]:
    """
    Get a model's fingerprint: the first n_tokens it generates for a prompt.

    Args:
        model: HFModel instance (must expose `generate_with_token_probs`
            and `model_name`)
        prompt: Input prompt (ideally out-of-distribution)
        n_tokens: Maximum number of tokens to generate and keep (default: 5)
        temperature: Temperature for generation (0 is intended to mean
            greedy / deterministic decoding)
        seed: Seed applied to torch's global RNG before generating, so that
            sampled runs (temperature > 0) are repeatable

    Returns:
        Dictionary with keys:
            'model'    - model.model_name
            'prompt'   - the prompt that was passed in
            'tokens'   - generated tokens, at most n_tokens of them
            'text'     - the tokens concatenated with no separator
            'probs'    - per-token probabilities, at most n_tokens of them
            'n_tokens' - number of tokens actually kept (may be < n_tokens)

    Side effects:
        Re-seeds torch's global random number generator on every call.
    """
    # Previously `seed` was accepted but ignored, so sampling was unseeded.
    # NOTE(review): assumes HFModel samples via torch's global RNG - confirm.
    torch.manual_seed(seed)

    result = model.generate_with_token_probs(
        prompt=prompt,
        max_new_tokens=n_tokens,
        temperature=temperature,
    )

    # Cap the output at n_tokens. Slicing cannot pad, so a shorter generation
    # stays short; the real length is reported under 'n_tokens'.
    tokens = result['tokens'][:n_tokens]
    probs = result['probs'][:n_tokens]

    return {
        'model': model.model_name,
        'prompt': prompt,
        'tokens': tokens,
        'text': ''.join(tokens),
        'probs': probs,
        'n_tokens': len(tokens)
    }

In [None]:
def get_top_k_next_tokens(model: HFModel,
                          prompt: str,
                          k: int = 5) -> List[tuple]:
    """
    Return the model's k highest-ranked candidates for the next token.

    Unlike a generated fingerprint, this exposes the ranked next-token
    candidates directly rather than a single decoded continuation.

    Args:
        model: HFModel instance; the call is delegated to its
            `get_next_token_probs` method
        prompt: Input prompt
        k: How many candidates to request (forwarded as `top_k`)

    Returns:
        Whatever `model.get_next_token_probs` returns, expected to be a
        list of (token, probability) tuples
    """
    ranked_candidates = model.get_next_token_probs(prompt, top_k=k)
    return ranked_candidates

## Define Out-of-Distribution Test Prompts

These are unusual inputs that should produce unique responses across models.

In [None]:
# Out-of-distribution test prompts.
# A mix of alphanumeric noise, keyboard mashes, symbol runs and made-up
# word sequences. Order matters: later cells index into this list
# (OOD_PROMPTS[:3] for the model comparison, OOD_PROMPTS[0] for the
# similarity example).
OOD_PROMPTS = [
    "8fs234ks2",
    "purple cat people like",
    "zxqwvbnm",
    "@@@@####",
    "quantum jibberish nexus",
    "12345 abcde 67890",
    "|||>>><<<<|||",
    "glorp blip zort",
]

## Test with a Single Model

Let's start by testing with GPT-2 (a small, fast model).

In [None]:
# Load a model (GPT-2 is small and fast for testing).
# The global `model` is reused by the single-model cells that follow.
model = HFModel("gpt2")

In [None]:
# Fingerprint a single nonsense prompt and list each generated token
# next to the probability reported for it.
test_prompt = "8fs234ks2"
fingerprint = get_model_fingerprint(model, test_prompt)

print(
    f"Model: {fingerprint['model']}\n"
    f"Prompt: '{fingerprint['prompt']}'\n"
    f"\nGenerated tokens ({fingerprint['n_tokens']}):"
)
for i, (token, prob) in enumerate(zip(fingerprint['tokens'], fingerprint['probs']), start=1):
    print(f"  {i}. '{token}' (prob: {prob:.4f})")
print(f"\nFull text: '{fingerprint['text']}'")

In [None]:
# Also show the top-5 most likely next tokens (not sampled, just ranked).
# Reuses `model` and `test_prompt` from the cells above; each entry of
# `top_tokens` is unpacked as a (token, probability) pair.
print(f"Top 5 next token predictions for '{test_prompt}':")
top_tokens = get_top_k_next_tokens(model, test_prompt, k=5)
for i, (token, prob) in enumerate(top_tokens, 1):
    print(f"  {i}. '{token}' (prob: {prob:.4f})")

## Test All OOD Prompts

Run all out-of-distribution prompts and collect results.

In [None]:
# Collect fingerprints for all OOD prompts using the already-loaded `model`.
# `fingerprints` keeps the full result dicts (tokens and probabilities) in
# OOD_PROMPTS order; the loop also prints prompt -> generated text as it goes.
fingerprints = []

for prompt in OOD_PROMPTS:
    fp = get_model_fingerprint(model, prompt)
    fingerprints.append(fp)
    print(f"✓ '{prompt}' -> '{fp['text']}'")

In [None]:
# Summarise every fingerprint as one table row: the prompt, the generated
# text, and the mean per-token probability.
df = pd.DataFrame([
    dict(
        prompt=fp['prompt'],
        output=fp['text'],
        # An empty probability list yields 0 instead of dividing by zero
        avg_prob=(sum(fp['probs']) / len(fp['probs'])) if fp['probs'] else 0,
    )
    for fp in fingerprints
])

print(f"\nFingerprints for {model.model_name}:")
display(df)

## Compare Multiple Models

To test uniqueness, we need to compare outputs across different models.

**Note**: Loading multiple models will use significant memory. On Mac, you may want to:
1. Test with smaller models (gpt2, distilgpt2)
2. Load/test/unload models one at a time
3. Use models of similar size for fair comparison

In [None]:
# Models to compare (start with small ones).
# Each name is passed to HFModel(...) by compare_models, one model at a time.
# You can add more models here, but be mindful of memory usage on Mac.
# The similarity example at the end of the notebook needs at least two entries.
MODEL_NAMES = [
    "gpt2",
    "distilgpt2",
    # "gpt2-medium",  # Uncomment if you have enough memory
    # "facebook/opt-125m",  # Another small model
]

In [None]:
def compare_models(model_names: List[str],
                   prompts: List[str],
                   n_tokens: int = 5) -> pd.DataFrame:
    """
    Compare multiple models on the same prompts.

    Models are loaded one at a time; each is released (and accelerator
    caches are emptied) before the next one is loaded, even if
    fingerprinting raises part-way through.

    Args:
        model_names: List of model names to compare (each passed to HFModel)
        prompts: List of prompts to test
        n_tokens: Number of tokens to generate per prompt

    Returns:
        DataFrame with one row per (model, prompt) pair and the columns
        'model', 'prompt', 'output', 'tokens', 'avg_prob'. The columns are
        present even when there are no rows.
    """
    result_columns = ['model', 'prompt', 'output', 'tokens', 'avg_prob']
    results = []

    for model_name in model_names:
        print(f"\n{'='*60}")
        print(f"Testing {model_name}...")
        print('='*60)

        # Load model
        model = HFModel(model_name)

        try:
            # Test each prompt
            for prompt in prompts:
                fp = get_model_fingerprint(model, prompt, n_tokens=n_tokens)
                probs = fp['probs']
                results.append({
                    'model': model_name,
                    'prompt': prompt,
                    'output': fp['text'],
                    'tokens': fp['tokens'],
                    # Empty generations get 0 rather than a ZeroDivisionError
                    'avg_prob': sum(probs) / len(probs) if probs else 0
                })
                print(f"  '{prompt}' -> '{fp['text']}'")
        finally:
            # Clean up to save memory. Drop our reference and collect garbage
            # first, so the cache release below has memory to hand back.
            del model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()

    # Explicit columns keep the schema stable for empty input, so downstream
    # column lookups (e.g. pivot) do not fail with a KeyError.
    return pd.DataFrame(results, columns=result_columns)

In [None]:
# Compare models (this may take a few minutes).
# Only the first three OOD prompts are used to keep the run short.
# `comparison_df` is reused by the pivot and similarity cells below.
comparison_df = compare_models(MODEL_NAMES, OOD_PROMPTS[:3])  # Start with just 3 prompts

print("\n" + "="*60)
print("COMPARISON RESULTS")
print("="*60)
display(comparison_df)

In [None]:
# Pivot table to see outputs side-by-side: one row per prompt, one column
# per model, each cell holding that model's generated text.
# NOTE: DataFrame.pivot raises ValueError if a (prompt, model) pair occurs
# more than once, so MODEL_NAMES and the prompt list must not contain duplicates.
pivot_df = comparison_df.pivot(index='prompt', columns='model', values='output')
print("\nSide-by-side comparison:")
display(pivot_df)

## Test Robustness (Same Model, Multiple Runs)

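To verify robustness, we should test whether the same model produces consistent outputs across repeated runs. Note that the test below uses the default `temperature=0.0` (intended as greedy decoding), so it only checks run-to-run stability of deterministic generation; it says nothing about robustness under sampling.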

In [None]:
def test_robustness(model_name: str,
                    prompt: str,
                    n_runs: int = 5,
                    n_tokens: int = 5) -> pd.DataFrame:
    """
    Fingerprint one prompt repeatedly with a single freshly loaded model.

    Every run calls get_model_fingerprint with the same arguments
    (seed fixed at 42, default temperature).

    Args:
        model_name: Model to test (passed to HFModel)
        prompt: Prompt to test
        n_runs: Number of times to run
        n_tokens: Number of tokens to generate

    Returns:
        DataFrame with one row per run and the columns 'run' (1-based),
        'output' (generated text) and 'tokens' (string form of the token list)
    """
    model = HFModel(model_name)

    def _single_run(run_number: int) -> dict:
        """Generate one fingerprint and shape it into a result row."""
        fp = get_model_fingerprint(model, prompt, n_tokens=n_tokens, seed=42)
        return {
            'run': run_number,
            'output': fp['text'],
            'tokens': str(fp['tokens']),
        }

    # Run numbers are reported 1-based
    return pd.DataFrame([_single_run(number) for number in range(1, n_runs + 1)])

In [None]:
# Test robustness with greedy decoding (temperature=0).
# test_robustness loads its own HFModel instance and fingerprints the same
# prompt n_runs times with get_model_fingerprint's default temperature (0.0).
robustness_df = test_robustness("gpt2", "8fs234ks2", n_runs=5)
print("Robustness test (same model, same prompt, 5 runs):")
display(robustness_df)

# Check if all outputs are identical: exactly one distinct output text
# across all runs means the fingerprint was consistent.
unique_outputs = robustness_df['output'].nunique()
print(f"\n{'✓' if unique_outputs == 1 else '✗'} Consistent: {unique_outputs} unique output(s) across {len(robustness_df)} runs")

## Next Steps

1. **Test more models**: Add different model families (OPT, BLOOM, Llama, etc.)
2. **Analyze uniqueness**: Calculate similarity metrics between fingerprints
3. **Test different architectures**: Compare models of different sizes/architectures
4. **Persistence testing**: Do fingerprints persist across model fine-tuning?
5. **Statistical analysis**: Quantify how unique/robust these fingerprints are

### Potential Metrics
- **Edit distance** between outputs
- **Token overlap** percentage
- **Probability distribution similarity** (KL divergence, Jensen-Shannon)
- **Clustering** to see if model variants cluster together

In [None]:
# Helper functions to calculate simple similarity metrics between two outputs
def _levenshtein(s1: str, s2: str) -> int:
    """Return the Levenshtein edit distance between s1 and s2."""
    # Keep the shorter string on the inner axis so each DP row is as short
    # as possible; the distance is symmetric, so swapping is safe.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    # Two-row dynamic programme: previous_row[j] is the distance between the
    # prefix of s1 processed so far and the first j characters of s2.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        current_row = [i]
        for j, c2 in enumerate(s2):
            insertion = previous_row[j + 1] + 1
            deletion = current_row[j] + 1
            substitution = previous_row[j] + (c1 != c2)
            current_row.append(min(insertion, deletion, substitution))
        previous_row = current_row
    return previous_row[-1]


def calculate_similarity(text1: str, text2: str) -> Dict[str, float]:
    """
    Calculate simple similarity metrics between two texts.

    Args:
        text1: First text
        text2: Second text

    Returns:
        Dictionary with similarity scores:
            'exact_match'     - 1.0 if the texts are equal, else 0.0
            'char_jaccard'    - Jaccard similarity of the two character sets
                                (1.0 when both texts are empty)
            'edit_similarity' - 1 - edit_distance / length of the longer text
                                (1.0 when both texts are empty)
            'edit_distance'   - raw Levenshtein distance (an int)
    """
    # Exact match
    exact_match = 1.0 if text1 == text2 else 0.0

    # Character overlap (Jaccard similarity). Two empty texts are identical,
    # so they score 1.0, in line with exact_match and edit_similarity.
    chars1, chars2 = set(text1), set(text2)
    all_chars = chars1 | chars2
    char_jaccard = len(chars1 & chars2) / len(all_chars) if all_chars else 1.0

    # Levenshtein distance, normalised by the longer text's length
    edit_distance = _levenshtein(text1, text2)
    max_len = max(len(text1), len(text2))
    normalized_edit = 1 - (edit_distance / max_len) if max_len > 0 else 1.0

    return {
        'exact_match': exact_match,
        'char_jaccard': char_jaccard,
        'edit_similarity': normalized_edit,
        'edit_distance': edit_distance
    }

In [None]:
# Example: score how similar two models' outputs are for the first OOD prompt.
# Needs `comparison_df` from the model-comparison cell and is skipped unless
# at least two models were configured and produced rows for that prompt.

if len(MODEL_NAMES) >= 2:
    # Select every model's row for one shared prompt
    prompt = OOD_PROMPTS[0]
    outputs = comparison_df.loc[comparison_df['prompt'] == prompt]

    if len(outputs) >= 2:
        # Compare the first two rows for this prompt
        model1, text1 = outputs.iloc[0]['model'], outputs.iloc[0]['output']
        model2, text2 = outputs.iloc[1]['model'], outputs.iloc[1]['output']

        sim = calculate_similarity(text1, text2)

        print(
            f"Similarity between {model1} and {model2} on prompt '{prompt}':\n"
            f"  Output 1: '{text1}'\n"
            f"  Output 2: '{text2}'\n"
            "\nMetrics:"
        )
        for metric, score in sim.items():
            print(f"  {metric}: {score:.3f}")