In [2]:
import sys

sys.path.append("..")

In [3]:
import torch
import time
import matplotlib.pyplot as plt
from model.long import LongConfig, LongForCausalLM
from transformers import GPT2Config, GPT2LMHeadModel

def benchmark():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Benchmarking on {device}...")

    # 1. Setup Your Model (Linear Attention)
    my_config = LongConfig(
        vocab_size=50304,
        hidden_size=768,
        num_hidden_layers=18, # Increased depth
        num_heads=12,
        expansion_ratio=8/3,   # Ensures intermediate_size = 2048
        conv_kernel=4,
        hybrid_ratio=0         # Pure Linear Attention (fastest)
    )
    my_model = LongForCausalLM(my_config).to(device)
    my_model.eval()

    # 2. Setup Standard Transformer (GPT-2)
    gpt_config = GPT2Config(
        vocab_size=50257,
        n_embd=256,
        n_layer=4,
        n_head=8,
        n_positions=2048
    )
    gpt_model = GPT2LMHeadModel(gpt_config).to(device)
    gpt_model.eval()

    # 3. The Race
    lengths = [100, 500, 1000, 2000]
    my_times = []
    gpt_times = []

    print("\n--- üèéÔ∏è Starting Race (Generation Speed) ---")
    print(f"{'Seq Len':<10} | {'Yours (ms)':<15} | {'GPT-2 (ms)':<15} | {'Speedup':<10}")
    print("-" * 60)

    for seq_len in lengths:
        # Create a prompt of size 'seq_len'
        input_ids = torch.randint(0, 50000, (1, seq_len)).to(device)
        
        # --- Benchmark Yours (Recurrent Step) ---
        # Warmup
        with torch.no_grad():
            _ = my_model(input_ids)
            
        torch.cuda.synchronize()
        start = time.time()
        
        # Generate 10 tokens
        # For your model, we simulate the Recurrent State update 10 times
        # (This is how you would use it in production)
        curr_input = input_ids[:, -1:]
        past_kv = None
        
        # Pre-fill state (the "prompt processing")
        with torch.no_grad():
             out = my_model(input_ids)
             past_kv = out.past_key_values
        
        # Generate loop
        for _ in range(10):
            with torch.no_grad():
                out = my_model(curr_input, past_key_values=past_kv)
                past_kv = out.past_key_values
                curr_input = torch.argmax(out.logits, dim=-1)

        torch.cuda.synchronize()
        my_time = (time.time() - start) * 1000 # ms
        my_times.append(my_time)

        # --- Benchmark GPT-2 (KV Cache) ---
        # GPT-2 uses KV caching, but attention matrix still grows
        with torch.no_grad():
             _ = gpt_model(input_ids)
             
        torch.cuda.synchronize()
        start = time.time()
        
        # We use generate() which handles KV cache efficiently
        with torch.no_grad():
            gpt_model.generate(input_ids, max_new_tokens=10, do_sample=False)
            
        torch.cuda.synchronize()
        gpt_time = (time.time() - start) * 1000
        gpt_times.append(gpt_time)

        print(f"{seq_len:<10} | {my_time:.2f}          | {gpt_time:.2f}          | {gpt_time/my_time:.2f}x")

    print("\n‚úÖ Benchmark Complete.")

if __name__ == "__main__":
    benchmark()

Benchmarking on cuda...

--- üèéÔ∏è Starting Race (Generation Speed) ---
Seq Len    | Yours (ms)      | GPT-2 (ms)      | Speedup   
------------------------------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


100        | 280.00          | 158.13          | 0.56x


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


500        | 200.41          | 82.44          | 0.41x


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1000       | 253.22          | 84.63          | 0.33x


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2000       | 314.23          | 94.74          | 0.30x

‚úÖ Benchmark Complete.


In [4]:
import torch
from model.long import LongConfig, LongForCausalLM  # Ensure this matches your file name

def count_parameters(model):
    """Counts valid trainable parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def test_model_size():
    print("--- üß™ Testing Model Configuration (Target: ~187M) ---")

    # 1. Instantiate Config
    config = LongConfig(
        vocab_size=50304,
        hidden_size=768,
        num_hidden_layers=18, # Increased depth
        num_heads=12,
        expansion_ratio=8/3,   # Ensures intermediate_size = 2048
        conv_kernel=4,
        hybrid_ratio=0         # Pure Linear Attention (fastest)
    )

    print(f"Config: L={config.num_hidden_layers}, H={config.hidden_size}, Vocab={config.vocab_size}")

    # 2. Instantiate Model
    print("Instantiating model... ", end="")
    try:
        model = LongForCausalLM(config)
        print("‚úÖ Success!")
    except ImportError:
        print("\n‚ùå Error: Could not import LongForCausalLM. Make sure modeling_long.py is accessible.")
        return
    except Exception as e:
        print(f"\n‚ùå Error initializing model: {e}")
        return

    # 3. Calculate Parameters
    total_params = count_parameters(model)
    
    # Calculate Embedding vs Non-Embedding params
    # (Useful to know how much compute goes to "thinking" vs "looking up words")
    embed_params = config.vocab_size * config.hidden_size
    non_embed_params = total_params - embed_params

    print("\n--- üìä Parameter Breakdown ---")
    print(f"Total Parameters:      {total_params:,} ({total_params/1e6:.2f}M)")
    print(f"Embedding Parameters:  {embed_params:,} ({embed_params/1e6:.2f}M)")
    print(f"Layer Parameters:      {non_embed_params:,} ({non_embed_params/1e6:.2f}M)")
    
    # 4. Verification
    target = 187_000_000
    diff = abs(total_params - target)
    print(f"\nDifference from 187M:  {diff:,} params")
    
    if 180_000_000 <= total_params <= 195_000_000:
        print("‚úÖ Result: PERFECT MATCH for the 187M category.")
    else:
        print("‚ö†Ô∏è Result: Slightly off target (adjust layers/hidden_size if needed).")

if __name__ == "__main__":
    test_model_size()

--- üß™ Testing Model Configuration (Target: ~187M) ---
Config: L=18, H=768, Vocab=50304
Instantiating model... ‚úÖ Success!

--- üìä Parameter Breakdown ---
Total Parameters:      187,671,768 (187.67M)
Embedding Parameters:  38,633,472 (38.63M)
Layer Parameters:      149,038,296 (149.04M)

Difference from 187M:  671,768 params
‚úÖ Result: PERFECT MATCH for the 187M category.
