In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.nn.functional as F


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",         
    quantization_config=bnb_config
)




In [None]:
prompt = """USER: Write a Python function that implements a graph-based pathfinding algorithm to find the shortest path between two nodes in a weighted, undirected graph, considering both distance and cost constraints. The function should:
1. Take a graph (as an adjacency list with weights and costs), start node, end node, and a maximum cost budget as input.
2. Return the shortest path (list of nodes) that respects the cost budget, or None if no valid path exists.
3. Use Dijkstra’s algorithm with a modified priority queue to account for both distance and cost.
4. Include error handling for invalid inputs (e.g., negative weights, non-existent nodes).
5. Provide an example usage with a graph containing at least 5 nodes, demonstrating both a successful path and a case where no path fits the budget.

Example graph format:
graph = {
    'A': {'B': (2, 5), 'C': (4, 10)},  
    'B': {'A': (2, 5), 'D': (3, 8)},
    'C': {'A': (4, 10), 'D': (1, 3), 'E': (5, 12)},
    'D': {'B': (3, 8), 'C': (1, 3), 'E': (2, 6)},
    'E': {'C': (5, 12), 'D': (2, 6)}.
    
}
AI response::
"""
messages = [
        {"role": "user", "content": user_message}
]
    
    
formatted_prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True  
)
    
print("Formatted prompt:")
print(repr(formatted_prompt[:200] + "..." + formatted_prompt[-100:]))
print("\n")
    
    
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)


generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
new_tokens = generated_ids[0][inputs['input_ids'].shape[1]:]
response = tokenizer.decode(new_tokens, skip_special_tokens=True)

print("GENERATED RESPONSE:")
print("="*50)
print(response)
print("="*50)
print(f"Response length: {len(response)} characters")

def calculate_uncertainty_metrics(logits, method='entropy', top_k=1000):
    """
    Calculate various uncertainty metrics from model logits
    
    Args:
        logits: Raw logits from model (batch_size, seq_len, vocab_size)
        method: 'entropy', 'max_prob', 'top_k_ratio', or 'all'
        top_k: Number of top tokens to consider (reduces computation)
    
    Returns:
        Dictionary of uncertainty metrics
    """
    
    
    if torch.isnan(logits).any() or torch.isinf(logits).any():
        print("Warning: Found NaN/Inf in logits, replacing with large negative values")
        logits = torch.where(torch.isnan(logits) | torch.isinf(logits), 
                           torch.full_like(logits, -1e6), logits)
    
    
    if top_k < logits.size(-1):
        top_k_logits, top_k_indices = torch.topk(logits, top_k, dim=-1)
        working_logits = top_k_logits
    else:
        working_logits = logits
    
    
    working_logits = working_logits.float()
    
    
    log_probs = F.log_softmax(working_logits, dim=-1)
    probs = torch.exp(log_probs)
    
    
    epsilon = 1e-8
    probs = probs + epsilon
    probs = probs / probs.sum(dim=-1, keepdim=True)
    
    
    prob_sums = probs.sum(dim=-1)
    if not torch.allclose(prob_sums, torch.ones_like(prob_sums), atol=1e-6):
        print(f"Warning: Probability sums range from {prob_sums.min():.6f} to {prob_sums.max():.6f}")
        
        probs = probs / prob_sums.unsqueeze(-1)
    
    metrics = {}
    
    if method in ['entropy', 'all']:
        
        entropy_per_token = -(probs * torch.log(probs + epsilon)).sum(dim=-1)
        metrics['entropy'] = {
            'per_token': entropy_per_token,
            'mean': entropy_per_token.mean().item(),
            'std': entropy_per_token.std().item(),
            'max': entropy_per_token.max().item(),
            'min': entropy_per_token.min().item()
        }
    
    if method in ['max_prob', 'all']:
        
        max_probs, _ = torch.max(probs, dim=-1)
        uncertainty_from_max = 1.0 - max_probs  
        metrics['max_prob_uncertainty'] = {
            'per_token': uncertainty_from_max,
            'mean': uncertainty_from_max.mean().item(),
            'std': uncertainty_from_max.std().item()
        }
    
    if method in ['top_k_ratio', 'all']:
        
        k_for_concentration = min(5, probs.size(-1))
        top_k_probs, _ = torch.topk(probs, k_for_concentration, dim=-1)
        concentration = top_k_probs.sum(dim=-1)
        uncertainty_from_concentration = 1.0 - concentration
        metrics['top_k_concentration'] = {
            'per_token': uncertainty_from_concentration,
            'mean': uncertainty_from_concentration.mean().item(),
            'std': uncertainty_from_concentration.std().item()
        }
    
    if method in ['variance', 'all']:
        
        prob_variance = torch.var(probs, dim=-1)
        metrics['probability_variance'] = {
            'per_token': prob_variance,
            'mean': prob_variance.mean().item(),
            'std': prob_variance.std().item()
        }
    
    return metrics


print("\n" + "="*50)
print("UNCERTAINTY ANALYSIS")
print("="*50)

with torch.no_grad():
    
    outputs = model(**inputs, return_dict=True)
    logits = outputs.logits
    
    print(f"Logits shape: {logits.shape}")
    print(f"Vocab size: {logits.shape[-1]}")
    
    
    uncertainty_metrics = calculate_uncertainty_metrics(logits, method='all', top_k=1000)
    
    for metric_name, metric_data in uncertainty_metrics.items():
        print(f"\n{metric_name.upper()}:")
        print(f"  Mean: {metric_data['mean']:.4f}")
        if 'std' in metric_data:
            print(f"  Std:  {metric_data['std']:.4f}")
        if 'max' in metric_data and 'min' in metric_data:
            print(f"  Range: [{metric_data['min']:.4f}, {metric_data['max']:.4f}]")

print(f"\n" + "="*50)
print("MEMORY USAGE")
print("="*50)
print(f"Allocated VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved VRAM: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")


Generated answer:
USER: Write a Python function that implements a graph-based pathfinding algorithm to find the shortest path between two nodes in a weighted, undirected graph, considering both distance and cost constraints. The function should:
1. Take a graph (as an adjacency list with weights and costs), start node, end node, and a maximum cost budget as input.
2. Return the shortest path (list of nodes) that respects the cost budget, or None if no valid path exists.
3. Use Dijkstra’s algorithm with a modified...

UNCERTAINTY ANALYSIS
Logits shape: torch.Size([1, 316, 152064])
Vocab size: 152064

ENTROPY:
  Mean: 0.6689
  Std:  0.8497
  Range: [0.0002, 5.0954]

MAX_PROB_UNCERTAINTY:
  Mean: 0.2066
  Std:  0.2541

TOP_K_CONCENTRATION:
  Mean: 0.0395
  Std:  0.0927

PROBABILITY_VARIANCE:
  Mean: 0.0007
  Std:  0.0003

MEMORY USAGE
Allocated VRAM: 8.44 GB
Reserved VRAM: 10.46 GB


In [14]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Your existing setup...
model_name = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
)

# Check what chat template the model expects
print("Model's default chat template:")
print(repr(tokenizer.chat_template))
print("\n" + "="*60 + "\n")

# CORRECT WAY: Use proper chat template format
user_message = """Write a Python function that implements a graph-based pathfinding algorithm to find the shortest path between two nodes in a weighted, undirected graph, considering both distance and cost constraints. The function should:

1. Take a graph (as an adjacency list with weights and costs), start node, end node, and a maximum cost budget as input.
2. Return the shortest path (list of nodes) that respects the cost budget, or None if no valid path exists.
3. Use Dijkstra's algorithm with a modified priority queue to account for both distance and cost.
4. Include error handling for invalid inputs (e.g., negative weights, non-existent nodes).
5. Provide an example usage with a graph containing at least 5 nodes, demonstrating both a successful path and a case where no path fits the budget.

Example graph format:
```python
graph = {
    'A': {'B': (2, 5), 'C': (4, 10)},  # (distance, cost)
    'B': {'A': (2, 5), 'D': (3, 8)},
    'C': {'A': (4, 10), 'D': (1, 3), 'E': (5, 12)},
    'D': {'B': (3, 8), 'C': (1, 3), 'E': (2, 6)},
    'E': {'C': (5, 12), 'D': (2, 6)}
}
```"""

# Method 1: Use the built-in chat template (RECOMMENDED)
print("METHOD 1: Using tokenizer.apply_chat_template()")
try:
    messages = [
        {"role": "user", "content": user_message}
    ]
    
    # Apply chat template and tokenize
    formatted_prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True  # This adds the assistant's turn
    )
    
    print("Formatted prompt:")
    print(repr(formatted_prompt[:200] + "..." + formatted_prompt[-100:]))
    print("\n")
    
    # Tokenize the formatted prompt
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
    # Generate with proper parameters
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only the new tokens (exclude the input)
    new_tokens = generated_ids[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    print("GENERATED RESPONSE:")
    print("="*50)
    print(response)
    print("="*50)
    print(f"Response length: {len(response)} characters")
    
    # Store this for uncertainty analysis
    successful_inputs = inputs
    successful_generated = generated_ids
    
except Exception as e:
    print(f"Method 1 failed: {e}")
    print(f"Error type: {type(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "-"*60 + "\n")

# Method 2: Manual template formatting (fallback)
print("METHOD 2: Manual template formatting")
try:
    # Qwen2.5 typically uses this format
    manual_prompt = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
    
    print("Manual prompt format:")
    print(repr(manual_prompt[:200] + "..." + manual_prompt[-50:]))
    print("\n")
    
    inputs = tokenizer(manual_prompt, return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only the new tokens
    new_tokens = generated_ids[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    print("MANUAL TEMPLATE RESPONSE:")
    print("="*50)
    print(response)
    print("="*50)
    print(f"Response length: {len(response)} characters")
    
except Exception as e:
    print(f"Method 2 failed: {e}")

print("\n" + "-"*60 + "\n")

# Method 3: Check tokenizer special tokens
print("METHOD 3: Debugging tokenizer info")
print(f"Special tokens: {tokenizer.special_tokens_map}")
print(f"BOS token: {repr(tokenizer.bos_token)} (ID: {tokenizer.bos_token_id})")
print(f"EOS token: {repr(tokenizer.eos_token)} (ID: {tokenizer.eos_token_id})")
print(f"UNK token: {repr(tokenizer.unk_token)} (ID: {tokenizer.unk_token_id})")
print(f"PAD token: {repr(tokenizer.pad_token)} (ID: {tokenizer.pad_token_id})")

# Check what tokens are being added
test_messages = [{"role": "user", "content": "Hello"}]
test_formatted = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
print(f"\nTest chat template result:")
print(repr(test_formatted))

# Now do uncertainty analysis on the successful generation
if 'successful_inputs' in locals() and 'successful_generated' in locals():
    print("\n" + "="*60)
    print("UNCERTAINTY ANALYSIS")
    print("="*60)
    
    with torch.no_grad():
        # Get logits for the successful generation
        outputs = model(**successful_inputs, return_dict=True)
        logits = outputs.logits
        
        print(f"Logits shape: {logits.shape}")
        
        # Calculate entropy using the corrected method from before
        def calculate_entropy_safe(logits, top_k=1000):
            # Use top-k to reduce computation
            if top_k < logits.size(-1):
                top_k_logits, _ = torch.topk(logits, top_k, dim=-1)
                working_logits = top_k_logits.float()
            else:
                working_logits = logits.float()
            
            # Calculate probabilities safely
            log_probs = F.log_softmax(working_logits, dim=-1)
            probs = torch.exp(log_probs)
            
            # Add epsilon and normalize
            epsilon = 1e-8
            probs = probs + epsilon
            probs = probs / probs.sum(dim=-1, keepdim=True)
            
            # Manual entropy calculation
            entropy_per_token = -(probs * torch.log(probs + epsilon)).sum(dim=-1)
            return entropy_per_token
        
        entropy_per_token = calculate_entropy_safe(logits)
        avg_entropy = entropy_per_token.mean().item()
        max_entropy = entropy_per_token.max().item()
        min_entropy = entropy_per_token.min().item()
        
        print(f"Average entropy: {avg_entropy:.4f}")
        print(f"Max entropy: {max_entropy:.4f}")
        print(f"Min entropy: {min_entropy:.4f}")
        
        # Show entropy pattern over the sequence
        if entropy_per_token.numel() > 1:
            print(f"Entropy trend (first 10): {entropy_per_token[0, :10].tolist()}")
            print(f"Entropy trend (last 10): {entropy_per_token[0, -10:].tolist()}")

print(f"\nMemory usage:")
print(f"Allocated VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved VRAM: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")



Model's default chat template:
'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Your existing setup...
model_name = "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
)

# Check what chat template the model expects
print("Model's default chat template:")
print(repr(tokenizer.chat_template))
print("\n" + "="*60 + "\n")

# CORRECT WAY: Use proper chat template format
user_message = """Write a Python function that implements a graph-based pathfinding algorithm to find the shortest path between two nodes in a weighted, undirected graph, considering both distance and cost constraints. The function should:

1. Take a graph (as an adjacency list with weights and costs), start node, end node, and a maximum cost budget as input.
2. Return the shortest path (list of nodes) that respects the cost budget, or None if no valid path exists.
3. Use Dijkstra's algorithm with a modified priority queue to account for both distance and cost.
4. Include error handling for invalid inputs (e.g., negative weights, non-existent nodes).
5. Provide an example usage with a graph containing at least 5 nodes, demonstrating both a successful path and a case where no path fits the budget.

Example graph format:
```python
graph = {
    'A': {'B': (2, 5), 'C': (4, 10)},  # (distance, cost)
    'B': {'A': (2, 5), 'D': (3, 8)},
    'C': {'A': (4, 10), 'D': (1, 3), 'E': (5, 12)},
    'D': {'B': (3, 8), 'C': (1, 3), 'E': (2, 6)},
    'E': {'C': (5, 12), 'D': (2, 6)}
}
```"""

# Method 1: Use the built-in chat template (RECOMMENDED)
print("METHOD 1: Using tokenizer.apply_chat_template()")
try:
    messages = [
        {"role": "user", "content": user_message}
    ]
    
    # Apply chat template and tokenize
    formatted_prompt = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True  # This adds the assistant's turn
    )
    
    print("Formatted prompt:")
    print(repr(formatted_prompt[:200] + "..." + formatted_prompt[-100:]))
    print("\n")
    
    # Tokenize the formatted prompt
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    
    # Generate with proper parameters
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only the new tokens (exclude the input)
    new_tokens = generated_ids[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    print("GENERATED RESPONSE:")
    print("="*50)
    print(response)
    print("="*50)
    print(f"Response length: {len(response)} characters")
    
    # Store this for uncertainty analysis
    successful_inputs = inputs
    successful_generated = generated_ids
    
except Exception as e:
    print(f"Method 1 failed: {e}")
    print(f"Error type: {type(e)}")
    import traceback
    traceback.print_exc()

print("\n" + "-"*60 + "\n")

# Method 2: Manual template formatting (fallback)
print("METHOD 2: Manual template formatting")
try:
    # Qwen2.5 typically uses this format
    manual_prompt = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
    
    print("Manual prompt format:")
    print(repr(manual_prompt[:200] + "..." + manual_prompt[-50:]))
    print("\n")
    
    inputs = tokenizer(manual_prompt, return_tensors="pt").to(model.device)
    
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    
    # Decode only the new tokens
    new_tokens = generated_ids[0][inputs['input_ids'].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    
    print("MANUAL TEMPLATE RESPONSE:")
    print("="*50)
    print(response)
    print("="*50)
    print(f"Response length: {len(response)} characters")
    
except Exception as e:
    print(f"Method 2 failed: {e}")

print("\n" + "-"*60 + "\n")

# Method 3: Check tokenizer special tokens
print("METHOD 3: Debugging tokenizer info")
print(f"Special tokens: {tokenizer.special_tokens_map}")
print(f"BOS token: {repr(tokenizer.bos_token)} (ID: {tokenizer.bos_token_id})")
print(f"EOS token: {repr(tokenizer.eos_token)} (ID: {tokenizer.eos_token_id})")
print(f"UNK token: {repr(tokenizer.unk_token)} (ID: {tokenizer.unk_token_id})")
print(f"PAD token: {repr(tokenizer.pad_token)} (ID: {tokenizer.pad_token_id})")

# Check what tokens are being added
test_messages = [{"role": "user", "content": "Hello"}]
test_formatted = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
print(f"\nTest chat template result:")
print(repr(test_formatted))

# Now do uncertainty analysis on the successful generation
if 'successful_inputs' in locals() and 'successful_generated' in locals():
    print("\n" + "="*60)
    print("UNCERTAINTY ANALYSIS")
    print("="*60)
    
    with torch.no_grad():
        # Get logits for the successful generation
        outputs = model(**successful_inputs, return_dict=True)
        logits = outputs.logits
        
        print(f"Logits shape: {logits.shape}")
        
        # Calculate entropy using the corrected method from before
        def calculate_entropy_safe(logits, top_k=1000):
            # Use top-k to reduce computation
            if top_k < logits.size(-1):
                top_k_logits, _ = torch.topk(logits, top_k, dim=-1)
                working_logits = top_k_logits.float()
            else:
                working_logits = logits.float()
            
            # Calculate probabilities safely
            log_probs = F.log_softmax(working_logits, dim=-1)
            probs = torch.exp(log_probs)
            
            # Add epsilon and normalize
            epsilon = 1e-8
            probs = probs + epsilon
            probs = probs / probs.sum(dim=-1, keepdim=True)
            
            # Manual entropy calculation
            entropy_per_token = -(probs * torch.log(probs + epsilon)).sum(dim=-1)
            return entropy_per_token
        
        entropy_per_token = calculate_entropy_safe(logits)
        avg_entropy = entropy_per_token.mean().item()
        max_entropy = entropy_per_token.max().item()
        min_entropy = entropy_per_token.min().item()
        
        print(f"Average entropy: {avg_entropy:.4f}")
        print(f"Max entropy: {max_entropy:.4f}")
        print(f"Min entropy: {min_entropy:.4f}")
        
        # Show entropy pattern over the sequence
        if entropy_per_token.numel() > 1:
            print(f"Entropy trend (first 10): {entropy_per_token[0, :10].tolist()}")
            print(f"Entropy trend (last 10): {entropy_per_token[0, -10:].tolist()}")

print(f"\nMemory usage:")
print(f"Allocated VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"Reserved VRAM: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.93s/it]


Model's default chat template:
'{%- if tools %}\n    {{- \'<|im_start|>system\\n\' }}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- messages[0][\'content\'] }}\n    {%- else %}\n        {{- \'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\' }}\n    {%- endif %}\n    {{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>" }}\n    {%- for tool in tools %}\n        {{- "\\n" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- "\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\"name\\": <function-name>, \\"arguments\\": <args-json-object>}\\n</tool_call><|im_end|>\\n" }}\n{%- else %}\n    {%- if messages[0][\'role\'] == \'system\' %}\n        {{- \'<|im_start|>system\\n\' + messages[0][\'content\'] + \'<|im_end|>\\n