In [1]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import json
import os

In [2]:
def calculate_entropy_metrics(output):
    """Summarise the per-step Shannon entropy (in nats) of a generation.

    For every entry of ``output.scores`` the first row (``score[0]``) is turned
    into a probability distribution with softmax and its entropy is computed as
    ``-sum(p * log(p + 1e-8))``.

    Args:
        output: Object exposing a ``scores`` sequence of tensors; only row 0 of
            each tensor is read (presumably the single batch item of a
            ``generate`` call -- confirm against the caller).

    Returns:
        dict with ``mean_entropy``, ``max_entropy``, ``min_entropy`` and
        ``std_entropy``. All values are 0.0 when ``output.scores`` is empty.
    """
    if not output.scores:
        return {'mean_entropy': 0.0, 'max_entropy': 0.0, 'min_entropy': 0.0, 'std_entropy': 0.0}

    entropies = []
    for score in output.scores:
        logits = score[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so a masked (-inf)
        # logit would produce 0 * log(0) = NaN. Upcast reduced-precision
        # scores; float32/float64 inputs are left untouched.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)
        entropies.append(-(probs * torch.log(probs + 1e-8)).sum().item())

    return {
        'mean_entropy': np.mean(entropies),
        'max_entropy': np.max(entropies),
        'min_entropy': np.min(entropies),
        'std_entropy': np.std(entropies)
    }

def calculate_confidence_metrics(output):
    """Summarise the highest softmax probability of each generation step.

    Args:
        output: Object exposing a ``scores`` sequence of tensors; row 0 of each
            tensor is softmax-normalised and its maximum is recorded.

    Returns:
        dict with ``mean_confidence``, ``min_confidence`` and
        ``std_confidence``; all 0.0 when ``output.scores`` is empty.
    """
    if not output.scores:
        return {'mean_confidence': 0.0, 'min_confidence': 0.0, 'std_confidence': 0.0}

    # One peak probability per step.
    peak_probs = [
        F.softmax(step_scores[0], dim=-1).max().item()
        for step_scores in output.scores
    ]

    return {
        'mean_confidence': np.mean(peak_probs),
        'min_confidence': np.min(peak_probs),
        'std_confidence': np.std(peak_probs)
    }

def calculate_concentration_metrics(output):
    """Measure how much probability mass sits on the top-5 / top-10 tokens.

    Args:
        output: Object exposing a ``scores`` sequence of tensors; row 0 of each
            tensor is softmax-normalised and sorted in descending order.

    Returns:
        dict with ``mean_top5_conc``, ``mean_top10_conc`` and ``min_top5_conc``;
        all 0.0 when ``output.scores`` is empty.
    """
    if not output.scores:
        return {'mean_top5_conc': 0.0, 'mean_top10_conc': 0.0, 'min_top5_conc': 0.0}

    mass_top5, mass_top10 = [], []
    for step_scores in output.scores:
        ranked = torch.sort(F.softmax(step_scores[0], dim=-1), descending=True).values
        # Slicing tolerates vocabularies smaller than 5 / 10 entries.
        mass_top5.append(ranked[:5].sum().item())
        mass_top10.append(ranked[:10].sum().item())

    return {
        'mean_top5_conc': np.mean(mass_top5),
        'mean_top10_conc': np.mean(mass_top10),
        'min_top5_conc': np.min(mass_top5)
    }

def calculate_sequence_metrics(output):
    """Compare the first and last generation step of a sequence.

    Entropy (``-sum(p * log(p + 1e-8))``) and confidence (max softmax
    probability) are computed for row 0 of ``output.scores[0]`` and
    ``output.scores[-1]``; the trends are ``last - first``.

    Args:
        output: Object exposing a ``scores`` sequence of tensors.

    Returns:
        dict with ``sequence_length`` (``len(output.scores)``),
        ``entropy_trend`` and ``confidence_trend``; zeros when
        ``output.scores`` is empty. A single-step sequence yields trends of 0.
    """
    if not output.scores:
        return {'sequence_length': 0, 'entropy_trend': 0.0, 'confidence_trend': 0.0}

    endpoints = []
    for step_scores in (output.scores[0], output.scores[-1]):
        logits = step_scores[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so masked (-inf)
        # logits would turn the entropy into NaN. Upcast reduced precision only.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)
        step_entropy = -(probs * torch.log(probs + 1e-8)).sum().item()
        step_confidence = torch.max(probs).item()
        endpoints.append((step_entropy, step_confidence))

    (first_entropy, first_confidence), (last_entropy, last_confidence) = endpoints

    return {
        'sequence_length': len(output.scores),
        'entropy_trend': last_entropy - first_entropy,
        'confidence_trend': last_confidence - first_confidence
    }

def calculate_distribution_metrics(output):
    """Report the 25th / 75th percentiles of per-step entropy and confidence.

    Args:
        output: Object exposing a ``scores`` sequence of tensors; row 0 of each
            tensor is softmax-normalised before entropy
            (``-sum(p * log(p + 1e-8))``) and confidence (max probability) are
            taken.

    Returns:
        dict with ``entropy_p25``, ``entropy_p75``, ``confidence_p25`` and
        ``confidence_p75``; all 0.0 when ``output.scores`` is empty.
    """
    if not output.scores:
        return {'entropy_p25': 0.0, 'entropy_p75': 0.0, 'confidence_p25': 0.0, 'confidence_p75': 0.0}

    entropies = []
    confidences = []

    for score in output.scores:
        logits = score[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so masked (-inf)
        # logits would turn the entropy into NaN. Upcast reduced precision only.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)

        entropies.append(-(probs * torch.log(probs + 1e-8)).sum().item())
        confidences.append(torch.max(probs).item())

    return {
        'entropy_p25': np.percentile(entropies, 25),
        'entropy_p75': np.percentile(entropies, 75),
        'confidence_p25': np.percentile(confidences, 25),
        'confidence_p75': np.percentile(confidences, 75)
    }


In [None]:
def extract_all_metrics(output):
    """Merge the results of every metric calculator into one flat dict.

    The calculators run in a fixed order (entropy, confidence, concentration,
    sequence, distribution); each returns a dict that is merged into the
    result, so the key order of the result follows that sequence.
    """
    calculators = (
        calculate_entropy_metrics,
        calculate_confidence_metrics,
        calculate_concentration_metrics,
        calculate_sequence_metrics,
        calculate_distribution_metrics,
    )

    combined = {}
    for calculate in calculators:
        combined.update(calculate(output))
    return combined

In [4]:
def create_qwen_prompt(instruction):
    """Build the chat messages (system + user) for one coding problem.

    Args:
        instruction: Problem statement; interpolated verbatim at the end of
            the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts.
    """
    system_message = {
        "role": "system",
        "content": "You are a skilled Python programmer. Write clean, working code without comments or explanations. Focus only on solving the problem correctly.",
    }
    user_message = {
        "role": "user",
        "content": f"Write a Python function to solve this problem. Provide only the function code without comments or explanations:\n\n{instruction}",
    }
    return [system_message, user_message]

In [None]:
def extract_code_from_response(response):
    """Pull the Python source out of a model reply.

    Strategies, in order:
      1. Content of the first closed fenced block opened with "```python".
      2. Content of the first closed fenced block opened with "```".
      3. Everything after an opening fence that is never closed (e.g. the
         reply was cut off by the token limit). Previously this case fell
         through to the ``def`` scan, which drops imports and helpers that
         precede the first ``def``.
      4. All lines from the first line starting with ``def `` to the end.
      5. The stripped response itself.

    Args:
        response: Decoded model output (str).

    Returns:
        The extracted code, stripped of surrounding whitespace.
    """
    fence = "```"
    openers = ("```python", fence)

    # 1 + 2: properly closed fenced blocks.
    for opener in openers:
        opened_at = response.find(opener)
        if opened_at == -1:
            continue
        start = opened_at + len(opener)
        end = response.find(fence, start)
        if end != -1:
            return response[start:end].strip()

    # 3: opening fence without a closing one -- keep the whole tail.
    for opener in openers:
        opened_at = response.find(opener)
        if opened_at != -1:
            tail = response[opened_at + len(opener):].strip()
            if tail:
                return tail
            break

    # 4: no usable fence; keep everything from the first function definition.
    lines = response.split('\n')
    for line_no, line in enumerate(lines):
        if line.strip().startswith('def '):
            return '\n'.join(lines[line_no:]).strip()

    # 5: nothing recognisable -- hand back the raw text.
    return response.strip()

In [None]:
def generate_single_response(model, tokenizer, instruction, seq_id):
    """Sample one completion for ``instruction`` and attach uncertainty metrics.

    The chat prompt is rendered with the tokenizer's chat template, moved to
    the device of the model's first parameter, and sampled
    (temperature 0.7, top-p 0.9, at most 200 new tokens) with per-step scores
    requested. Any exception is caught, printed, and converted into a
    placeholder record.

    NOTE(review): the metrics are computed from ``output.scores``; with
    sampling enabled these are presumably the post-processed
    (temperature / top-p filtered) scores rather than raw logits -- confirm
    against the transformers documentation.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        instruction: Problem statement forwarded to ``create_qwen_prompt``.
        seq_id: Identifier copied into the result.

    Returns:
        dict with ``seq_id``, ``instruction``, ``generated_text``,
        ``generated_code``, ``prompt`` and the metric keys. On failure the text
        fields are empty, an ``error`` key holds the message, and every metric
        is 0.0.
    """
    try:
        prompt = tokenizer.apply_chat_template(
            create_qwen_prompt(instruction),
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        target_device = next(model.parameters()).device
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

        sampling_options = dict(
            max_new_tokens=200,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
        with torch.no_grad():
            output = model.generate(**inputs, **sampling_options)

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs['input_ids'].shape[1]
        generated_text = tokenizer.decode(
            output.sequences[0][prompt_length:],
            skip_special_tokens=True,
        )

        result = {
            'seq_id': seq_id,
            'instruction': instruction,
            'generated_text': generated_text,
            'generated_code': extract_code_from_response(generated_text),
            'prompt': prompt,
        }
        result.update(extract_all_metrics(output))

        # Release the generation tensors before the next problem.
        del output, inputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return result

    except Exception as e:
        print(f"Error generating for {seq_id}: {e}")

        failure = {
            'seq_id': seq_id,
            'instruction': instruction,
            'generated_text': '',
            'generated_code': '',
            'prompt': '',
            'error': str(e),
        }

        # Same metric columns as a successful record, zero-filled.
        zeroed_metrics = (
            'mean_entropy', 'max_entropy', 'min_entropy', 'std_entropy',
            'mean_confidence', 'min_confidence', 'std_confidence',
            'mean_top5_conc', 'mean_top10_conc', 'min_top5_conc',
            'sequence_length', 'entropy_trend', 'confidence_trend',
            'entropy_p25', 'entropy_p75', 'confidence_p25', 'confidence_p75',
        )
        for metric in zeroed_metrics:
            failure[metric] = 0.0

        return failure

In [None]:
def generate_code_with_metrics(csv_path, model, tokenizer, output_dir="generation_results", save_every=50):
    """Generate code for every problem in a CSV and record uncertainty metrics.

    Args:
        csv_path: CSV with the columns ``seq_id``, ``instruction``, ``output``,
            ``code``, ``entry_point`` and ``testcase``.
        model: Forwarded to ``generate_single_response``.
        tokenizer: Forwarded to ``generate_single_response``.
        output_dir: Directory for checkpoints and the final CSV (created if
            missing).
        save_every: A CSV + JSON checkpoint of all results so far is written
            every ``save_every`` problems and after the last one.

    Returns:
        DataFrame with one row per problem, also written to
        ``<output_dir>/all_generations.csv``.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Loading dataset...")
    df = pd.read_csv(csv_path)
    total = len(df)
    print(f"Loaded {total} problems")

    results = []

    # Count rows positionally so progress output and checkpoint names do not
    # depend on the DataFrame's index labels.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"\nGenerating {position}/{total}: {row['seq_id']}")

        result = generate_single_response(
            model, tokenizer, row['instruction'], row['seq_id']
        )

        # Carry the reference columns along for the later testing phase.
        result.update({
            'original_output': row['output'],
            'original_code': row['code'],
            'entry_point': row['entry_point'],
            'testcase': row['testcase']
        })

        results.append(result)

        if 'error' not in result:
            print(f"  Entropy: {result['mean_entropy']:.4f}, "
                  f"Confidence: {result['mean_confidence']:.4f}, "
                  f"Length: {result['sequence_length']}")
        else:
            print(f"  Error: {result['error']}")

        if position % save_every == 0 or position == total:
            save_path = os.path.join(output_dir, f"generations_up_to_{position}.csv")
            pd.DataFrame(results).to_csv(save_path, index=False)
            print(f"\nSaved progress to {save_path}")

            json_path = os.path.join(output_dir, f"generations_up_to_{position}.json")
            with open(json_path, 'w') as f:
                json.dump(results, f, indent=2)

    final_df = pd.DataFrame(results)
    final_path = os.path.join(output_dir, "all_generations.csv")
    final_df.to_csv(final_path, index=False)

    print(f"\n✅ Generation complete! Saved {len(results)} results to {final_path}")
    print("Now you can inspect the data before running expensive testing phase.")

    # Quick summary
    succeeded = [r for r in results if 'error' not in r]
    print("\nSummary:")
    print(f"- Successful generations: {len(succeeded)}/{len(results)}")
    if succeeded:
        print(f"- Average entropy: {np.mean([r['mean_entropy'] for r in succeeded]):.4f}")
        print(f"- Average confidence: {np.mean([r['mean_confidence'] for r in succeeded]):.4f}")
    else:
        # np.mean([]) would emit a RuntimeWarning and print "nan".
        print("- No successful generations; averages not available")

    return final_df

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import BitsAndBytesConfig


if __name__ == "__main__":
    # 4-bit NF4 quantisation settings (double quantisation, float16 compute dtype).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model_name = "unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Phase 1: generate code and metrics for every problem in the CSV.
    generations_df = generate_code_with_metrics(
        csv_path="C:/Users/s/Desktop/Dev/SamsungProject/extract/top-30-educational-instruct-rows.csv",
        model=model,
        tokenizer=tokenizer,
        output_dir="generation_results",
        save_every=50,
    )

    # Closing banner with the follow-up steps.
    print("\n".join([
        "\n" + "=" * 50,
        "PHASE 1 COMPLETE: Code generation with metrics",
        "=" * 50,
        "Next steps:",
        "1. Inspect the generated code in generation_results/",
        "2. Visualize the metrics to understand patterns",
        "3. Then run testing phase to create labels",
        "=" * 50,
    ]))

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.82s/it]


Loading dataset...
Loaded 30 problems

Generating 1/30: 660113403
  Entropy: 0.0000, Confidence: 1.0000, Length: 36

Generating 2/30: 14192481277
  Entropy: 0.0118, Confidence: 0.9947, Length: 48

Generating 3/30: 70726638201
  Entropy: 0.0135, Confidence: 0.9949, Length: 68

Generating 4/30: 893855581
  Entropy: 0.0418, Confidence: 0.9795, Length: 25

Generating 5/30: 20383247274
  Entropy: 0.0398, Confidence: 0.9798, Length: 57

Generating 6/30: 19434228727
  Entropy: 0.0530, Confidence: 0.9744, Length: 112

Generating 7/30: 28755799726
  Entropy: 0.0109, Confidence: 0.9955, Length: 48

Generating 8/30: 70726596281
  Entropy: 0.0315, Confidence: 0.9850, Length: 19

Generating 9/30: 32279616255
  Entropy: 0.0193, Confidence: 0.9917, Length: 28

Generating 10/30: 12120976763
  Entropy: 0.0123, Confidence: 0.9953, Length: 109

Generating 11/30: 18874307847
  Entropy: 0.0000, Confidence: 1.0000, Length: 46

Generating 12/30: 44289869161
  Entropy: 0.0376, Confidence: 0.9822, Length: 99



In [None]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import json
import os





def calculate_entropy_metrics(output):
    """Summarise the per-step Shannon entropy (in nats) of a generation.

    Row 0 of every tensor in ``output.scores`` is softmax-normalised and its
    entropy is computed as ``-sum(p * log(p + 1e-8))``.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors.

    Returns:
        dict with ``mean_entropy``, ``max_entropy``, ``min_entropy`` and
        ``std_entropy``. All values are 0.0 when ``scores`` is missing or
        empty.
    """
    empty = {'mean_entropy': 0.0, 'max_entropy': 0.0, 'min_entropy': 0.0, 'std_entropy': 0.0}
    if not hasattr(output, 'scores') or not output.scores:
        return empty

    entropies = []
    for score in output.scores:
        logits = score[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so a masked (-inf)
        # logit would produce 0 * log(0) = NaN. Upcast reduced-precision
        # scores; float32/float64 inputs are left untouched.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)
        entropies.append(-(probs * torch.log(probs + 1e-8)).sum().item())

    if not entropies:
        return empty

    return {
        'mean_entropy': np.mean(entropies),
        'max_entropy': np.max(entropies),
        'min_entropy': np.min(entropies),
        'std_entropy': np.std(entropies)
    }

def calculate_confidence_metrics(output):
    """Summarise the highest softmax probability of each generation step.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors; row 0
            of each tensor is softmax-normalised and its maximum is recorded.

    Returns:
        dict with ``mean_confidence``, ``min_confidence`` and
        ``std_confidence``; all 0.0 when ``scores`` is missing or empty.
    """
    if not (hasattr(output, 'scores') and output.scores):
        return {'mean_confidence': 0.0, 'min_confidence': 0.0, 'std_confidence': 0.0}

    peak_probs = [
        F.softmax(step_scores[0], dim=-1).max().item()
        for step_scores in output.scores
    ]

    # Defensive: an iterable that is truthy yet yields nothing.
    if len(peak_probs) == 0:
        return {'mean_confidence': 0.0, 'min_confidence': 0.0, 'std_confidence': 0.0}

    return {
        'mean_confidence': np.mean(peak_probs),
        'min_confidence': np.min(peak_probs),
        'std_confidence': np.std(peak_probs)
    }

def calculate_concentration_metrics(output):
    """Measure how much probability mass sits on the top-5 / top-10 tokens.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors; row 0
            of each tensor is softmax-normalised and sorted in descending
            order.

    Returns:
        dict with ``mean_top5_conc``, ``mean_top10_conc`` and ``min_top5_conc``;
        all 0.0 when ``scores`` is missing or empty.
    """
    if not (hasattr(output, 'scores') and output.scores):
        return {'mean_top5_conc': 0.0, 'mean_top10_conc': 0.0, 'min_top5_conc': 0.0}

    mass_top5, mass_top10 = [], []
    for step_scores in output.scores:
        ranked = torch.sort(F.softmax(step_scores[0], dim=-1), descending=True).values
        mass_top5.append(ranked[:5].sum().item())
        mass_top10.append(ranked[:10].sum().item())

    # Defensive: an iterable that is truthy yet yields nothing.
    if len(mass_top5) == 0:
        return {'mean_top5_conc': 0.0, 'mean_top10_conc': 0.0, 'min_top5_conc': 0.0}

    return {
        'mean_top5_conc': np.mean(mass_top5),
        'mean_top10_conc': np.mean(mass_top10),
        'min_top5_conc': np.min(mass_top5)
    }

def calculate_sequence_metrics(output):
    """Compare the first and last generation step of a sequence.

    Entropy (``-sum(p * log(p + 1e-8))``) and confidence (max softmax
    probability) are computed for row 0 of ``output.scores[0]`` and
    ``output.scores[-1]``; the trends are ``last - first``.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors.

    Returns:
        dict with ``sequence_length`` (``len(output.scores)``),
        ``entropy_trend`` and ``confidence_trend``. Zeros are returned only
        when ``scores`` is missing or empty; a one-step generation reports
        ``sequence_length == 1`` with both trends equal to 0.0 (it was
        previously reported as length 0).
    """
    if not hasattr(output, 'scores') or not output.scores:
        return {'sequence_length': 0, 'entropy_trend': 0.0, 'confidence_trend': 0.0}

    endpoints = []
    for step_scores in (output.scores[0], output.scores[-1]):
        logits = step_scores[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so masked (-inf)
        # logits would turn the entropy into NaN. Upcast reduced precision only.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)
        step_entropy = -(probs * torch.log(probs + 1e-8)).sum().item()
        step_confidence = torch.max(probs).item()
        endpoints.append((step_entropy, step_confidence))

    (first_entropy, first_confidence), (last_entropy, last_confidence) = endpoints

    return {
        'sequence_length': len(output.scores),
        'entropy_trend': last_entropy - first_entropy,
        'confidence_trend': last_confidence - first_confidence
    }

def calculate_distribution_metrics(output):
    """Report the 25th / 75th percentiles of per-step entropy and confidence.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors; row 0
            of each tensor is softmax-normalised before entropy
            (``-sum(p * log(p + 1e-8))``) and confidence (max probability) are
            taken.

    Returns:
        dict with ``entropy_p25``, ``entropy_p75``, ``confidence_p25`` and
        ``confidence_p75``; all 0.0 when ``scores`` is missing or empty.
    """
    empty = {'entropy_p25': 0.0, 'entropy_p75': 0.0, 'confidence_p25': 0.0, 'confidence_p75': 0.0}
    if not hasattr(output, 'scores') or not output.scores:
        return empty

    entropies = []
    confidences = []

    for score in output.scores:
        logits = score[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so masked (-inf)
        # logits would turn the entropy into NaN. Upcast reduced precision only.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()
        probs = F.softmax(logits, dim=-1)

        entropies.append(-(probs * torch.log(probs + 1e-8)).sum().item())
        confidences.append(torch.max(probs).item())

    if not entropies:
        return empty

    return {
        'entropy_p25': np.percentile(entropies, 25),
        'entropy_p75': np.percentile(entropies, 75),
        'confidence_p25': np.percentile(confidences, 25),
        'confidence_p75': np.percentile(confidences, 75)
    }

def extract_all_metrics(output):
    """Merge the results of every metric calculator into one flat dict.

    The calculators run in a fixed order (entropy, confidence, concentration,
    sequence, distribution); each returns a dict that is merged into the
    result, so the key order of the result follows that sequence.
    """
    calculators = (
        calculate_entropy_metrics,
        calculate_confidence_metrics,
        calculate_concentration_metrics,
        calculate_sequence_metrics,
        calculate_distribution_metrics,
    )

    combined = {}
    for calculate in calculators:
        combined.update(calculate(output))
    return combined





def generate_single_response(model, tokenizer, instruction, seq_id):
    """Sample one completion for ``instruction`` and attach uncertainty metrics.

    The chat prompt is rendered with the tokenizer's chat template, moved to
    the device of the model's first parameter, and sampled
    (temperature 0.8, top-p 0.9, top-k 50, single beam, at most 200 new
    tokens) with per-step scores requested. A generation that returns no
    scores, or any exception, is converted into a zero-metric record by
    ``create_empty_result``.

    NOTE(review): the metrics are computed from ``output.scores``; with
    sampling enabled these are presumably the post-processed
    (temperature / top-k / top-p filtered) scores rather than raw logits --
    confirm against the transformers documentation.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        instruction: Problem statement forwarded to ``create_qwen_prompt``.
        seq_id: Identifier copied into the result.

    Returns:
        dict with ``seq_id``, ``instruction``, ``generated_text``,
        ``generated_code``, ``prompt`` and the metric keys, or the placeholder
        produced by ``create_empty_result`` on failure.
    """
    try:
        prompt = tokenizer.apply_chat_template(
            create_qwen_prompt(instruction),
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(prompt, return_tensors="pt")
        target_device = next(model.parameters()).device
        inputs = {name: tensor.to(target_device) for name, tensor in inputs.items()}

        sampling_options = dict(
            max_new_tokens=200,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=0.8,
            top_p=0.9,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1,
        )
        with torch.no_grad():
            output = model.generate(**inputs, **sampling_options)

        # Without per-step scores no metric can be computed.
        scores_available = hasattr(output, 'scores') and bool(output.scores)
        if not scores_available:
            print(f"WARNING: No scores returned for {seq_id}")
            return create_empty_result(seq_id, instruction, "No scores returned")

        print(f"DEBUG: Generated {len(output.scores)} tokens with scores for {seq_id}")

        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs['input_ids'].shape[1]
        generated_text = tokenizer.decode(
            output.sequences[0][prompt_length:],
            skip_special_tokens=True,
        )
        generated_code = extract_code_from_response(generated_text)

        metrics = extract_all_metrics(output)
        print(f"  Sample metrics - Entropy: {metrics.get('mean_entropy', 0):.4f}, "
              f"Confidence: {metrics.get('mean_confidence', 0):.4f}")

        result = {
            'seq_id': seq_id,
            'instruction': instruction,
            'generated_text': generated_text,
            'generated_code': generated_code,
            'prompt': prompt,
        }
        result.update(metrics)

        # Release the generation tensors before the next problem.
        del output, inputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return result

    except Exception as e:
        print(f"Error generating for {seq_id}: {e}")
        return create_empty_result(seq_id, instruction, str(e))

def create_empty_result(seq_id, instruction, error_msg):
    """Build the placeholder record used when a generation fails.

    Args:
        seq_id: Identifier of the problem.
        instruction: Problem statement.
        error_msg: Text stored under the ``error`` key.

    Returns:
        dict with empty ``generated_text`` / ``generated_code`` / ``prompt``,
        the ``error`` message, and every metric column set to 0.0.
    """
    zeroed_metrics = (
        'mean_entropy', 'max_entropy', 'min_entropy', 'std_entropy',
        'mean_confidence', 'min_confidence', 'std_confidence',
        'mean_top5_conc', 'mean_top10_conc', 'min_top5_conc',
        'sequence_length', 'entropy_trend', 'confidence_trend',
        'entropy_p25', 'entropy_p75', 'confidence_p25', 'confidence_p75',
    )

    placeholder = {
        'seq_id': seq_id,
        'instruction': instruction,
        'generated_text': '',
        'generated_code': '',
        'prompt': '',
        'error': error_msg,
    }
    # Metric keys are appended after the descriptive fields, in listed order.
    placeholder.update(dict.fromkeys(zeroed_metrics, 0.0))
    return placeholder

def create_qwen_prompt(instruction):
    """Build the chat messages (system + user) for one coding problem.

    Args:
        instruction: Problem statement; interpolated verbatim at the end of
            the user message.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts.
    """
    system_message = {
        "role": "system",
        "content": "You are a skilled Python programmer. Write clean, working code without comments or explanations. Focus only on solving the problem correctly.",
    }
    user_message = {
        "role": "user",
        "content": f"Write a Python function to solve this problem. Provide only the function code without comments or explanations:\n\n{instruction}",
    }
    return [system_message, user_message]

def extract_code_from_response(response):
    """Pull the Python source out of a model reply.

    Strategies, in order:
      1. Content of the first closed fenced block opened with "```python".
      2. Content of the first closed fenced block opened with "```".
      3. Everything after an opening fence that is never closed (e.g. the
         reply was cut off by the token limit). Previously this case fell
         through to the ``def`` scan, which drops imports and helpers that
         precede the first ``def``.
      4. All lines from the first line starting with ``def `` to the end.
      5. The stripped response itself.

    Args:
        response: Decoded model output (str).

    Returns:
        The extracted code, stripped of surrounding whitespace.
    """
    fence = "```"
    openers = ("```python", fence)

    # 1 + 2: properly closed fenced blocks.
    for opener in openers:
        opened_at = response.find(opener)
        if opened_at == -1:
            continue
        start = opened_at + len(opener)
        end = response.find(fence, start)
        if end != -1:
            return response[start:end].strip()

    # 3: opening fence without a closing one -- keep the whole tail.
    for opener in openers:
        opened_at = response.find(opener)
        if opened_at != -1:
            tail = response[opened_at + len(opener):].strip()
            if tail:
                return tail
            break

    # 4: no usable fence; keep everything from the first function definition.
    lines = response.split('\n')
    for line_no, line in enumerate(lines):
        if line.strip().startswith('def '):
            return '\n'.join(lines[line_no:]).strip()

    # 5: nothing recognisable -- hand back the raw text.
    return response.strip()





def generate_code_with_metrics(csv_path, model, tokenizer, output_dir="generation_results", save_every=50, max_samples=None):
    """
    Run generation over a CSV of problems and collect uncertainty metrics.

    Args:
        csv_path: Path to input CSV (columns ``seq_id``, ``instruction``,
            ``output``, ``code``, ``entry_point``, ``testcase`` are read)
        model: Forwarded to ``generate_single_response``
        tokenizer: Forwarded to ``generate_single_response``
        output_dir: Directory for checkpoint CSVs and the final CSV
        save_every: A checkpoint CSV is written every N problems and after
            the last one
        max_samples: Keep only the first N rows; any falsy value (None, 0)
            means no limit

    Returns:
        DataFrame with one row per problem, also written to
        ``<output_dir>/all_generations.csv``.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Loading dataset...")
    df = pd.read_csv(csv_path)

    if not max_samples:
        print(f"Loaded {len(df)} problems")
    else:
        df = df.head(max_samples)
        print(f"Limited to {len(df)} samples")

    results = []

    for idx, row in df.iterrows():
        # NOTE(review): idx is the index label; it equals the row position
        # only because read_csv/head leave a default 0..n-1 index here.
        step = idx + 1
        print(f"\nGenerating {step}/{len(df)}: {row['seq_id']}")

        result = generate_single_response(
            model, tokenizer, row['instruction'], row['seq_id']
        )

        # Carry the reference columns along for the later testing phase.
        for target_key, source_column in (
            ('original_output', 'output'),
            ('original_code', 'code'),
            ('entry_point', 'entry_point'),
            ('testcase', 'testcase'),
        ):
            result[target_key] = row[source_column]

        results.append(result)

        if 'error' in result:
            print(f"  Error: {result['error']}")
        else:
            print(f"  Metrics - Entropy: {result['mean_entropy']:.4f}, "
                  f"Confidence: {result['mean_confidence']:.4f}, "
                  f"Length: {result['sequence_length']}")

        is_last_row = idx == len(df) - 1
        if step % save_every == 0 or is_last_row:
            save_path = os.path.join(output_dir, f"generations_up_to_{step}.csv")
            pd.DataFrame(results).to_csv(save_path, index=False)
            print(f"\nSaved progress to {save_path}")

    final_df = pd.DataFrame(results)
    final_path = os.path.join(output_dir, "all_generations.csv")
    final_df.to_csv(final_path, index=False)

    print(f"\n✅ Generation complete! Saved {len(results)} results to {final_path}")

    # Only error-free records with a strictly positive mean entropy count as
    # successful in this summary.
    valid_results = [r for r in results if 'error' not in r and r['mean_entropy'] > 0]
    successful = len(valid_results)
    print("\nSummary:")
    print(f"- Successful generations: {successful}/{len(results)}")

    if successful > 0:
        entropy_values = [r['mean_entropy'] for r in valid_results]
        confidence_values = [r['mean_confidence'] for r in valid_results]
        print(f"- Average entropy: {np.mean(entropy_values):.4f}")
        print(f"- Average confidence: {np.mean(confidence_values):.4f}")
        print(f"- Entropy range: {np.min(entropy_values):.4f} - {np.max(entropy_values):.4f}")

    return final_df





import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers import BitsAndBytesConfig


if __name__ == "__main__":
    # 4-bit NF4 quantisation settings (double quantisation, float16 compute dtype).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    model_name = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
    # bnb_config was previously built but never handed to from_pretrained;
    # pass it, as the earlier 14B loading cell does.
    # NOTE(review): the checkpoint name suggests it is already 4-bit
    # quantised, in which case its own stored config may take precedence --
    # confirm against the transformers quantization documentation.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Phase 1: generate code and metrics (at most 100 problems).
    generations_df = generate_code_with_metrics(
        csv_path="C:/Users/s/Desktop/Dev/SamsungProject/extract/top-30-educational-instruct-rows.csv",
        model=model,
        tokenizer=tokenizer,
        output_dir="generation_results",
        save_every=50,
        max_samples=100
    )

    print("\n" + "="*50)
    print("PHASE 1 COMPLETE: Code generation with metrics")
    print("="*50)
    print("Next steps:")
    print("1. Inspect the generated code in generation_results/")
    print("2. Visualize the metrics to understand patterns")
    print("3. Then run testing phase to create labels")
    print("="*50)

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Limited to 30 samples

Generating 1/30: 660113403
DEBUG: Generated 36 tokens with scores for 660113403
  Sample metrics - Entropy: 0.0428, Confidence: 0.9812
  Metrics - Entropy: 0.0428, Confidence: 0.9812, Length: 36

Generating 2/30: 14192481277
DEBUG: Generated 26 tokens with scores for 14192481277
  Sample metrics - Entropy: 0.0654, Confidence: 0.9651
  Metrics - Entropy: 0.0654, Confidence: 0.9651, Length: 26

Generating 3/30: 70726638201
DEBUG: Generated 63 tokens with scores for 70726638201
  Sample metrics - Entropy: 0.0165, Confidence: 0.9931
  Metrics - Entropy: 0.0165, Confidence: 0.9931, Length: 63

Generating 4/30: 893855581
DEBUG: Generated 22 tokens with scores for 893855581
  Sample metrics - Entropy: 0.0210, Confidence: 0.9921
  Metrics - Entropy: 0.0210, Confidence: 0.9921, Length: 22

Generating 5/30: 20383247274
DEBUG: Generated 49 tokens with scores for 20383247274
  Sample metrics - Entropy: 0.1041, Confidence: 0.9497
  Metrics - Entropy: 0.1041

In [2]:
import pandas as pd
import torch
import torch.nn.functional as F
import numpy as np
import json
import os
from scipy.stats import entropy

# =============================================================================
# CALIBRATED UNCERTAINTY METRICS FUNCTIONS
# =============================================================================

def calculate_calibrated_uncertainty_metrics(output, temperature_param=1.5):
    """
    Calculate temperature-scaled uncertainty metrics to address overconfidence.

    Row 0 of every tensor in ``output.scores`` is divided by
    ``temperature_param`` and softmax-normalised; all metrics are derived from
    these scaled distributions.

    Args:
        output: Object that may expose a ``scores`` sequence of tensors
        temperature_param: Divisor applied to the scores before softmax
            (higher = flatter distribution). Must be non-zero.

    Returns:
        Dict with the calibrated metrics listed below, or the result of
        ``get_empty_calibrated_metrics()`` when ``scores`` is missing or empty.
    """
    if not hasattr(output, 'scores') or not output.scores:
        return get_empty_calibrated_metrics()

    num_steps = len(output.scores)

    calibrated_entropies = []
    calibrated_confidences = []
    token_disagreements = []
    prob_concentrations = []
    semantic_uncertainties = []

    # Distribution of the previous step, kept so it is not recomputed.
    prev_calibrated_probs = None

    for i, score in enumerate(output.scores):
        logits = score[0]
        # In float16 the 1e-8 stabiliser rounds to zero, so masked (-inf)
        # scores would turn the entropy into NaN. Upcast reduced precision only.
        if logits.dtype in (torch.float16, torch.bfloat16):
            logits = logits.float()

        # 1. Temperature-scaled probabilities (main calibration)
        calibrated_probs = F.softmax(logits / temperature_param, dim=-1)
        calibrated_log_probs = torch.log(calibrated_probs + 1e-8)

        # 2. Calibrated entropy
        calibrated_entropy = -(calibrated_probs * calibrated_log_probs).sum().item()
        calibrated_entropies.append(calibrated_entropy)

        # 3. Calibrated confidence
        calibrated_confidences.append(torch.max(calibrated_probs).item())

        # 4. Token disagreement: KL(previous step || current step).
        #    F.kl_div takes the log-probabilities of the second distribution
        #    as ``input`` and the first distribution as ``target``.
        if prev_calibrated_probs is not None:
            kl_div = F.kl_div(
                calibrated_log_probs,
                prev_calibrated_probs,
                reduction='sum'
            ).item()
            token_disagreements.append(kl_div)

        # 5. Probability mass on the 10 most likely tokens. topk avoids
        #    sorting the whole vocabulary; k is capped for tiny vocabularies.
        top_k = min(10, calibrated_probs.shape[-1])
        top_10_mass = torch.topk(calibrated_probs, top_k).values.sum().item()
        prob_concentrations.append(top_10_mass)

        # 6. Position-weighted entropy: the weight grows linearly with the
        #    distance from the middle of the sequence (1.0 at the centre,
        #    up to 1.25 at the first step).
        position_weight = 1.0 + 0.5 * abs(i - num_steps / 2) / num_steps
        semantic_uncertainties.append(calibrated_entropy * position_weight)

        prev_calibrated_probs = calibrated_probs

    # Aggregate metrics
    metrics = {
        # Calibrated basic metrics
        'calibrated_mean_entropy': np.mean(calibrated_entropies),
        'calibrated_max_entropy': np.max(calibrated_entropies),
        'calibrated_std_entropy': np.std(calibrated_entropies),
        'calibrated_mean_confidence': np.mean(calibrated_confidences),
        'calibrated_min_confidence': np.min(calibrated_confidences),

        # Advanced calibrated metrics (no disagreement for one-step sequences)
        'token_disagreement_mean': np.mean(token_disagreements) if token_disagreements else 0.0,
        'token_disagreement_max': np.max(token_disagreements) if token_disagreements else 0.0,
        'prob_concentration_mean': np.mean(prob_concentrations),
        'prob_concentration_std': np.std(prob_concentrations),

        # Semantic uncertainty
        'semantic_uncertainty_mean': np.mean(semantic_uncertainties),
        'semantic_uncertainty_max': np.max(semantic_uncertainties),

        # Distribution shape metrics
        'entropy_coefficient_variation': np.std(calibrated_entropies) / (np.mean(calibrated_entropies) + 1e-8),
        'confidence_range': np.max(calibrated_confidences) - np.min(calibrated_confidences),

        # Temperature used for calibration
        'calibration_temperature': temperature_param
    }

    return metrics

def calculate_mutual_information_uncertainty(output, temperature_param=1.5):
    """
    Heuristic "mutual information" score between consecutive generation steps.

    For every step i >= 1 the temperature-scaled distribution of step i is
    compared with that of step i-1:

    * the step score is ``entropy(curr) - KL(prev || curr)``, clipped at 0
      (an approximation, not a true mutual information);
    * the "surprise" is the negative log-probability that the calibrated
      distribution assigns to the highest-logit token of step i. Note that
      this is the arg-max token, not necessarily the token that was sampled.

    Args:
        output: Generation output exposing ``scores``, a sequence of per-step
            tensors indexed as ``scores[i][0]`` (first batch element).
        temperature_param: Divisor applied to the logits before softmax.

    Returns:
        dict with keys ``mutual_information``, ``avg_token_surprise`` and
        ``max_token_surprise``. All three are 0.0 when fewer than two steps
        are available (including ``scores`` being missing or None), so every
        result row carries the same columns.
    """
    scores = getattr(output, 'scores', None)
    if scores is None or len(scores) < 2:
        return {
            'mutual_information': 0.0,
            'avg_token_surprise': 0.0,
            'max_token_surprise': 0.0
        }

    mi_scores = []
    surprises = []

    for i in range(1, len(scores)):
        curr_logits = scores[i][0]
        prev_logits = scores[i - 1][0]

        # Calibrated (temperature-scaled) probabilities
        curr_probs = F.softmax(curr_logits / temperature_param, dim=-1)
        prev_probs = F.softmax(prev_logits / temperature_param, dim=-1)

        # Entropy of the current step; epsilon keeps log() finite at p == 0
        joint_entropy = -(curr_probs * torch.log(curr_probs + 1e-8)).sum().item()

        # F.kl_div(input=log q, target=p) sums p * (log p - log q),
        # i.e. KL(prev || curr) with the arguments given here
        conditional_entropy = F.kl_div(
            torch.log(curr_probs + 1e-8),
            prev_probs,
            reduction='sum'
        ).item()

        mi_scores.append(max(0, joint_entropy - conditional_entropy))

        # Surprise of the most likely (arg-max) token under the calibrated distribution
        top_token_idx = torch.argmax(curr_logits).item()
        surprises.append(-torch.log(curr_probs[top_token_idx] + 1e-8).item())

    return {
        'mutual_information': np.mean(mi_scores),
        'avg_token_surprise': np.mean(surprises),
        'max_token_surprise': np.max(surprises)
    }

def get_empty_calibrated_metrics():
    """Build the placeholder calibrated-metrics dict used for error cases.

    Every metric is 0.0 except ``calibration_temperature``, which is 1.5
    (the same value used as the default ``temperature_param`` elsewhere in
    this file).
    """
    metric_keys = (
        'calibrated_mean_entropy',
        'calibrated_max_entropy',
        'calibrated_std_entropy',
        'calibrated_mean_confidence',
        'calibrated_min_confidence',
        'token_disagreement_mean',
        'token_disagreement_max',
        'prob_concentration_mean',
        'prob_concentration_std',
        'semantic_uncertainty_mean',
        'semantic_uncertainty_max',
        'entropy_coefficient_variation',
        'confidence_range',
        'calibration_temperature',
        'mutual_information',
        'avg_token_surprise',
        'max_token_surprise',
    )
    placeholder = dict.fromkeys(metric_keys, 0.0)
    # Re-assigning an existing key keeps its position in the dict
    placeholder['calibration_temperature'] = 1.5
    return placeholder

# =============================================================================
# ORIGINAL UNCERTAINTY METRICS FUNCTIONS (FIXED)
# =============================================================================

def calculate_entropy_metrics(output):
    """Summarise the per-step entropy of the generation scores.

    Each entry of ``output.scores`` is indexed with ``[0]`` (first batch
    element), turned into probabilities with softmax, and reduced to
    ``-sum(p * log(p + 1e-8))``.

    Returns:
        dict with ``mean_entropy``, ``max_entropy``, ``min_entropy`` and
        ``std_entropy``; all 0.0 when ``scores`` is missing or empty.
    """
    zero_stats = {'mean_entropy': 0.0, 'max_entropy': 0.0, 'min_entropy': 0.0, 'std_entropy': 0.0}

    step_scores = getattr(output, 'scores', None)
    if not step_scores:
        return zero_stats

    def _step_entropy(step):
        distribution = F.softmax(step[0], dim=-1)
        # The epsilon keeps log() finite for zero-probability entries
        return -(distribution * torch.log(distribution + 1e-8)).sum().item()

    per_step = [_step_entropy(step) for step in step_scores]
    if len(per_step) == 0:
        return zero_stats

    return {
        'mean_entropy': np.mean(per_step),
        'max_entropy': np.max(per_step),
        'min_entropy': np.min(per_step),
        'std_entropy': np.std(per_step)
    }

def calculate_confidence_metrics(output):
    """Summarise the per-step confidence (largest softmax probability).

    Each entry of ``output.scores`` is indexed with ``[0]`` (first batch
    element) and passed through softmax; the maximum probability of that
    step is recorded.

    Returns:
        dict with ``mean_confidence``, ``min_confidence`` and
        ``std_confidence``; all 0.0 when ``scores`` is missing or empty.
    """
    zero_stats = {'mean_confidence': 0.0, 'min_confidence': 0.0, 'std_confidence': 0.0}

    step_scores = getattr(output, 'scores', None)
    if not step_scores:
        return zero_stats

    peak_probs = [
        torch.max(F.softmax(step[0], dim=-1)).item()
        for step in step_scores
    ]
    if len(peak_probs) == 0:
        return zero_stats

    return {
        'mean_confidence': np.mean(peak_probs),
        'min_confidence': np.min(peak_probs),
        'std_confidence': np.std(peak_probs)
    }

def calculate_concentration_metrics(output):
    """Measure how much probability mass sits in the top-5 / top-10 tokens.

    For each entry of ``output.scores`` the first batch element is
    softmaxed, sorted in descending order, and the first 5 and first 10
    probabilities are summed.

    Returns:
        dict with ``mean_top5_conc``, ``mean_top10_conc`` and
        ``min_top5_conc``; all 0.0 when ``scores`` is missing or empty.
    """
    zero_stats = {'mean_top5_conc': 0.0, 'mean_top10_conc': 0.0, 'min_top5_conc': 0.0}

    step_scores = getattr(output, 'scores', None)
    if not step_scores:
        return zero_stats

    top5_mass, top10_mass = [], []
    for step in step_scores:
        ranked, _ = torch.sort(F.softmax(step[0], dim=-1), descending=True)
        # Slicing past the end is harmless when fewer than k entries exist
        top5_mass.append(torch.sum(ranked[:5]).item())
        top10_mass.append(torch.sum(ranked[:10]).item())

    if len(top5_mass) == 0:
        return zero_stats

    return {
        'mean_top5_conc': np.mean(top5_mass),
        'mean_top10_conc': np.mean(top10_mass),
        'min_top5_conc': np.min(top5_mass)
    }

def calculate_sequence_metrics(output):
    """Report the number of generation steps and first-to-last trends.

    ``sequence_length`` is ``len(output.scores)``. The trends are the
    difference (last step minus first step) of the softmax entropy and of
    the maximum softmax probability, both computed on the first batch
    element of the step scores.

    Returns:
        dict with ``sequence_length``, ``entropy_trend`` and
        ``confidence_trend``. With no scores everything is zero. With a
        single step the true length (1) is reported and both trends are 0.0,
        because a trend needs two points.
    """
    scores = getattr(output, 'scores', None)
    if not scores:
        return {'sequence_length': 0, 'entropy_trend': 0.0, 'confidence_trend': 0.0}

    n_steps = len(scores)
    if n_steps < 2:
        # Keep the real length instead of reporting an empty sequence
        return {'sequence_length': n_steps, 'entropy_trend': 0.0, 'confidence_trend': 0.0}

    first_probs = F.softmax(scores[0][0], dim=-1)
    last_probs = F.softmax(scores[-1][0], dim=-1)

    # The epsilon keeps log() finite for zero-probability entries
    first_entropy = -(first_probs * torch.log(first_probs + 1e-8)).sum().item()
    last_entropy = -(last_probs * torch.log(last_probs + 1e-8)).sum().item()

    first_confidence = torch.max(first_probs).item()
    last_confidence = torch.max(last_probs).item()

    return {
        'sequence_length': n_steps,
        'entropy_trend': last_entropy - first_entropy,
        'confidence_trend': last_confidence - first_confidence
    }

def calculate_distribution_metrics(output):
    """Compute 25th/75th percentiles of per-step entropy and confidence.

    For each entry of ``output.scores`` the first batch element is
    softmaxed; entropy is ``-sum(p * log(p + 1e-8))`` and confidence is the
    largest probability.

    Returns:
        dict with ``entropy_p25``, ``entropy_p75``, ``confidence_p25`` and
        ``confidence_p75``; all 0.0 when ``scores`` is missing or empty.
    """
    zero_stats = {'entropy_p25': 0.0, 'entropy_p75': 0.0, 'confidence_p25': 0.0, 'confidence_p75': 0.0}

    step_scores = getattr(output, 'scores', None)
    if not step_scores:
        return zero_stats

    per_step = []
    for step in step_scores:
        distribution = F.softmax(step[0], dim=-1)
        per_step.append((
            -(distribution * torch.log(distribution + 1e-8)).sum().item(),
            torch.max(distribution).item(),
        ))

    if len(per_step) == 0:
        return zero_stats

    # Split the (entropy, confidence) pairs back into two series
    entropy_series, confidence_series = zip(*per_step)

    return {
        'entropy_p25': np.percentile(entropy_series, 25),
        'entropy_p75': np.percentile(entropy_series, 75),
        'confidence_p25': np.percentile(confidence_series, 25),
        'confidence_p75': np.percentile(confidence_series, 75)
    }

def extract_all_metrics(output, use_calibrated=True, temperature_param=1.5):
    """Merge every uncertainty metric for one generation into a single dict.

    The baseline groups (entropy, confidence, concentration, sequence,
    distribution) are always included. When ``use_calibrated`` is true the
    calibrated and mutual-information groups are added as well, both
    receiving ``temperature_param``. Later groups overwrite earlier ones on
    key collisions.
    """
    combined = {}

    # Baseline metrics, kept for comparison with the calibrated ones
    baseline_groups = (
        calculate_entropy_metrics,
        calculate_confidence_metrics,
        calculate_concentration_metrics,
        calculate_sequence_metrics,
        calculate_distribution_metrics,
    )
    for compute_group in baseline_groups:
        combined.update(compute_group(output))

    if not use_calibrated:
        return combined

    # Temperature-calibrated metrics
    calibrated_groups = (
        calculate_calibrated_uncertainty_metrics,
        calculate_mutual_information_uncertainty,
    )
    for compute_group in calibrated_groups:
        combined.update(compute_group(output, temperature_param))

    return combined

# =============================================================================
# IMPROVED CODE GENERATION PIPELINE
# =============================================================================

def generate_single_response(model, tokenizer, instruction, seq_id):
    """Sample one completion for a problem and package text, code and metrics.

    Args:
        model: Model exposing ``parameters()`` and ``generate(...)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        instruction: Problem statement inserted into the chat prompt.
        seq_id: Identifier echoed in log lines and in the returned record.

    Returns:
        dict with ``seq_id``, ``instruction``, ``generated_text``,
        ``generated_code``, ``prompt`` and every key produced by
        ``extract_all_metrics``. If generation returns no scores, or any
        exception is raised, the record from ``create_empty_result`` is
        returned instead (it carries an ``error`` key).
    """
    try:
        # Build the chat-formatted prompt string
        prompt = tokenizer.apply_chat_template(
            create_qwen_prompt(instruction),
            tokenize=False,
            add_generation_prompt=True,
        )

        # Tokenize, then place every tensor on the model's device
        encoded = tokenizer(prompt, return_tensors="pt")
        device = next(model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Sampling settings; per-step scores are requested for the metrics
        sampling_kwargs = dict(
            max_new_tokens=200,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=1.2,
            top_p=0.85,
            top_k=40,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=1,
            early_stopping=False,
        )
        with torch.no_grad():
            output = model.generate(**inputs, **sampling_kwargs)

        # Without scores no metric can be computed
        if not getattr(output, 'scores', None):
            print(f"WARNING: No scores returned for {seq_id}")
            return create_empty_result(seq_id, instruction, "No scores returned")

        print(f"DEBUG: Generated {len(output.scores)} tokens with scores for {seq_id}")

        # Decode only the tokens that follow the prompt
        prompt_length = inputs['input_ids'].shape[1]
        generated_text = tokenizer.decode(
            output.sequences[0][prompt_length:],
            skip_special_tokens=True,
        )
        generated_code = extract_code_from_response(generated_text)

        metrics = extract_all_metrics(output, use_calibrated=True, temperature_param=1.5)

        # One debug line per metric family
        debug_rows = (
            ("  Original  - Entropy", 'mean_entropy',
             "Confidence", 'mean_confidence'),
            ("  Calibrated- Entropy", 'calibrated_mean_entropy',
             "Confidence", 'calibrated_mean_confidence'),
            ("  Advanced  - Token Disagreement", 'token_disagreement_mean',
             "Mutual Info", 'mutual_information'),
        )
        for left_label, left_key, right_label, right_key in debug_rows:
            print(f"{left_label}: {metrics.get(left_key, 0):.4f}, "
                  f"{right_label}: {metrics.get(right_key, 0):.4f}")

        result = {
            'seq_id': seq_id,
            'instruction': instruction,
            'generated_text': generated_text,
            'generated_code': generated_code,
            'prompt': prompt,
            **metrics,
        }

        # Release the generation tensors before the next problem
        del output, inputs
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return result

    except Exception as e:
        print(f"Error generating for {seq_id}: {e}")
        return create_empty_result(seq_id, instruction, str(e))

def create_empty_result(seq_id, instruction, error_msg):
    """Build a failure record: empty text fields, the error, and zeroed metrics.

    The record starts with ``seq_id``, ``instruction``, empty
    ``generated_text`` / ``generated_code`` / ``prompt`` and ``error``,
    followed by every baseline and calibrated metric name set to 0.0.
    """
    record = dict(
        seq_id=seq_id,
        instruction=instruction,
        generated_text='',
        generated_code='',
        prompt='',
        error=error_msg,
    )

    # Baseline metric columns
    baseline_keys = (
        'mean_entropy', 'max_entropy', 'min_entropy', 'std_entropy',
        'mean_confidence', 'min_confidence', 'std_confidence',
        'mean_top5_conc', 'mean_top10_conc', 'min_top5_conc',
        'sequence_length', 'entropy_trend', 'confidence_trend',
        'entropy_p25', 'entropy_p75', 'confidence_p25', 'confidence_p75',
    )

    # Calibrated metric columns
    calibrated_keys = (
        'calibrated_mean_entropy', 'calibrated_max_entropy', 'calibrated_std_entropy',
        'calibrated_mean_confidence', 'calibrated_min_confidence',
        'token_disagreement_mean', 'token_disagreement_max',
        'prob_concentration_mean', 'prob_concentration_std',
        'semantic_uncertainty_mean', 'semantic_uncertainty_max',
        'entropy_coefficient_variation', 'confidence_range',
        'calibration_temperature', 'mutual_information',
        'avg_token_surprise', 'max_token_surprise',
    )

    record.update(dict.fromkeys(baseline_keys + calibrated_keys, 0.0))
    return record

def create_qwen_prompt(instruction):
    """Return the two-message chat (system + user) asking for a Python function.

    The user message embeds ``instruction`` after a fixed request for
    function-only code without comments or explanations.
    """
    system_text = (
        "You are a skilled Python programmer. Write clean, working code "
        "without comments or explanations. Focus only on solving the problem correctly."
    )
    user_text = (
        "Write a Python function to solve this problem. Provide only the "
        f"function code without comments or explanations:\n\n{instruction}"
    )
    return [
        dict(role="system", content=system_text),
        dict(role="user", content=user_text),
    ]

def extract_code_from_response(response):
    """Extract Python code from a model response.

    Strategy, in order:

    1. The first block opened by a "```python" fence.
    2. The first block opened by any "```" fence.
    3. Everything from the first line that starts with ``def `` onwards.
    4. The whole response, stripped.

    Compared with a plain "find the closing fence" approach this also
    handles:

    * an unterminated fence (e.g. generation stopped at the token limit):
      the text after the opening fence is kept instead of being dropped;
    * a language tag left on the opening line (for example "py", or the
      "3" of "python3"): it is removed when real content follows it;
    * stray fence-only lines in the ``def`` fallback, which are skipped.

    Args:
        response: Raw decoded model output; ``None`` or "" yields "".

    Returns:
        The extracted code as a stripped string.
    """
    if not response:
        return ""

    fence = "```"

    def _fenced_body(opening):
        """Return the text of the first block started by `opening`, or None."""
        open_at = response.find(opening)
        if open_at == -1:
            return None

        body_start = open_at + len(opening)
        close_at = response.find(fence, body_start)
        closed = close_at != -1
        body = response[body_start:close_at] if closed else response[body_start:]

        # Drop a language tag sitting on the opening line, but only when
        # something follows it, so a one-word block is not emptied.
        first_line, newline, remainder = body.partition("\n")
        tag = first_line.strip()
        looks_like_tag = bool(tag) and all(ch.isalnum() or ch in "+-_#." for ch in tag)
        if newline and looks_like_tag and remainder.strip():
            body = remainder

        body = body.strip()
        # An unterminated fence with nothing after it is just a stray marker;
        # signal "not found" so the next strategy can run.
        if closed or body:
            return body
        return None

    for opening in ("```python", fence):
        body = _fenced_body(opening)
        if body is not None:
            return body

    # Fallback: keep everything from the first function definition onwards
    code_lines = []
    in_function = False
    for line in response.split('\n'):
        stripped = line.strip()
        if stripped.startswith('def '):
            in_function = True
        if in_function and not stripped.startswith(fence):
            code_lines.append(line)

    return '\n'.join(code_lines).strip() if code_lines else response.strip()

# =============================================================================
# MAIN GENERATION FUNCTION
# =============================================================================

def generate_code_with_metrics(csv_path, model, tokenizer, output_dir="generation_results", save_every=50, max_samples=None):
    """
    Generate code for all problems and extract metrics.

    The CSV must provide the columns ``seq_id``, ``instruction``, ``output``,
    ``code``, ``entry_point`` and ``testcase``. Each row is passed to
    ``generate_single_response``; the original columns are attached to the
    result as ``original_output``, ``original_code``, ``entry_point`` and
    ``testcase``.

    Cumulative checkpoints are written to
    ``<output_dir>/generations_up_to_<n>.csv`` every ``save_every`` rows and
    after the last row; the complete table is written to
    ``<output_dir>/all_generations_with_calibrated_metrics.csv``.

    Args:
        csv_path: Path to input CSV
        model: Your language model
        tokenizer: Your tokenizer
        output_dir: Directory to save results
        save_every: Save progress every N generations (a falsy value
            disables the periodic checkpoints; the one after the last row
            is still written)
        max_samples: Limit number of samples (None for all)

    Returns:
        pandas.DataFrame with one row per processed problem.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("Loading dataset...")
    df = pd.read_csv(csv_path)

    if max_samples:
        df = df.head(max_samples)
        print(f"Limited to {len(df)} samples")
    else:
        print(f"Loaded {len(df)} problems")

    total = len(df)
    results = []

    # Count rows by position: index labels are not guaranteed to be 0..n-1
    # (read_csv can pick an implicit index column), and the counter drives
    # both the progress text and the checkpoint file names.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"\nGenerating {position}/{total}: {row['seq_id']}")

        result = generate_single_response(
            model, tokenizer, row['instruction'], row['seq_id']
        )

        # Carry the reference data along with the generation
        result.update({
            'original_output': row['output'],
            'original_code': row['code'],
            'entry_point': row['entry_point'],
            'testcase': row['testcase']
        })

        results.append(result)

        if 'error' not in result:
            print(f"  Final Summary - Original Entropy: {result['mean_entropy']:.4f}, "
                  f"Calibrated Entropy: {result['calibrated_mean_entropy']:.4f}")
            print(f"                  Original Confidence: {result['mean_confidence']:.4f}, "
                  f"Calibrated Confidence: {result['calibrated_mean_confidence']:.4f}")
        else:
            print(f"  Error: {result['error']}")

        # Periodic checkpoint; guarded so save_every=0/None cannot divide by zero
        periodic = bool(save_every) and position % save_every == 0
        if periodic or position == total:
            save_path = os.path.join(output_dir, f"generations_up_to_{position}.csv")
            pd.DataFrame(results).to_csv(save_path, index=False)
            print(f"\nSaved progress to {save_path}")

    final_df = pd.DataFrame(results)
    final_path = os.path.join(output_dir, "all_generations_with_calibrated_metrics.csv")
    final_df.to_csv(final_path, index=False)

    print(f"\n✅ Generation complete! Saved {len(results)} results to {final_path}")

    # A generation counts as successful when it has no error and a positive mean entropy
    valid_results = [r for r in results if 'error' not in r and r['mean_entropy'] > 0]
    successful = len(valid_results)
    print(f"\nSummary:")
    print(f"- Successful generations: {successful}/{len(results)}")

    if successful > 0:
        def _column(key):
            """Collect one metric across the successful generations."""
            return [r[key] for r in valid_results]

        entropy = _column('mean_entropy')
        calibrated_entropy = _column('calibrated_mean_entropy')

        print(f"\nOriginal Metrics:")
        print(f"- Average entropy: {np.mean(entropy):.4f}")
        print(f"- Average confidence: {np.mean(_column('mean_confidence')):.4f}")
        print(f"- Entropy range: {np.min(entropy):.4f} - {np.max(entropy):.4f}")

        print(f"\nCalibrated Metrics:")
        print(f"- Average calibrated entropy: {np.mean(calibrated_entropy):.4f}")
        print(f"- Average calibrated confidence: {np.mean(_column('calibrated_mean_confidence')):.4f}")
        print(f"- Average token disagreement: {np.mean(_column('token_disagreement_mean')):.4f}")
        print(f"- Average mutual information: {np.mean(_column('mutual_information')):.4f}")
        print(f"- Calibrated entropy range: {np.min(calibrated_entropy):.4f} - {np.max(calibrated_entropy):.4f}")

    return final_df

# =============================================================================
# MAIN EXECUTION
# =============================================================================

if __name__ == "__main__":
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # No explicit quantization_config is passed to from_pretrained: a
    # BitsAndBytesConfig object used to be built here but was never handed to
    # the loader, so it had no effect and has been dropped.
    # NOTE(review): the checkpoint name suggests the weights are already
    # 4-bit quantized — confirm before adding a quantization_config.
    model_name = "unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit"
    print(f"Loading model: {model_name}")
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Ensure tokenizer has pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("Model loaded successfully!")
    print("="*60)

    # Phase 1: Generate code and extract metrics (including calibrated uncertainty)
    print("STARTING PHASE 1: Code generation with calibrated uncertainty metrics")
    print("="*60)

    # NOTE(review): machine-specific absolute path; move it to a configurable
    # location before sharing this notebook.
    generations_df = generate_code_with_metrics(
        csv_path="C:/Users/s/Desktop/Dev/SamsungProject/extract/top-30-educational-instruct-rows.csv",
        model=model,
        tokenizer=tokenizer,
        output_dir="generation_results_calibrated",
        save_every=25,  # Save every 25 generations
        max_samples=50  # Upper bound; fewer rows are processed if the CSV is smaller
    )

    print("\n" + "="*60)
    print("PHASE 1 COMPLETE: Code generation with calibrated uncertainty metrics")
    print("="*60)
    print("Generated files:")
    print("- all_generations_with_calibrated_metrics.csv")
    print("\nKey improvements:")
    print("✅ Temperature calibration (1.2 generation, 1.5 post-processing)")
    print("✅ Token disagreement metrics")
    print("✅ Mutual information uncertainty")
    print("✅ Semantic uncertainty (position-aware)")
    print("✅ Better sampling parameters")
    print("\nNext steps:")
    print("1. Inspect the calibrated metrics - should see more variation!")
    print("2. Compare original vs calibrated uncertainty distributions")
    print("3. Run testing phase to create labels using Pass@1")
    print("="*60)

Loading model: unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit
Model loaded successfully!
STARTING PHASE 1: Code generation with calibrated uncertainty metrics
Loading dataset...
Limited to 30 samples

Generating 1/30: 660113403
DEBUG: Generated 43 tokens with scores for 660113403
  Original  - Entropy: 0.0371, Confidence: 0.9833
  Calibrated- Entropy: 0.0424, Confidence: 0.9796
  Advanced  - Token Disagreement: 18.3773, Mutual Info: 0.0000
  Final Summary - Original Entropy: 0.0371, Calibrated Entropy: 0.0424
                  Original Confidence: 0.9833, Calibrated Confidence: 0.9796

Generating 2/30: 14192481277
DEBUG: Generated 26 tokens with scores for 14192481277
  Original  - Entropy: 0.0680, Confidence: 0.9666
  Calibrated- Entropy: 0.0742, Confidence: 0.9593
  Advanced  - Token Disagreement: 18.3435, Mutual Info: 0.0000
  Final Summary - Original Entropy: 0.0680, Calibrated Entropy: 0.0742
                  Original Confidence: 0.9666, Calibrated Confidence: 0.9593

Generating 3/30