## Enhanced Inference with Fine-tuned Qwen2.5-Coder

This notebook performs inference using the fine-tuned Qwen2.5-Coder model on the test dataset with advanced test-driven refinement.

## Setup and Model Loading

In [None]:
# Import required libraries
from transformers import pipeline
import torch
import pandas as pd
from tqdm.auto import tqdm
import json
import re
import os
import gc
import ast
import signal
import sys
from io import StringIO
from collections import defaultdict
import random
from typing import List, Dict, Tuple, Optional
from datasets import Dataset

In [None]:
# Memory management and system check
def clear_memory():
    """Run the garbage collector and, when a CUDA device is present, release cached GPU memory."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

clear_memory()

# Report which accelerator (if any) this run will use
if not torch.cuda.is_available():
    print("No GPU detected - using CPU (will be very slow)")
else:
    # total_memory is reported in bytes; convert to GiB for display
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {gpu_memory:.1f} GB")

## Data and Model Loading

In [None]:
# Read the evaluation CSV into a DataFrame and report its size and columns
test_data_path = 'PATH_TO_TRANSLATED_DATESET'
test_df = pd.read_csv(test_data_path)
print("Loaded {} test samples".format(len(test_df)))
print("Sample data structure: {}".format(test_df.columns.tolist()))

In [None]:
# Load fine-tuned model
print("Loading fine-tuned Qwen2.5-Coder model...")

# First try to load the fine-tuned model, fall back to the base model if that fails
try:
    from unsloth import FastLanguageModel

    # Try loading fine-tuned model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "qwen25_mbpp_finetuned",  # Fine-tuned model path
        max_seq_length = 2048,
        dtype = None,
        load_in_4bit = True,
    )
    FastLanguageModel.for_inference(model)
    print("Fine-tuned model loaded successfully!")

except Exception as load_error:
    # Catch Exception rather than using a bare `except:` so that a kernel
    # interrupt (KeyboardInterrupt) still stops the cell, and report the real
    # cause: a missing `unsloth` install or an out-of-memory error is not the
    # same thing as "model not found".
    print(f"⚠️  Fine-tuned model could not be loaded ({type(load_error).__name__}: {load_error}), loading base model...")
    pipe = pipeline(
        "text-generation",
        model="unsloth/Qwen2.5-Coder-14B-Instruct-bnb-4bit",
        trust_remote_code=True,
        device_map="auto" if torch.cuda.is_available() else None,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    model = None
    tokenizer = pipe.tokenizer

# Create pipeline function for unified interface
if model is not None:
    # Fine-tuned model inference
    def generate_code(prompt, **kwargs):
        """Generate a completion with the fine-tuned model.

        Args:
            prompt: Either a list of chat messages (dicts with "role" and
                "content"), which is rendered with the tokenizer's chat
                template, or a string. A string is tokenized as-is, exactly
                like the `text-generation` pipeline used in the fallback
                branch treats it. The callers in this notebook pass strings
                that were already rendered with `apply_chat_template`, so
                applying the template again would nest the whole conversation
                inside a second user turn.
            **kwargs: Optional `max_new_tokens`, `temperature`, `top_p` and
                `do_sample`; any other keyword is ignored.

        Returns:
            A one-element list `[{'generated_text': <completion>}]` containing
            only the newly generated text (the prompt is stripped).
        """
        if isinstance(prompt, list):
            input_ids = tokenizer.apply_chat_template(
                prompt,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            )
        else:
            # The string already carries the chat special tokens, so do not
            # let the tokenizer add another set.
            input_ids = tokenizer(
                prompt,
                return_tensors="pt",
                add_special_tokens=False
            ).input_ids

        # Follow the model's device instead of assuming "cuda".
        input_ids = input_ids.to(model.device)

        outputs = model.generate(
            input_ids=input_ids,
            # Single unpadded sequence: every position is a real token.
            attention_mask=torch.ones_like(input_ids),
            max_new_tokens=kwargs.get('max_new_tokens', 768),
            temperature=kwargs.get('temperature', 0.1),
            top_p=kwargs.get('top_p', 0.95),
            do_sample=kwargs.get('do_sample', True),
            pad_token_id=tokenizer.eos_token_id
        )

        # Drop the prompt tokens so only the completion is decoded.
        response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
        return [{'generated_text': response}]

    class _FineTunedPipeline:
        """Minimal stand-in for a `text-generation` pipeline: exposes `.tokenizer` and is callable."""

        def __init__(self, tokenizer):
            self.tokenizer = tokenizer

        def __call__(self, prompt, **kwargs):
            return generate_code(prompt, **kwargs)

    pipe = _FineTunedPipeline(tokenizer)
    print("Using fine-tuned model for generation")
else:
    print("Using base model pipeline for generation")

In [None]:
def format_prompt(example):
    """Build the chat-formatted generation prompt for one test example.

    Args:
        example: Mapping with the keys 'instruction' (problem statement) and
            'test_list' (the test cases; inserted into the prompt verbatim).

    Returns:
        The prompt string produced by `pipe.tokenizer.apply_chat_template`
        (system message + user message + generation prompt, not tokenized).

    The expected function name is taken from the text after "Example:" in the
    instruction. When that is absent, it is taken from the first
    `assert name(` found in the test cases, so the prompt does not announce
    the placeholder "unknown_function" when the real name is available.
    """
    instruction = example['instruction']
    test_list = example['test_list']

    # Parse function name from instruction
    function_name = "unknown_function"
    if "Example:" in instruction:
        example_part = instruction.split("Example:")[1].strip()
        func_match = re.search(r'(\w+)\s*\(', example_part)
        if func_match:
            function_name = func_match.group(1)

    # Fall back to the test assertions, which call the function under test
    if function_name == "unknown_function":
        test_match = re.search(r'assert\s+(\w+)\s*\(', str(test_list))
        if test_match:
            function_name = test_match.group(1)

    # Enhanced system message with algorithm-specific guidance
    system_message = """You are an expert Python programmer specializing in algorithmic problem solving. Your task is to generate clean, efficient, and correct Python code that passes all given test cases.

CRITICAL REQUIREMENTS:
1. Analyze the problem description carefully and identify algorithm type (DP, graph, string, math, etc.)
2. Study the test cases to understand input/output patterns, data types, and edge cases
3. Write clean, readable Python code with proper error handling
4. Ensure your solution passes ALL test cases exactly
5. Use appropriate algorithms and data structures for efficiency
6. Handle edge cases like empty inputs, None values, boundary conditions, negative numbers
7. Follow Python best practices and coding standards
8. Import necessary modules at the beginning if needed

RESPONSE FORMAT:
- Provide ONLY the Python code implementation
- Do NOT include explanations, comments about the approach, or markdown formatting
- Do NOT wrap code in backticks or code blocks
- Write complete, executable functions that solve the problem
- Start with imports if needed (math, re, collections, itertools, etc.)

ALGORITHM-SPECIFIC PATTERNS:

Dynamic Programming:
def dp_problem(n):
    dp = [0] * (n + 1)
    dp[0] = base_case
    for i in range(1, n + 1):
        dp[i] = recurrence_relation
    return dp[n]

String Processing:
def string_problem(s):
    if not s:  # Handle empty string
        return default_value
    # Process string with appropriate method
    return result

Mathematical:
import math
def math_problem(n):
    if n <= 0:  # Handle edge cases
        return base_case
    # Use math functions efficiently
    return result

EXAMPLES OF CORRECT RESPONSES:

Problem: Write a function to find the first repeated character in a string.
Test Cases: assert first_repeated_char("abcabc") == "a"

Correct Response:
def first_repeated_char(s):
    seen = set()
    for char in s:
        if char in seen:
            return char
        seen.add(char)
    return "None"

Problem: Write a function to check if a number is prime.
Test Cases: assert prime_num(13) == True

Correct Response:
import math
def prime_num(n):
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    for i in range(3, int(math.sqrt(n)) + 1, 2):
        if n % i == 0:
            return False
    return True

Problem: Write a function to find similar elements in two tuples.
Test Cases: assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)

Correct Response:
def similar_elements(test_tup1, test_tup2):
    result = tuple(sorted(set(test_tup1) & set(test_tup2)))
    return result

Problem: Write a function for dynamic programming - Fibonacci.
Test Cases: assert fib(10) == 55

Correct Response:
def fib(n):
    if n <= 1:
        return n
    dp = [0] * (n + 1)
    dp[1] = 1
    for i in range(2, n + 1):
        dp[i] = dp[i-1] + dp[i-2]
    return dp[n]"""

    user_prompt = f"""Problem: {instruction}

Test Cases:
{test_list}

Expected Function Name: {function_name}

ANALYSIS CHECKLIST:
✓ Identify algorithm type (DP, graph, string manipulation, math, etc.)
✓ Check input/output data types from test cases
✓ Consider edge cases (empty, single element, boundary values)
✓ Choose optimal data structures and algorithms
✓ Ensure exact return format matches test expectations

Generate the Python code solution that passes ALL test cases."""

    # Format for chat template
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt}
    ]

    # Apply chat template (returns text; tokenization happens at generation time)
    formatted_prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    return formatted_prompt

In [None]:
# Prepare dataset for inference: keep only the fields the prompt builder needs
instructions_data = [
    {
        'instruction': record['instruction'],
        'test_list': record['test_list'],
        'id': record['id']
    }
    for _, record in tqdm(test_df.iterrows(), desc="Preparing data", unit="row", total=len(test_df))
]

# Render one prompt per example, keeping the ids in the same order
formatted_prompts = []
ids_list = []
for entry in tqdm(instructions_data, desc="Formatting prompts", unit="prompt"):
    formatted_prompts.append(format_prompt(entry))
    ids_list.append(entry['id'])

print(f"Formatted {len(formatted_prompts)} prompts")

# Create dataset
dataset_dict = dict(prompt=formatted_prompts, id=ids_list)
dataset = Dataset.from_dict(dataset_dict)
print(f"Dataset created with {len(dataset)} samples")

## Test Execution Engine
Advanced testing system that executes generated code against test cases with comprehensive error reporting and timeout protection.

In [None]:
def timeout_handler(signum, frame):
    """SIGALRM handler: abort the code currently running by raising TimeoutError."""
    raise TimeoutError("Test execution timed out")

def execute_and_test_code(code, test_cases_str, timeout=15):
    """Execute generated code and run its test cases, stopping at the first failure.

    Args:
        code: Python source to execute (expected to define the function under test).
        test_cases_str: The test cases, either as the string repr of a list of
            `assert ...` statements or as an already-parsed list/tuple of them.
        timeout: Seconds allowed for the definition phase and for each test
            case. Enforced with SIGALRM, so it only applies where that signal
            exists and when called from the main thread.

    Returns:
        (success, summary, detailed_results). `success` is True when every test
        passed. `summary` is "Passed X/Y tests" or a description of the
        parsing/definition error. `detailed_results` has one dict per executed
        test with the keys 'test_case', 'status' (PASSED, TIMEOUT,
        ASSERTION_FAILED or RUNTIME_ERROR), 'error' and the 1-based 'index';
        it is empty when the tests could not be run at all.

    NOTE(security): this runs model-generated code with `exec` inside the
    notebook process; only use it in an environment where that is acceptable.
    The installed SIGALRM handler is left in place after the call.
    """

    # Clean code - markdown fences are intentionally NOT stripped here
    clean_code = code.strip()

    # Parse test cases (accept an already-parsed sequence as well)
    if isinstance(test_cases_str, (list, tuple)):
        test_cases = list(test_cases_str)
    else:
        try:
            test_cases = ast.literal_eval(test_cases_str)
        except Exception as e:
            return False, f"Test parsing error: {e}", []
        if isinstance(test_cases, str):
            # A single test given as one string must not be iterated per character
            test_cases = [test_cases]

    # SIGALRM does not exist on every platform (e.g. Windows); run without a
    # timeout there instead of failing every sample.
    has_alarm = hasattr(signal, "SIGALRM")

    # Execute the function definition
    namespace = {}
    try:
        if has_alarm:
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)
        exec(clean_code, namespace)
    except TimeoutError:
        print(f"Code execution timeout ({timeout}s)")
        return False, f"Code execution timeout ({timeout}s)", []
    except SyntaxError as e:
        return False, f"Syntax error: {e}", []
    except (Exception, SystemExit) as e:
        # SystemExit: generated code calling exit()/sys.exit() must not stop the run
        return False, f"Runtime error in function definition: {type(e).__name__}: {e}", []
    finally:
        # Always disarm: otherwise a failed definition leaves the alarm pending
        # and it fires `timeout` seconds later in unrelated code.
        if has_alarm:
            signal.alarm(0)

    # Run each test case
    results = []
    passed_count = 0

    for i, test_case in enumerate(test_cases):
        try:
            if has_alarm:
                signal.alarm(timeout)

            # Execute test case
            exec(test_case, namespace)

            if has_alarm:
                signal.alarm(0)
            results.append({
                'test_case': test_case,
                'status': 'PASSED',
                'error': None,
                'index': i + 1
            })
            passed_count += 1

        except TimeoutError:
            print(f"Test case {i+1} timeout ({timeout}s)")
            results.append({
                'test_case': test_case,
                'status': 'TIMEOUT',
                'error': f'Test execution timeout ({timeout}s)',
                'index': i + 1
            })
            break  # Stop on timeout

        except AssertionError as e:
            results.append({
                'test_case': test_case,
                'status': 'ASSERTION_FAILED',
                'error': str(e) if str(e) else 'Assertion failed - expected vs actual values differ',
                'index': i + 1
            })
            break  # Stop on first failure

        except (Exception, SystemExit) as e:
            results.append({
                'test_case': test_case,
                'status': 'RUNTIME_ERROR',
                # Include the exception type: str(KeyError('a')) alone is just "'a'"
                'error': f"{type(e).__name__}: {e}",
                'index': i + 1
            })
            break  # Stop on first error
        finally:
            if has_alarm:
                signal.alarm(0)

    success = passed_count == len(test_cases)
    summary = f"Passed {passed_count}/{len(test_cases)} tests"

    return success, summary, results

def create_enhanced_error_feedback(test_results, instruction, test_cases_str, previous_attempts):
    """Build the retry feedback text describing the first failing test.

    Args:
        test_results: List of per-test dicts with the keys 'test_case',
            'status', 'error' and 'index'.
        instruction: Original task description, echoed back in the feedback.
        test_cases_str: The test cases (string repr of a list, or a
            list/tuple). Used to report the real number of tests, because
            `test_results` may contain only the tests that ran before the
            first failure.
        previous_attempts: Code strings generated so far; when there is more
            than one, a short comparison of the attempts is included.

    Returns:
        The feedback string. A fixed short message is returned when
        `test_results` is empty or contains no failure.
    """

    if not test_results:
        return "\nNo test results available. Please check if the function was defined correctly."

    # Find the first failed test
    failed_test = next((r for r in test_results if r['status'] != 'PASSED'), None)

    if not failed_test:
        return "\nAll tests passed!"

    # Total number of tests: prefer the declared test list over the executed
    # results; fall back to the executed count if the list cannot be read.
    executed_tests = len([r for r in test_results if 'index' in r])
    total_tests = executed_tests
    try:
        declared = ast.literal_eval(test_cases_str) if isinstance(test_cases_str, str) else test_cases_str
        if isinstance(declared, (list, tuple)) and len(declared) >= executed_tests:
            total_tests = len(declared)
    except Exception:
        # Best effort only: an unparsable test list must not block the feedback
        pass

    # Analyze previous attempts for patterns
    attempt_analysis = ""
    if len(previous_attempts) > 1:
        attempt_analysis = f"""

# PATTERN ANALYSIS FROM {len(previous_attempts)} ATTEMPTS:
- Attempt 1: {len(previous_attempts[0])} characters
- Latest: {len(previous_attempts[-1])} characters
- Different approaches tried: {len(set(attempt[:50] for attempt in previous_attempts))}

# AVOID REPEATING: The same logic pattern has failed multiple times. Try a fundamentally different approach."""

    # Enhanced error analysis based on error type
    specific_guidance = ""
    if failed_test['status'] == 'ASSERTION_FAILED':
        specific_guidance = """
## ASSERTION FAILURE GUIDANCE:
- Check return data type (int, str, list, tuple, bool)
- Verify exact return format matches expected output
- Consider sorting if order doesn't matter
- Handle empty cases explicitly"""
    elif failed_test['status'] == 'RUNTIME_ERROR':
        # Guard against a missing message so the keyword checks cannot fail
        error_msg = str(failed_test['error'] or '').lower()
        if 'index' in error_msg or 'list' in error_msg:
            specific_guidance = """
## INDEX/LIST ERROR GUIDANCE:
- Check for empty list/string handling
- Verify array bounds (0 to len-1)
- Handle edge case when input is empty"""
        elif 'key' in error_msg or 'dict' in error_msg:
            specific_guidance = """
## DICTIONARY ERROR GUIDANCE:
- Check if key exists before accessing
- Use .get() method with default values
- Initialize dictionaries properly"""
        elif 'attribute' in error_msg:
            specific_guidance = """
## ATTRIBUTE ERROR GUIDANCE:
- Check object types before method calls
- Verify variable is initialized
- Import necessary modules"""

    # Create comprehensive error feedback
    feedback = f"""

## PREVIOUS ATTEMPT FAILED - ADVANCED DEBUGGING:

- Error Type: {failed_test['status']}
- Error Message: {failed_test['error']}
- Failing Test Case: {failed_test['test_case']}
- Failed at Test #{failed_test['index']} out of {total_tests}

{specific_guidance}

{attempt_analysis}

🔧 SYSTEMATIC DEBUGGING APPROACH:
1. ANALYZE INPUT/OUTPUT: What data types and patterns do test cases show?
2. EDGE CASE CHECK: Empty inputs, single elements, boundary values
3. ALGORITHM CHOICE: Is this DP, greedy, two-pointer, sliding window, etc.?
4. IMPLEMENTATION: Step through the failing test case manually
5. IMPORTS: Add math, re, collections, itertools if needed

# Original Task: {instruction}

# CRITICAL SUCCESS FACTORS:
- Function signature must match test case exactly
- Return type must match expected output precisely  
- Handle ALL edge cases shown in test patterns
- Use efficient algorithm for the problem type

## GENERATE A COMPLETELY NEW APPROACH - Previous attempts failed for a reason."""

    return feedback

def create_error_feedback(test_results, instruction, test_cases_str):
    """Backward-compatible wrapper: enhanced feedback with an empty attempt history."""
    no_previous_attempts = []
    return create_enhanced_error_feedback(
        test_results,
        instruction,
        test_cases_str,
        no_previous_attempts,
    )

def get_function_name_from_test(test_cases_str):
    """Return the name called in the first test's `assert name(...)`, or "function" if none is found.

    Any problem reading the test list (unparsable string, empty list,
    non-string first entry) yields the fallback instead of raising.
    """
    fallback = "function"
    try:
        parsed_cases = ast.literal_eval(test_cases_str)
        if not parsed_cases:
            return fallback
        # Only the first test case is inspected
        hit = re.search(r'assert\s+(\w+)\s*\(', parsed_cases[0])
    except Exception:
        return fallback
    return hit.group(1) if hit else fallback

## Enhanced Generation with Test-Driven Refinement

In [None]:
def generate_with_testing(prompt, test_cases_str, instruction, sample_id, max_attempts=3):
    """Generate code, run it against the tests, and retry with error feedback.

    Args:
        prompt: Chat-formatted prompt string for the first attempt.
        test_cases_str: Test cases as stored in the dataset (string repr of a
            list of assert statements).
        instruction: Original task description, reused in retry prompts.
        sample_id: Identifier used only in the progress messages.
        max_attempts: Maximum number of generations. The sampling temperature
            follows the schedule 0.1, 0.3, 0.5 and stays at the last value for
            any further attempts.

    Returns:
        (generated_code, full_success, attempts_used, best_score,
        input_tokens, output_tokens). On full success the passing code and a
        score of 1.0 are returned. Otherwise the attempt with the highest pass
        rate is returned (or "def placeholder(): pass" if no generation
        succeeded) with `attempts_used == max_attempts`. The token counts are
        summed over all completed generations, counted with `pipe.tokenizer`.
    """

    # Rising temperature per attempt; clamped so max_attempts > 3 does not raise IndexError
    temperature_schedule = [0.1, 0.3, 0.5]

    # System prompt for retries (loop-invariant, so built once)
    system_content = """You are an expert Python programmer specializing in algorithmic problem solving. Your task is to generate clean, efficient, and correct Python code that passes all given test cases.

CRITICAL REQUIREMENTS:
1. Analyze the problem description carefully and identify the algorithm type (DP, graph, string manipulation, math, etc.)
2. Study the test cases to understand input/output patterns, data types, and edge cases
3. Write clean, readable Python code with proper error handling
4. Ensure your solution passes ALL test cases exactly
5. Use appropriate algorithms and data structures for efficiency
6. Handle edge cases like empty inputs, None values, boundary conditions, negative numbers
7. Follow Python best practices and coding standards
8. Import necessary modules (math, re, collections, itertools, etc.) at the beginning

RESPONSE FORMAT:
- Provide ONLY the Python code implementation
- Do NOT include explanations, comments about the approach, or markdown formatting
- Do NOT wrap code in backticks or code blocks
- Write complete, executable functions that solve the problem
- Start with necessary imports if needed

ALGORITHM-SPECIFIC GUIDANCE:
- Dynamic Programming: Use memoization or tabulation appropriately
- String Processing: Consider edge cases with empty strings, special characters
- Mathematical: Handle negative numbers, zero, overflow conditions
- Data Structures: Choose optimal structures (set, dict, list, deque, etc.)
- Graph Problems: Consider connectivity, cycles, traversal methods

Your code will be tested against the provided test cases, so accuracy is paramount."""

    def _count_tokens(text):
        # Token accounting is a metric only; never let it fail a generation.
        try:
            return len(pipe.tokenizer(text, add_special_tokens=False)["input_ids"])
        except Exception:
            return 0

    # Number of declared tests. The executor stops at the first failure, so
    # the length of its result list understates the total.
    try:
        declared = ast.literal_eval(test_cases_str) if isinstance(test_cases_str, str) else test_cases_str
        declared_tests = len(declared) if isinstance(declared, (list, tuple)) else 0
    except Exception:
        declared_tests = 0

    best_code = None
    best_score = 0
    all_attempts = []
    total_input_tokens = 0
    total_output_tokens = 0

    func_name = get_function_name_from_test(test_cases_str)

    for attempt in range(max_attempts):
        temp = temperature_schedule[min(attempt, len(temperature_schedule) - 1)]

        try:
            result = pipe(
                prompt,
                max_new_tokens=768,  # Increased for complex problems
                temperature=temp,
                top_p=0.95,
                do_sample=True,
                return_full_text=False,
                pad_token_id=pipe.tokenizer.eos_token_id
            )

            raw_output = result[0]['generated_text']
            total_input_tokens += _count_tokens(prompt)
            total_output_tokens += _count_tokens(raw_output)

            generated_code = raw_output.strip()
            all_attempts.append(generated_code)

            success, summary, test_results = execute_and_test_code(
                generated_code, test_cases_str
            )

            if test_results:
                passed_tests = sum(1 for r in test_results if r['status'] == 'PASSED')
                executed_tests = len([r for r in test_results if 'index' in r])
                total_tests = max(declared_tests, executed_tests)
                current_score = passed_tests / total_tests if total_tests > 0 else 0
            else:
                current_score = 0

            # Keep track of best attempt
            if current_score > best_score or (current_score == best_score and best_code is None):
                best_code = generated_code
                best_score = current_score

            if success:
                print(f"ID {sample_id} - Attempt {attempt + 1}: {func_name}() - All tests passed! 🎉")
                return generated_code, True, attempt + 1, 1.0, total_input_tokens, total_output_tokens

            else:
                print(f"ID {sample_id} - Attempt {attempt + 1}: {func_name}() - {summary} (temp={temp})")

                # Enhanced error feedback for next attempt (if not last attempt)
                if attempt < max_attempts - 1:
                    error_feedback = create_enhanced_error_feedback(test_results, instruction, test_cases_str, all_attempts)

                    # With no per-test results the code failed before any test
                    # ran (syntax/definition error); pass that error on too.
                    if not test_results:
                        error_feedback += f"\nExecution error: {summary}"

                    # Enhanced user prompt with error feedback and previous attempts analysis
                    enhanced_user_prompt = f"""Problem: {instruction}

Test Cases:
{test_cases_str}

{error_feedback}

PREVIOUS ATTEMPTS ANALYSIS:
{len(all_attempts)} attempts made. Learn from these patterns to avoid repeating mistakes.

Generate the Python code solution that passes ALL test cases."""

                    # Create retry prompt
                    retry_messages = [
                        {"role": "system", "content": system_content},
                        {"role": "user", "content": enhanced_user_prompt}
                    ]

                    prompt = pipe.tokenizer.apply_chat_template(
                        retry_messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )

        except Exception as e:
            # A failed generation uses up the attempt; the next one reuses the current prompt
            print(f"ID {sample_id} - Attempt {attempt + 1} generation failed: {e}")
            continue

    # Return best attempt
    print(f"ID {sample_id}: Used best attempt - {func_name}() with {best_score:.1%} pass rate")

    return best_code or "def placeholder(): pass", False, max_attempts, best_score, total_input_tokens, total_output_tokens

In [None]:
# Maximum generations per sample; also sizes the per-attempt success histogram
MAX_ATTEMPTS = 3

responses = []   # exactly one entry per dataset row, in dataset order
failed_ids = []  # ids whose generation raised an unexpected error

success_stats = {
    'full_success': 0,       # 100% pass rate
    'partial_success': 0,    # Some tests passed but not all
    'complete_failure': 0,   # No tests passed or execution failed
    'total_attempts': 0,     # Total generation attempts across all samples
    'total_samples': 0,
    # These two keys must exist before the loop: without them the `+=` below
    # raises KeyError for every sample.
    'total_input_tokens': 0,
    'total_output_tokens': 0,
    'attempt_distribution': [0] * MAX_ATTEMPTS  # Track success by attempt number
}

print("Starting ENHANCED Test-Driven Code Generation with Fine-tuned Model...")

for idx in tqdm(range(len(dataset)), desc="Enhanced generation"):
    sample_id = None  # defined up front so the error handler can always report it
    try:
        # Get sample data
        sample = dataset[idx]
        prompt = sample['prompt']
        sample_id = sample['id']

        # Get corresponding test cases and instruction from original data
        original_row = test_df[test_df['id'] == sample_id].iloc[0]
        test_cases_str = original_row['test_list']
        instruction = original_row['instruction']

        # Generate with enhanced iterative testing and refinement
        generated_code, full_success, attempts_used, final_score, input_tokens, output_tokens = generate_with_testing(
            prompt, test_cases_str, instruction, sample_id, max_attempts=MAX_ATTEMPTS
        )

    except Exception as e:
        # Only lookup/generation is guarded, so the placeholder below is the
        # single response recorded for this sample.
        print(f"Complete failure for ID {sample_id}: {e}")
        failed_ids.append(sample_id)
        responses.append("def placeholder(): pass")
        success_stats['complete_failure'] += 1
        success_stats['total_samples'] += 1
        continue

    responses.append(generated_code)

    # Update enhanced statistics
    success_stats['total_attempts'] += attempts_used
    success_stats['total_samples'] += 1
    success_stats['total_input_tokens'] += input_tokens
    success_stats['total_output_tokens'] += output_tokens

    if full_success:
        success_stats['full_success'] += 1
        # Track which attempt succeeded (bounded by the histogram size)
        if 1 <= attempts_used <= len(success_stats['attempt_distribution']):
            success_stats['attempt_distribution'][attempts_used - 1] += 1
    elif final_score > 0:
        success_stats['partial_success'] += 1
    else:
        success_stats['complete_failure'] += 1

    # Memory management - clear every 15 samples (more frequent)
    if (idx + 1) % 15 == 0:
        clear_memory()
        print(f"\nMemory cleared after {idx + 1} samples")

    # Enhanced progress update every 50 samples
    if (idx + 1) % 50 == 0:
        current_success_rate = success_stats['full_success'] / success_stats['total_samples']
        avg_attempts = success_stats['total_attempts'] / success_stats['total_samples']
        avg_input_tokens = success_stats['total_input_tokens'] / success_stats['total_samples']
        avg_output_tokens = success_stats['total_output_tokens'] / success_stats['total_samples']

        print(f"\nProgress Update after {idx + 1} samples:")
        print(f"Success Rate: {current_success_rate:.1%} ({success_stats['full_success']}/{success_stats['total_samples']})")
        print(f"Avg Attempts: {avg_attempts:.1f}")
        print(f"Avg Input Tokens: {avg_input_tokens:.0f}")
        print(f"Avg Output Tokens: {avg_output_tokens:.0f}")
        print(f"Expected Final: {current_success_rate:.1%}")


print("ENHANCED GENERATION WITH FINE-TUNED MODEL COMPLETED!")

In [None]:
import pickle

# Keep a binary copy of the raw generations so the run can be recovered later
with open("raw_responses_backup.pkl", mode="wb") as backup_file:
    pickle.dump(responses, backup_file)
print("Successfully saved raw responses to 'raw_responses_backup.pkl'")

In [None]:
# Write the test DataFrame next to the responses as a JSON list of records
test_df.to_json(
    "test_df.json",
    orient="records",
    indent=2,
)
print("Saved test_df to 'test_df.json'")