In [None]:
!pip install -q -U transformers accelerate bitsandbytes codecarbon tqdm

In [None]:
# =======================================================================
#       AUTOMATED TEST GENERATION USING AP_V0, AP_V1, AP_V2, AP_V3 PROMPTS
# =======================================================================

import os
import ast
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
from codecarbon import EmissionsTracker

# --- Configuration Section ---
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevent tokenizer deadlocks

# Input and output locations (change these together when switching model or prompt version)
CODE_DIR = "HumanEval_Integrated_Dataset"  # Directory holding the HumanEval_<i>_code.py inputs
OUTPUT_DIR = "APV0_Phi-3.5-mini-instruct"  # Output directory to save the generated test scripts
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"  # Change the model name here
EMISSIONS_FILE_PATH = "APV0_Phi-3.5-mini-instruct.csv"  # To save the .csv file generated by CodeCarbon

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Authenticate with Hugging Face.
# Prefer the HF_TOKEN environment variable so a real token never has to be
# saved inside the notebook; the literal below is only a placeholder fallback
# and must be replaced if HF_TOKEN is not set.
login(token=os.environ.get("HF_TOKEN", "hf_XXXXXXXXXXXXXXXXXXXXX"))

# --- Model Loading Section ---
# Load the model in 8-bit quantized mode for efficiency on T4 GPU
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

print(f"Loading 8-bit quantized model '{MODEL_ID}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto"
)
print("✅ Model loaded successfully!")

# --- Collect all code files ---
# Paths are constructed for indices 0..163, not discovered on disk; nothing
# here checks that the files actually exist.
file_indices = range(164)
code_files = [os.path.join(CODE_DIR, f"HumanEval_{i}_code.py") for i in file_indices]
print(f"Found {len(code_files)} code files to process.")

# --- Utility Function: Extract Main Function Name using AST ---
def extract_function_name(code_text):
    """
    Return the name of the last top-level ``def`` found in ``code_text``.

    Only direct children of the module are inspected, so functions nested
    inside other functions or defined as class methods are not considered.
    If the text cannot be parsed, or contains no top-level ``def``, the
    placeholder string "unknown_function" is returned instead.
    """
    fallback = "unknown_function"

    try:
        module = ast.parse(code_text)
    except Exception:
        # Best-effort: unparsable input falls back to the placeholder name.
        return fallback

    last_name = fallback
    for statement in module.body:
        if isinstance(statement, ast.FunctionDef):
            # Keep overwriting so the final value is the last definition seen.
            last_name = statement.name
    return last_name

# --- Batch Processing Setup ---
BATCH_SIZE = 5
num_batches = (len(code_files) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

# Each batch gets its own tracker, so one tracker.stop() value covers only that
# batch. Keep a running total so the final report reflects the whole run.
total_emissions = 0.0
emissions = None  # emissions of the most recently finished batch

# --- Main Generation Loop ---
for i in tqdm(range(num_batches), desc="Processing batches"):
    # Initialize CodeCarbon emissions tracker (one per batch, all writing to the same CSV path)
    tracker = EmissionsTracker(
        project_name=f"{MODEL_ID.replace('/', '_')}_batch_{i}_Test_Generation",
        # dirname() is "" for a bare filename; fall back to the current directory.
        output_dir=os.path.dirname(EMISSIONS_FILE_PATH) or ".",
        output_file=os.path.basename(EMISSIONS_FILE_PATH)
    )
    tracker.start()

    try:
        start_index = i * BATCH_SIZE
        end_index = min(start_index + BATCH_SIZE, len(code_files))
        batch_files = code_files[start_index:end_index]

        # Process each file in current batch
        for file_path in batch_files:
            if not os.path.exists(file_path):
                print(f"⚠️ Skipping missing file: {file_path}")
                continue

            try:
                # --- Step 1: Read the source code ---
                with open(file_path, "r", encoding="utf-8") as f:
                    code_content = f.read()

                # --- Step 2: Extract module and function names ---
                module_name = os.path.basename(file_path).replace(".py", "")
                function_name = extract_function_name(code_content)

                # --- Step 3: Prompt for Test Generation (AP_V0 - Baseline) ---
                # Change the prompt version here for APV1, APV2 and APV3.
                # Feature 0 ➜ Simple runnable unittest generation
                messages = [
                    {
                        "role": "user",
                        "content": f"""Generate a unittest test script for the following Python function.
The script should fully test the function and be runnable directly.

### Output Formatting
1. Start with: import unittest
2. Include: from {module_name} import {function_name}
3. End with:
if __name__ == '__main__':
    unittest.main()

Function:
{code_content}
"""
                    }
                ]

                # --- Step 4: Tokenize and Generate Test Script ---
                # return_dict=True yields input_ids together with the attention
                # mask, so generate() receives both instead of a bare id tensor.
                model_inputs = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True,
                    return_tensors="pt",
                    return_dict=True,
                ).to(model.device)
                prompt_length = model_inputs["input_ids"].shape[-1]

                generated_ids = model.generate(
                    **model_inputs,
                    max_new_tokens=1024,
                    do_sample=True,
                    temperature=0.2,
                )

                # generate() returns prompt + completion for this model type;
                # decode only the newly generated tokens so the saved script
                # does not begin with the echoed prompt.
                new_token_ids = generated_ids[0][prompt_length:]
                generated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

                # --- Step 5: Clean and Save Output ---
                # Drop Markdown code-fence markers the model may wrap the script in.
                generated_test = generated_text.strip().replace("```python", "").replace("```", "").strip()
                task_id = os.path.basename(file_path).replace("_code.py", "")
                output_filename = f"test_{task_id}_test.py"
                output_path = os.path.join(OUTPUT_DIR, output_filename)

                with open(output_path, "w", encoding="utf-8") as test_file:
                    test_file.write(generated_test)

            except Exception as e:
                # Per-file boundary: report the failure and move on to the next file.
                print(f"❌ Error processing {file_path}: {e}")
                continue
    finally:
        # Always stop the tracker so the batch's measurements are flushed,
        # even if the batch is interrupted.
        emissions = tracker.stop()
        if emissions is not None:
            total_emissions += emissions

print("\n✅ Test generation complete!")
print(f"Emissions (kg CO2eq): {total_emissions}")


Loading 8-bit quantized model 'microsoft/Phi-3.5-mini-instruct'...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

✅ Model loaded successfully!
Found 164 code files to process.


[codecarbon INFO @ 20:58:46] [setup] RAM Tracking...
[codecarbon INFO @ 20:58:46] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 20:58:47] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 20:58:47] [setup] GPU Tracking...
[codecarbon INFO @ 20:58:47] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 20:58:47] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 20:58:47] >>> Tracker's metadata:
[codecarbon INFO @ 20:58:47]   Platform system: Linux-6.6.97+-x86_64-with-glibc2.35
[codecarbon INFO @ 20:58:47]   Python version: 3.12.11
[codecarbon INFO @ 20:58:47]   CodeCarbon version: 3.0.6
[codecarbon INFO @ 20:58:47]   Available RAM : 12.671 GB
[codecarbon INFO


✅ Test generation complete!
Emissions (kg CO2eq): 0.0006609601706131696



