## Test case generation with Large Language Models

In this series of exercises, we will investigate the use of LLM to generate test cases.

### Step 1: Our reference code

As opposed to the previous experience with code generation - where we had valid test cases - we assume this time that we have valid solutions for given software requirements. Our task now is to generate test cases for valid code.



In [1]:
#the same code is saved in the python script function_01.py

original_function="""def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] > max_time:
            disqualified = True

    return disqualified"""


file_path = "function_01.py"

with open(file_path, 'w') as file:
    file.write(original_function)



def racer_disqualified(times, winner_times, n_penalties, penalties):
    """
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    """
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] > max_time:
            disqualified = True

    return disqualified

### Step 2: Define some pytest test cases

We now setup an environment to run test cases and obtain the coverage of test cases. To start, we define a couple of test cases with the PyTest library.

In [10]:
import pytest
import ipytest

ipytest.clean()

ipytest.autoconfig()



def test_valid_disqualified_for_tot_penalty():
    times = [0, 1, 3]
    winner_times = [0, 1, 2]
    penalties = [20, 30, 60]
    n_penalties = 3
    assert racer_disqualified(times, winner_times, n_penalties, penalties)


def test_valid_disqualified_for_single_penalty():
    times = [0, 1, 3]
    winner_times = [0, 1, 2]
    penalties = [101]
    n_penalties = 1
    assert racer_disqualified(times, winner_times, n_penalties, penalties)


def test_valid_disqualified_for_excessive_time():
    times = [0, 1, 6]
    winner_times = [0, 1, 3]
    penalties = [10]
    n_penalties = 1
    assert racer_disqualified(times, winner_times, n_penalties, penalties)


def test_valid_NOT_disqualified_for_tot_penalty():
    times = [0, 1, 3]
    winner_times = [0, 1, 2]
    penalties = [20, 30, 40]
    n_penalties = 3
    assert not racer_disqualified(times, winner_times, n_penalties, penalties)




def run_tests():
    ipytest.run('-vv')  

# Running the tests with ipytests
run_tests()



platform win32 -- Python 3.12.0, pytest-9.0.1, pluggy-1.6.0 -- c:\Users\Utente\OneDrive - Politecnico di Torino\Universita\Magistrale\Secondo Anno\LLM\LABS\venv\Scripts\python.exe
cachedir: .pytest_cache
rootdir: c:\Users\Utente\OneDrive - Politecnico di Torino\Universita\Magistrale\Secondo Anno\LLM\LABS\LAB10
plugins: anyio-4.11.0
[1mcollecting ... [0mcollected 4 items

t_e4cd07f825ff457eae222e13ac514143.py::test_valid_disqualified_for_tot_penalty [32mPASSED[0m[32m        [ 25%][0m
t_e4cd07f825ff457eae222e13ac514143.py::test_valid_disqualified_for_single_penalty [32mPASSED[0m[32m     [ 50%][0m
t_e4cd07f825ff457eae222e13ac514143.py::test_valid_disqualified_for_excessive_time [32mPASSED[0m[32m     [ 75%][0m
t_e4cd07f825ff457eae222e13ac514143.py::test_valid_NOT_disqualified_for_tot_penalty [32mPASSED[0m[32m    [100%][0m



### Step 3: Computing the pass rate

The first objective of our analysis is computing the pass rate of the test cases.

The pass rate for a test suite is defined as the ratio between the passing test cases and all the test cases executed.

Notice that this ratio is computed in the same way as the Functional Correctness when you are comparing generated code against an existing test suite, but there is a subtle difference in what we are measuring: 
- when we compute functional correctness, we have a correct test suite, and we are verifying if the code complies to requirements by executing the test cases.
- when we compute the pass rate, we have correct code, and we are verifying if the test cases comply to the requirements by executing them against the code.

For now, we are defining the test cases manually: we make sure that the pass rate is 100%.

In [17]:
import re

def parse_test_result(line):
    errors = re.search(r"(\d+)\s+errors?", line)
    failed = re.search(r"(\d+)\s+failed", line)
    passed = re.search(r"(\d+)\s+passed", line)

    return int(errors.group(1)) if errors else 0 , int(failed.group(1)) if failed else 0, int(passed.group(1)) if passed else 0

In [22]:
import pytest
import io
import sys
import subprocess


test_file = 'test_function_01.py'

result = subprocess.run(
    ['pytest', test_file, '--disable-warnings', '--tb=short', '-q', '--color=no'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)

output_lines = result.stdout.split("\n")
summary_line = next((line for line in output_lines if 'passed' in line or 'failed' in line or 'error' in line))


errors, failures, passes = parse_test_result(summary_line)


print(f"# Passed: {passes}")
print(f"# Failed: {failures}")
print(f"# Errors: {errors}")

#compute the pass rate of the test cases
pass_rate = passes / (errors + passes + failures)
print(f"Pass Rate: {pass_rate}")

# Passed: 4
# Failed: 0
# Errors: 0
Pass Rate: 1.0


In [32]:
def run_test_cases(test_file):
    result = subprocess.run(
        ['pytest', test_file, '--disable-warnings', '--tb=short', '-q', '--color=no'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
        )

    output_lines = result.stdout.split("\n")
    summary_line = next((line for line in output_lines if 'passed' in line or 'failed' in line or 'error' in line))

    return parse_test_result(summary_line)


### Step 4: Compute the coverage

To compute the coverage of a test suite over a function or a set of functions, we can use the coverage library.

pip install pytest-cov

Once we have the coverage module installed, it is possible to launch the coverage by launching the following command line instructions:
- coverage run -m pytest test_function_name
- coverage report -m

In this code section, define multiple subprocess runs to obtain the results of the coverage computation inside a variable.

In [36]:
def run_coverage(test_file):
    # Run the pytest coverage run command
    result = subprocess.run(
        ['coverage', 'run', '-m', 'pytest', test_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )


    # Run the pytest coverage report command
    result2 = subprocess.run(
        ['coverage', 'report', '-m', test_file],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )

    #define code to extract the coverage from the coverage report
    #1) find the line where the function is defined (the line will report the name of the file)
    line = re.search(rf"{test_file}\s+\d+\s+\d+\s+(\d+)%", result2.stdout)
    #2) extract the coverage
    coverage = int(line.group(1))
    return coverage

In [30]:
# Run the pytest coverage run command
result = subprocess.run(
    ['coverage', 'run', '-m', 'pytest', test_file],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)


# Run the pytest coverage report command
result2 = subprocess.run(
    ['coverage', 'report', '-m', test_file],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    text=True
)

#define code to extract the coverage from the coverage report
#1) find the line where the function is defined (the line will report the name of the file)
line = re.search(rf"{test_file}\s+\d+\s+\d+\s+(\d+)%", result2.stdout)
#2) extract the coverage
coverage = int(line.group(1))

print(f"Coverage: {coverage}%")

Coverage: 100%


### Step 5: Introducing mutations

To try out mutation testing, produce a set of variants of the function by changing operators and values. Save all these variants in a dictionary of mutations by modifying the text of the function like in the example below.

Remeber to introduce a single mutant in each mutated version of the function.

**Note**: several tools exist to automate mutation. You can refer to the libraries mutatest and mutpy to generate automatic mutations for test cases written with pytest. In this example, we will introduce mutations manually.

In [None]:

#in this mutant, the check "if penalty > 100" is changed to "if penalty < 100"

mutant1 = """def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty < 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] > max_time:
            disqualified = True

    return disqualified"""


#in this mutant, the check "if penalty > max_time" is changed to "if penalty < max_time"
mutant2 = """
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] < max_time:
            disqualified = True

    return disqualified
"""

#in this mutant, the initialization disqualified=False is changed to disqualified=True
mutant3 = """
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = True
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] > max_time:
            disqualified = True

    return disqualified
"""

#in this mutant, the condition     if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
# is changed  to   if (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):


mutant4 = """
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 1.5
        if times[i] > max_time:
            disqualified = True

    return disqualified
"""


#in this mutant, the condition  operation max_time = winner_times[i] * 1.5 is changed to max_time = winner_times[i] * 2


mutant5 = """ 
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 2
        if times[i] > max_time:
            disqualified = True

    return disqualified
""" 
mutants = [mutant1, mutant2, mutant3, mutant4, mutant5]



### Step 6: Calculating Mutation Score

Now cycle over the list of mutants. For every mutant, overwrite the function function_01.py and re-execute the test cases. For each mutant you can compute the following outcome:
- Mutant killed: one or more test cases failed
- Mutant survived: all test cases passed

At the end of the iteration over mutants, compute the mutation score:
- Mutation score = survived mutants / total number of mutants

In [37]:
def compute_mutation_score(file_path):
    #initialize killed mutants and survived mutants
    killed_mutants = 0
    survived_mutants = 0

    # Iterate over the list of mutants 

    for mutant in mutants:
        
        #overwrite the file with the function with each mutant
        with open(file_path, 'w') as file:
            file.write(mutant)
        
        #run the test cases and collect the number of passed tests
        errors, failures, passes = run_test_cases(test_file)

        #update the number of survived or killed mutants
        if errors == 0 and failures == 0:
            survived_mutants += 1
        else:
            killed_mutants += 1


        pass

    #compute the mutation score
    mutation_score = survived_mutants / len(mutants)
    return mutation_score

In [33]:
#define the path where to save the mutants

file_path = "function_01.py"


#initialize killed mutants and survived mutants
killed_mutants = 0
survived_mutants = 0

# Iterate over the list of mutants 

for mutant in mutants:
    
    #overwrite the file with the function with each mutant
    with open(file_path, 'w') as file:
        file.write(mutant)
    
    #run the test cases and collect the number of passed tests
    errors, failures, passes = run_test_cases(test_file)

    #update the number of survived or killed mutants
    if errors == 0 and failures == 0:
        survived_mutants += 1
    else:
        killed_mutants += 1


    pass

#compute the mutation score
mutation_score = survived_mutants / len(mutants)

print(f"Mutation score: {round(mutation_score*100, 2)}%")





Mutation score: 0.0%


### Step 7 : Generating tests with LLMs

This time, we will consider again at least two alternatives for test case generation:
- a model from HuggingFace, e.g., CodeLLAMA
- a chat engine, e.g., ChatGPT or Qwen2.5

With each engine, we will generate a new test file (e.g., test_function_01_gpt.py, and test_function_01_llama.py), and replicate the pass rate, coverage and mutation analysis performed before with pre-defined test cases.

In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gc

torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
qwen_chat = "Qwen/Qwen2.5-14B-Instruct"

model = AutoModelForCausalLM.from_pretrained(qwen_chat, dtype=torch.float16, load_in_4bit=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(qwen_chat)

messages = [
    {'role': 'system',
     'content': "you are a powerful AI assistant capable to generate test cases in python language basing on a given function, following the user's instructions."},
    {'role': 'user',
     'content': """
        give me exahustive test_cases for the following function:
 
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 2
        if times[i] > max_time:
            disqualified = True

    return disqualified

in the following format:

Example: "import pytest
from function_01 import racer_disqualified

# Test cases for disqualification due to total penalties > 100
def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20])

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41])

# Test cases for disqualification due to single penalty > 100
def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])"
    """}
    ]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
	return_dict=True,
	return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens = 500,
    temperature = 0.8, 
    num_return_sequences = 1,
)

qwen_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

try:
    del model
    del tokenizer
except:
    pass

gc.collect()
torch.cuda.empty_cache()

code_llama = "codellama/CodeLlama-13b-Instruct-hf"

model = AutoModelForCausalLM.from_pretrained(code_llama, dtype=torch.float16, load_in_4bit=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(code_llama)


messages = [
    {'role': 'system',
     'content': "you are a powerful AI assistant capable to generate test cases in python language basing on a given function, following the user's instructions."},
    {'role': 'user',
     'content': """
        give me exahustive test_cases for the following function:
 
def racer_disqualified(times, winner_times, n_penalties, penalties):
    \"""
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    \"""
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
        raise ValueError("times must be a list of three integers.")

    if not (isinstance(winner_times, list) and len(winner_times) == 3 and all(isinstance(wt, int) for wt in winner_times)):
        raise ValueError("winner_times must be a list of three integers.")

    if not isinstance(n_penalties, int):
        raise ValueError("n_penalties must be an integer.")

    if not (isinstance(penalties, list) and all(isinstance(p, int) for p in penalties)):
        raise ValueError("penalties must be a list of integers.")

    if n_penalties != len(penalties):
        raise ValueError("n_penalties must match the length of the penalties list.")

    disqualified = False
    tot_penalties = 0

    # Calculate total penalties and check for any excessive penalty
    for penalty in penalties:
        tot_penalties += penalty
        if penalty > 100:
            disqualified = True

    # Check for disqualification based on total penalties or number of penalties
    if tot_penalties > 100 or n_penalties > 5:
        disqualified = True

    # Check if any time exceeds 1.5 times the corresponding winner time
    for i in range(3):
        max_time = winner_times[i] * 2
        if times[i] > max_time:
            disqualified = True

    return disqualified

in the following format:

Example: "import pytest
from function_01 import racer_disqualified

# Test cases for disqualification due to total penalties > 100
def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20])

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41])

# Test cases for disqualification due to single penalty > 100
def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])"
    """}
    ]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
	return_dict=True,
	return_tensors="pt").to(device)

outputs = model.generate(
    **inputs,
    max_new_tokens = 500,
    temperature = 0.8, 
    num_return_sequences = 1,
)

code_llama_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Fetching 8 files: 100%|██████████| 8/8 [04:24<00:00, 33.07s/it] 
Loading checkpoint shards: 100%|██████████| 8/8 [00:30<00:00,  3.87s/it]
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 3/3 [00:33<00:00, 11.26s/it]
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANS

In [35]:
print(qwen_code)
print("\n****************************************************\n")
print(code_llama_code)

system
you are a powerful AI assistant capable to generate test cases in python language basing on a given function, following the user's instructions.
user

        give me exahustive test_cases for the following function:

def racer_disqualified(times, winner_times, n_penalties, penalties):
    """
    Determines if a racer is disqualified based on their times, penalties, and winner times.

    Parameters:
        times (list of int): List of the racer's times for three events.
        winner_times (list of int): List of winner times for the same three events.
        n_penalties (int): Number of penalties the racer incurred.
        penalties (list of int): List of penalty values.

    Returns:
        bool: True if the racer is disqualified, False otherwise.

    Raises:
        ValueError: If inputs do not meet the required types or constraints.
    """
    # Input validation
    if not (isinstance(times, list) and len(times) == 3 and all(isinstance(t, int) for t in times)):
     

In [40]:
code_llama_code = """ 
import pytest
from function_01 import racer_disqualified

# Test cases for disqualification due to total penalties > 100
def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20])

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41])

# Test cases for disqualification due to single penalty > 100
def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

# Test cases for disqualification due to number of penalties > 5
def test_disqualified_number_of_penalties_over_5():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 6, [50, 40, 20])

def test_disqualified_number_of_penalties_exactly_6():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 6, [60, 41])
"""

qwen_code = """ 
import pytest
from function_01 import racer_disqualified

# Test cases for disqualification due to total penalties > 100
def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20]) == True

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41]) == True

# Test cases for disqualification due to single penalty > 100
def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101]) == True

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101]) == True
"""

In [41]:



#obtain test code by using LLAMA or other chat engines, save the results in different files
codes = {}
codes['copilot'] = """ 
import pytest
from function_01 import racer_disqualified

# Test cases for disqualification due to total penalties > 100
def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20])

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41])

# Test cases for disqualification due to single penalty > 100
def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

# Test cases for disqualification due to n_penalties > 5
def test_disqualified_more_than_5_penalties():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 6, [10, 10, 10, 10, 10, 10])

def test_disqualified_exactly_6_penalties():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 6, [5, 5, 5, 5, 5, 5])

# Test cases for disqualification due to excessive time (> 2x winner time)
def test_disqualified_excessive_time_first_event():
    assert racer_disqualified([5, 1, 2], [2, 1, 2], 1, [10])

def test_disqualified_excessive_time_second_event():
    assert racer_disqualified([0, 7, 2], [0, 3, 2], 1, [10])

def test_disqualified_excessive_time_third_event():
    assert racer_disqualified([0, 1, 9], [0, 1, 4], 1, [10])

def test_disqualified_time_exactly_2x_winner_time_plus_1():
    assert racer_disqualified([0, 1, 7], [0, 1, 3], 1, [10])

# Test cases for NOT disqualified (boundary cases)
def test_not_disqualified_total_penalties_exactly_100():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 2, [50, 50])

def test_not_disqualified_single_penalty_exactly_100():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 1, [100])

def test_not_disqualified_exactly_5_penalties():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 5, [10, 10, 10, 10, 10])

def test_not_disqualified_time_exactly_2x_winner_time():
    assert not racer_disqualified([0, 2, 4], [0, 1, 2], 1, [10])

def test_not_disqualified_all_valid():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 3, [20, 30, 40])

# Test cases for input validation errors
def test_invalid_times_not_list():
    with pytest.raises(ValueError, match="times must be a list of three integers"):
        racer_disqualified((0, 1, 2), [0, 1, 2], 1, [10])

def test_invalid_times_wrong_length():
    with pytest.raises(ValueError, match="times must be a list of three integers"):
        racer_disqualified([0, 1], [0, 1, 2], 1, [10])

def test_invalid_times_not_integers():
    with pytest.raises(ValueError, match="times must be a list of three integers"):
        racer_disqualified([0.5, 1, 2], [0, 1, 2], 1, [10])

def test_invalid_winner_times_not_list():
    with pytest.raises(ValueError, match="winner_times must be a list of three integers"):
        racer_disqualified([0, 1, 2], (0, 1, 2), 1, [10])

def test_invalid_winner_times_wrong_length():
    with pytest.raises(ValueError, match="winner_times must be a list of three integers"):
        racer_disqualified([0, 1, 2], [0, 1], 1, [10])

def test_invalid_n_penalties_not_int():
    with pytest.raises(ValueError, match="n_penalties must be an integer"):
        racer_disqualified([0, 1, 2], [0, 1, 2], 1.5, [10])

def test_invalid_penalties_not_list():
    with pytest.raises(ValueError, match="penalties must be a list of integers"):
        racer_disqualified([0, 1, 2], [0, 1, 2], 1, (10,))

def test_invalid_penalties_not_integers():
    with pytest.raises(ValueError, match="penalties must be a list of integers"):
        racer_disqualified([0, 1, 2], [0, 1, 2], 1, [10.5])

def test_invalid_n_penalties_mismatch():
    with pytest.raises(ValueError, match="n_penalties must match the length of the penalties list"):
        racer_disqualified([0, 1, 2], [0, 1, 2], 2, [10])

# Edge cases
def test_zero_penalties():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 0, [])

def test_zero_winner_times():
    assert not racer_disqualified([0, 0, 0], [0, 0, 0], 1, [10])

def test_negative_penalties():
    assert not racer_disqualified([0, 1, 2], [0, 1, 2], 2, [-10, -20])

"""

codes['gpt'] = """ 
import pytest
from function_01 import racer_disqualified

# ------------------------------------------------------------
# INPUT VALIDATION ERRORS
# ------------------------------------------------------------

def test_invalid_times_not_list():
    with pytest.raises(ValueError):
        racer_disqualified("abc", [1,2,3], 1, [10])

def test_invalid_times_wrong_length():
    with pytest.raises(ValueError):
        racer_disqualified([1,2], [1,2,3], 1, [10])

def test_invalid_times_non_int():
    with pytest.raises(ValueError):
        racer_disqualified([1, "a", 3], [1,2,3], 1, [10])

def test_invalid_winner_times_not_list():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], "abc", 1, [10])

def test_invalid_winner_times_wrong_length():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1,2], 1, [10])

def test_invalid_winner_times_non_int():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1, "x", 3], 1, [10])

def test_invalid_n_penalties_not_int():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1,2,3], "a", [10])

def test_invalid_penalties_not_list():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1,2,3], 1, "abc")

def test_invalid_penalties_non_int():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1,2,3], 1, [10, "x"])

def test_invalid_n_penalties_mismatch():
    with pytest.raises(ValueError):
        racer_disqualified([1,2,3], [1,2,3], 3, [10,20])


# ------------------------------------------------------------
# NO DISQUALIFICATION CASES
# ------------------------------------------------------------

def test_no_disqualification_clean_run():
    assert not racer_disqualified([10, 20, 30], [10, 20, 30], 0, [])

def test_no_disqualification_small_penalties():
    assert not racer_disqualified([12, 22, 32], [10, 20, 30], 2, [5, 10])

def test_no_disqualification_exact_time_limit():
    assert not racer_disqualified([20, 40, 60], [10, 20, 30], 1, [0])


# ------------------------------------------------------------
# PENALTIES — TOTAL > 100
# ------------------------------------------------------------

def test_disqualified_total_penalties_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 3, [50, 40, 20])

def test_disqualified_total_penalties_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 2, [60, 41])


# ------------------------------------------------------------
# PENALTIES — SINGLE PENALTY > 100
# ------------------------------------------------------------

def test_disqualified_single_penalty_over_100():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])

def test_disqualified_single_penalty_exactly_101():
    assert racer_disqualified([0, 1, 2], [0, 1, 2], 1, [101])


# ------------------------------------------------------------
# PENALTIES — TOO MANY PENALTIES
# ------------------------------------------------------------

def test_disqualified_too_many_penalties():
    assert racer_disqualified([1,2,3], [1,2,3], 6, [1,1,1,1,1,1])

def test_disqualified_too_many_penalties_with_low_values():
    assert racer_disqualified([1,2,3], [1,2,3], 7, [0,0,0,0,0,0,0])


# ------------------------------------------------------------
# TIME EXCEEDS LIMIT (> 2 × winner_time)
# ------------------------------------------------------------

def test_disqualified_time_exceeds_limit_first_event():
    assert racer_disqualified([21, 20, 30], [10, 20, 30], 0, [])

def test_disqualified_time_exceeds_limit_second_event():
    assert racer_disqualified([10, 41, 30], [10, 20, 30], 0, [])

def test_disqualified_time_exceeds_limit_third_event():
    assert racer_disqualified([10, 20, 61], [10, 20, 30], 0, [])


# ------------------------------------------------------------
# MULTIPLE DISQUALIFYING CONDITIONS
# ------------------------------------------------------------

def test_disqualified_multiple_reasons_penalty_and_time():
    assert racer_disqualified([50, 20, 30], [10, 20, 30], 2, [150, 10])

def test_disqualified_all_rules_triggered():
    assert racer_disqualified([100, 200, 300], [10, 20, 30], 10, [200]*10)

"""

codes['qwen'] = qwen_code

codes['code_llama'] = code_llama_code

#append the results on different test files
agents = ['copilot', 'gpt', 'qwen', 'code_llama']
test_files = []

for agent in agents:
    test_files.append(f'test_function_{agent}.py')


file_path = 'function_01.py'

for test_file, agent in zip(test_files, agents):
    with open(test_file, 'w') as file:
        file.write(codes[agent])

    print()
    print()
    print("Doing:", test_file)

    #restoring original function in the file
    with open(file_path, 'w') as file:
        file.write(original_function)

    #computing pass_rate
    errors, failures, passes = run_test_cases(test_file)
    pass_rate = passes / (errors + failures + passes)

    number_of_tests = errors + failures + passes
    print(f"Number of tests: {number_of_tests}")
    print(f"Pass Rate: {round(pass_rate*100, 2)}")

    #computing coverage
    cov = run_coverage(test_file)
    print(f"Coverage: {coverage}%")

    #computing mutation_score
    mut_score = compute_mutation_score(file_path)
    print(f"Mutation score: {round(mut_score*100, 2)}%")



Doing: test_function_copilot.py
Number of tests: 27
Pass Rate: 96.3
Coverage: 100%
Mutation score: 20.0%


Doing: test_function_gpt.py
Number of tests: 24
Pass Rate: 95.83
Coverage: 100%
Mutation score: 20.0%


Doing: test_function_qwen.py
Number of tests: 4
Pass Rate: 100.0
Coverage: 100%
Mutation score: 80.0%


Doing: test_function_code_llama.py
Number of tests: 6
Pass Rate: 66.67
Coverage: 100%
Mutation score: 0.0%
