# LamoomAI CICD - Multiple Test Runs Example

This notebook demonstrates how to use the enhanced LamoomAI CICD library to:
1. Generate test cases from ideal answers
2. Save test cases to JSON files
3. Run multiple tests against the same test case
4. Aggregate and visualize test results

In [4]:
import csv
import json
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv

sys.path.append('..')
import lamoom_cicd
from lamoom_cicd import TestLLMResponsePipe, TestCase, AggregatedResult

# Pull API keys from a local .env file (if any) into the process environment.
load_dotenv()

# The key is None when OPENAI_API_KEY is not set.
openai_key = os.getenv("OPENAI_API_KEY")

# Pipeline instance shared by the cells below.
lamoom_pipe = TestLLMResponsePipe(openai_key=openai_key)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Define Sample Data

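We'll use a simple example about blockchain to demonstrate the functionality. The cell below loads the sample question/answer data from `./test_data/medical_questions_answers.csv`; the later cells additionally expect an `ideal_answer` value and an `llm_responses` list to be defined before they are run.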

In [None]:
# Load the sample question/answer data (path is relative to the notebook's
# working directory).
# NOTE(review): later cells read `ideal_answer` and `llm_responses`, but no cell
# in this notebook defines them — presumably they should be derived from this
# frame. Confirm the CSV's column names and add that step here.
data_df = pd.read_csv('./test_data/medical_questions_answers.csv')

# Preview the first rows so the reader can see what was loaded.
data_df.head()

## 2. Create a Test Case

There are two ways to create test cases: either by generating tests directly from the ideal answer, or by running a comparison first.

### 2.1 Method 1: Generate Tests Directly

Use the `generate_tests` method to create test questions directly from an ideal answer without running a comparison.

In [None]:
# Ask the pipeline for test questions derived from the ideal answer alone
# (no LLM response is compared at this point).
generated = lamoom_pipe.generate_tests(
    ideal_answer=ideal_answer,
    optional_params={
        "prompt_id": "blockchain_direct",
        "context": {"user_query": "Explain blockchain technology to a beginner"},
        "prompt_data": "Explain blockchain in simple terms, {user}."
    }
)

generated_questions = generated['test_questions']
print(f"Generated {len(generated_questions)} test questions for prompt ID: {generated['prompt_id']}")
print("\nExample questions:")
# Preview only the first three questions.
for question_number, question in enumerate(generated_questions[:3], start=1):
    print(f"\nQuestion {question_number}: {question['test_question']}")
    print(f"Ideal Answer: {question['ideal_answer']}")

# Register a test case for the same ideal answer and prompt ID.
test_case = lamoom_pipe.create_test_case(
    ideal_answer=ideal_answer,
    optional_params={
        "prompt_id": "blockchain_direct",
        "context": {"user_query": "Explain blockchain technology to a beginner"}
    }
)
case_id = test_case.case_id

# Persist the pipeline's test cases to a JSON file.
test_cases_file = "blockchain_test_cases.json"
lamoom_pipe.save_test_cases_to_json(test_cases_file)

print(f"Test case created directly with ID: {case_id}")
print(f"Test case saved to {test_cases_file}")

### 2.2 Method 2: Create Test Case After Comparison

Alternatively, you can run a comparison first and then save the results as a test case.

In [None]:
# Compare the first LLM response against the ideal answer.
result = lamoom_pipe.compare(
    ideal_answer=ideal_answer,
    llm_response=llm_responses[0],  # Use the first response for initial test
    optional_params={"prompt_id": "blockchain_example"}
)

# Overall score first, then a per-question breakdown.
comparison_score = result.score
print(f"Test Score: {comparison_score.score}% (Passed: {comparison_score.passed})")
print("\nTest Questions:")
for question in result.questions:
    print(
        f"\nQ: {question.test_question}",
        f"Ideal Answer: {question.ideal_answer}",
        f"LLM Answer: {question.llm_answer}",
        f"Matches: {question.does_match_ideal_answer}",
        sep="\n",
    )

# Turn the comparison result into a stored test case and keep its ID.
comparison_test_case = lamoom_pipe.save_test_case(result)
comparison_case_id = comparison_test_case.case_id

# Re-write the JSON file so it contains both test cases.
lamoom_pipe.save_test_cases_to_json(test_cases_file)

print(f"Comparison test case saved with ID: {comparison_case_id}")
print(f"Updated test cases in {test_cases_file}")

## 3. Run Multiple Tests with the Same Test Case

Now we'll run tests with all our different LLM responses against the same test case.

In [None]:
# Run multiple tests with different responses against the stored test case.
results = lamoom_pipe.run_multiple_tests(case_id, llm_responses)

# Display results. The loop variable is deliberately not named `result`, so the
# comparison result created in section 2.2 is not overwritten in the kernel.
for run_number, run_result in enumerate(results, start=1):
    print(f"Run {run_number}: Score {run_result.score.score}% (Passed: {run_result.score.passed})")

## 5. Load Test Cases from JSON

In [None]:
# A fresh pipeline instance, populated from the JSON file written earlier.
new_pipe = TestLLMResponsePipe(openai_key=openai_key)
loaded_cases = new_pipe.load_test_cases_from_json(test_cases_file)

print(f"Loaded {len(loaded_cases)} test cases:")
# One short summary per loaded case.
for loaded_case in loaded_cases:
    print(
        f"\nCase ID: {loaded_case.case_id}",
        f"Prompt ID: {loaded_case.prompt_id}",
        f"Number of questions: {len(loaded_case.test_questions)}",
        sep="\n",
    )

## 6. Generate Test Cases from CSV

In [None]:
# Create a sample CSV file
csv_file = "sample_test_cases.csv"

# One tuple per test case: (ideal_answer, llm_response, optional_params).
sample_rows = [
    (ideal_answer, "", {"prompt_id": "blockchain_from_csv"}),
    ("Artificial Intelligence is software that can learn and adapt.", "", {"prompt_id": "ai_definition"}),
]

# Write the CSV content with csv.writer so that quotes, commas and newlines
# inside a field are escaped correctly. optional_params is serialized with
# json.dumps — presumably save_test_cases_from_csv parses it as JSON; confirm.
# newline='' is what the csv module expects for files it writes to.
with open(csv_file, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["ideal_answer", "llm_response", "optional_params"])
    for row_ideal_answer, row_llm_response, row_params in sample_rows:
        writer.writerow([row_ideal_answer, row_llm_response, json.dumps(row_params)])

# Generate test cases from the CSV
output_json = "generated_test_cases.json"
cases = new_pipe.save_test_cases_from_csv(csv_file, output_json)

print(f"Generated {len(cases)} test cases and saved to {output_json}")

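## 7. Save and Load Test Results

The save/load example for test results appears in the last code cell below, after the results visualization.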

## 8. Unified Test Runner

The `run_tests` method provides a streamlined way to run tests with various configurations.

In [ ]:
# Persist the LLM responses, keyed by test case ID, so later runs can load
# them from disk.
responses_file = "llm_responses.json"

# Use the same responses for our test case
responses_dict = dict([(case_id, llm_responses)])

lamoom_pipe.save_llm_responses(responses_dict, responses_file)
print(f"Saved LLM responses to {responses_file}")

In [ ]:
# Method 1: both the test cases and the LLM responses are read from files.
print("Method 1: Running tests from test cases file...")
results1 = new_pipe.run_tests(
    llm_response_provider=responses_file,  # Load responses from file
    test_cases=test_cases_file,            # Load test cases from file
    results_file="run_method1_results.json"
)

# Headline numbers for this run.
summary1 = results1['summary']
print("\nSummary:")
print(f"Total tests: {summary1['total_tests']}")
print(f"Average score: {summary1['avg_score']:.2f}%")
print(f"Pass rate: {summary1['overall_pass_rate']:.2f}%")
print(f"Time taken: {summary1['elapsed_time']:.2f} seconds")

In [ ]:
# Method 2: responses are passed in memory as a list, test cases as the
# objects loaded earlier.
print("\nMethod 2: Running tests with a list of responses...")
results2 = new_pipe.run_tests(
    llm_response_provider=llm_responses[:2],  # Use just 2 responses
    test_cases=loaded_cases,                  # Use loaded test cases
    runs_per_case=2,                          # Run each case twice
    results_file="run_method2_results.json"
)

# Headline numbers for this run.
summary2 = results2['summary']
print("\nSummary:")
print(f"Total tests: {summary2['total_tests']}")
print(f"Average score: {summary2['avg_score']:.2f}%")
print(f"Pass rate: {summary2['overall_pass_rate']:.2f}%")
print(f"Time taken: {summary2['elapsed_time']:.2f} seconds")

In [ ]:
# Method 3: each case ID is mapped to its own list of responses.
print("\nMethod 3: Running tests with a dictionary of responses...")

# First test case gets the first two responses, second test case the next two.
custom_responses = {
    case_id: [llm_responses[0], llm_responses[1]],
    comparison_case_id: [llm_responses[2], llm_responses[3]],
}

# No test_cases argument is passed here; only the response mapping is given.
results3 = new_pipe.run_tests(
    llm_response_provider=custom_responses,  # Dictionary mapping case_ids to responses
    results_file="run_method3_results.json"
)

# Headline numbers for this run.
summary3 = results3['summary']
print("\nSummary:")
print(f"Total tests: {summary3['total_tests']}")
print(f"Average score: {summary3['avg_score']:.2f}%")
print(f"Pass rate: {summary3['overall_pass_rate']:.2f}%")
print(f"Time taken: {summary3['elapsed_time']:.2f} seconds")

In [ ]:
# Bar chart of the average score per test case, with std-dev error bars,
# pass-rate labels and the passing threshold drawn as a reference line.
fig, ax = plt.subplots(figsize=(14, 8))
fig.subplots_adjust(bottom=0.3)  # Make room for the case_id labels

# Use the most recent results with all cases
all_aggregated = results3["aggregated"]
case_ids = [agg.case_id for agg in all_aggregated]
avg_scores = [agg.avg_score for agg in all_aggregated]
std_devs = [agg.std_deviation for agg in all_aggregated]
pass_rates = [agg.pass_rate for agg in all_aggregated]
bar_positions = range(len(case_ids))

# Average scores with error bars
ax.bar(
    bar_positions,
    avg_scores,
    yerr=std_devs,
    capsize=5,
    alpha=0.7,
    color='skyblue'
)

# Pass-rate label just above the top of each error bar
for bar_x, (bar_avg, bar_std, bar_pass_rate) in enumerate(zip(avg_scores, std_devs, pass_rates)):
    ax.text(bar_x, bar_avg + bar_std + 2, f"{bar_pass_rate:.0f}% pass",
            ha='center', va='bottom', fontweight='bold')

# Threshold line
ax.axhline(y=new_pipe.threshold, color='r', linestyle='--',
           label=f"Passing threshold ({new_pipe.threshold}%)")

ax.set_title("Test Results Across All Cases")
ax.set_xlabel("Test Case")
ax.set_ylabel("Average Score (%)")
ax.set_xticks(bar_positions)
ax.set_xticklabels(case_ids, rotation=45)
ax.set_ylim(0, 100)
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [ ]:
# Write the results held by the original pipeline to a JSON file.
results_file = "test_results.json"
lamoom_pipe.save_test_results_to_json(results_file)

# Read them back through a separate pipeline instance (constructed without an
# explicit API key).
another_pipe = TestLLMResponsePipe()
loaded_results = another_pipe.load_test_results_from_json(results_file)

loaded_results_count = len(loaded_results)
print(f"Loaded {loaded_results_count} test results")