In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
    # os.environ only accepts strings; assigning None would fail with an
    # opaque "str expected, not NoneType" TypeError, so fail with a clear hint.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )
# NOTE(review): "OPEN_AI_KEY" is kept as in the original; it looks like an
# alias of OPENAI_API_KEY -- confirm whether anything actually reads it.
os.environ["OPEN_AI_KEY"] = api_key
import subprocess
import re
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [None]:
# Step 1: Run a Python script and capture its output
def run_script(file_path):
    """Run a Python script in a child process and return its standard output.

    The script is launched with the interpreter that is executing this code
    (``sys.executable``) rather than whatever ``python`` happens to resolve to
    on PATH; a bare ``"python"`` is used only if that path is unavailable.

    Args:
        file_path: Path of the script to execute.

    Returns:
        The script's stdout with surrounding whitespace stripped, or ``None``
        if the interpreter could not be started or the script exited with a
        non-zero status.
    """
    import sys  # local import keeps this function self-contained

    interpreter = sys.executable or "python"
    try:
        result = subprocess.run(
            [interpreter, file_path],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"Error running script {file_path}: {e}")
        # The exception text only carries the exit status; the traceback of
        # the failing script is in its captured stderr.
        stderr = (e.stderr or "").strip()
        if stderr:
            print(f"stderr from {file_path}:\n{stderr}")
        return None
    except OSError as e:
        # Raised when the interpreter itself cannot be launched.
        print(f"Error running script {file_path}: {e}")
        return None
    return result.stdout.strip()

# Step 2: Evaluate outputs using an LLM
def _parse_decision(llm_output):
    """Return the 0/1 verdict stated in *llm_output*, or ``None`` if absent.

    Markdown emphasis, backticks or quotes around the label or the digit
    (e.g. ``**Decision:** 1`` or ``Decision: `0```) are tolerated. When the
    text contains several "Decision:" statements the last one wins, since the
    final verdict is expected at the end of the evaluation.
    """
    verdicts = re.findall(
        r"Decision[\s*_]*:[\s*_`\"']*([01])(?!\d)",
        llm_output,
        flags=re.IGNORECASE,
    )
    return int(verdicts[-1]) if verdicts else None


def evaluate_outputs(true_output, generated_output, llm):
    """Ask an LLM whether two script outputs are semantically equivalent.

    Args:
        true_output: Output of the reference ("true") script.
        generated_output: Output of the generated script.
        llm: Language model handed to ``LLMChain``.

    Returns:
        A tuple ``(llm_output, equivalent)``: the stripped text of the model's
        reply, and ``True`` only when the reply states decision ``0``. If no
        decision can be parsed, a message is printed and ``equivalent`` is
        ``False``.
    """
    prompt_template = PromptTemplate(
        input_variables=["true_output", "generated_output"],
        template="""
The outputs below are generated by running two Python scripts:
1. The first output is from the true script (expected result).
2. The second output is from the generated script (model prediction).

True Output:
{true_output}

Generated Output:
{generated_output}

Compare these outputs semantically. Do they represent equivalent results, even if formatting or naming conventions differ? \
    If not, explain the differences. Focus on the metric values and the core outputs of the scripts. \
    Respond with a detailed evaluation and include a binary decision ("0" for equivalent and "1" for not equivalent).

Decision: 0 (equivalent) or 1 (not equivalent)
"""
    )
    chain = LLMChain(llm=llm, prompt=prompt_template)
    inputs = {
        "true_output": true_output,
        "generated_output": generated_output,
    }
    result = chain.invoke(inputs)
    llm_output = result['text'].strip()

    decision = _parse_decision(llm_output)
    if decision is None:
        # Fail closed: an unparseable reply is never counted as a match.
        print("Error parsing LLM decision. Defaulting to Not Equivalent.")
        return llm_output, False

    return llm_output, decision == 0

# Step 3: Filter file names based on inclusion rules
def filter_files(file_list, base_folder, generated_folder):
    """Expand file-list entries into true/generated script path pairs.

    Each entry is a file name optionally followed by whitespace-separated
    variant numbers (e.g. ``"foo.py 1 3"``). An entry without numbers expands
    to variants 1, 2 and 3. Blank or whitespace-only entries are skipped.

    Args:
        file_list: Iterable of entry strings.
        base_folder: Folder of the true scripts, named
            ``modified_<file_name>_<num>.py``.
        generated_folder: Folder of the generated scripts, named
            ``prompt_<file_name>_<num>_generated_ICL.py``.

    Returns:
        A list of dicts with keys ``"true_file"``, ``"generated_file"`` and
        ``"file_name"`` (``<file_name>_<num>``), in input order.
    """
    default_numbers = ["1", "2", "3"]
    filtered_files = []
    for entry in file_list:
        tokens = entry.split()
        if not tokens:
            # Nothing to expand; indexing the first token would raise IndexError.
            continue
        file_name, *numbers = tokens

        for num in numbers or default_numbers:
            filtered_files.append({
                "true_file": os.path.join(base_folder, f"modified_{file_name}_{num}.py"),
                "generated_file": os.path.join(generated_folder, f"prompt_{file_name}_{num}_generated_ICL.py"),
                "file_name": f"{file_name}_{num}",
            })

    return filtered_files

# Step 4: Compare true and generated files
def compare_outputs(file_entries, llm):
    """Run each true/generated script pair and have the LLM judge the outputs.

    Args:
        file_entries: Iterable of dicts with keys ``"true_file"``,
            ``"generated_file"`` and ``"file_name"``.
        llm: Language model forwarded to ``evaluate_outputs``.

    Returns:
        A dict keyed by ``file_name``; each value holds ``"true_output"``,
        ``"generated_output"``, ``"evaluation"`` and ``"equivalent"``. Pairs
        with a missing file or a failed run are reported on stdout and left
        out of the dict.
    """
    outcomes = {}

    for item in file_entries:
        reference_path = item["true_file"]
        candidate_path = item["generated_file"]
        label = item["file_name"]

        if not os.path.exists(reference_path):
            print(f"True file not found: {reference_path}")
        elif not os.path.exists(candidate_path):
            print(f"Generated file not found: {candidate_path}")
        else:
            # Both scripts are always executed, even if the first one fails.
            reference_out = run_script(reference_path)
            candidate_out = run_script(candidate_path)

            if any(out is None for out in (reference_out, candidate_out)):
                print(f"Error running one of the scripts: {label}")
                continue

            verdict_text, is_equivalent = evaluate_outputs(
                reference_out, candidate_out, llm
            )
            outcomes[label] = dict(
                true_output=reference_out,
                generated_output=candidate_out,
                evaluation=verdict_text,
                equivalent=is_equivalent,
            )

            print(f"Comparison for {label} completed.")

    return outcomes

# Step 5: Save results to a file
def save_results(results, output_file):
    """Write *results* to *output_file* as UTF-8 JSON indented by four spaces.

    A confirmation line naming the file is printed once the write is done.
    """
    from json import dump

    with open(output_file, mode='w', encoding='utf-8') as handle:
        dump(results, handle, indent=4)

    print(f"Results saved to {output_file}")

# Example usage
if __name__ == "__main__":
    # Scripts to evaluate. Each entry is a file name optionally followed by
    # space-separated variant numbers; filter_files expands an entry without
    # numbers to variants 1, 2 and 3.
    file_list = [
        "dl_retx_analyzer_test.py",
        "lte_dl_retx_analyzer_example.py",
        "lte_mac_analyzer_example.py 1 2",
        "lte_meas_analyzer_example.py",
        "lte_phy_analyzer_example.py 1",
        "lte_rlc_analyzer_example.py 1 2",
        "mm_analyzer_example.py 1 3",
        "modem_debug_analyzer_example.py 1",
        "nn_rrc_analyzer_example.py 1 2",
        "offline-latency-analysis-ul.py",
        "track_cell_info_analyzer_example.py 2 3",
        "ul_mac_latency_analyzer_example.py 1 2",
        "umts_nas_analyzer_example.py 1",
        "wcdma_rrc_analyzer_example.py 1 2"
    ]

    # Folder with the true (reference) scripts, looked up by filter_files as
    # modified_<name>_<num>.py. NOTE: machine-specific absolute path.
    base_folder = '/home/harshbull/Desktop/LLM-assisted_mobile_trace_analysis/generated_outer_working_dataset'

    # Folder with the generated scripts, looked up by filter_files as
    # prompt_<name>_<num>_generated_ICL.py. NOTE: machine-specific absolute path.
    generated_folder = '/home/harshbull/Desktop/LLM-assisted_mobile_trace_analysis/generated_outer_analyzers_ICL'

    # Expand the entries into (true file, generated file) path pairs
    filtered_files = filter_files(file_list, base_folder, generated_folder)

    # Initialize the LLM used as the judge; temperature=0 is passed to the model
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    # Run every pair and collect the LLM's evaluation; pairs with a missing
    # file or a failed run are skipped by compare_outputs
    results = compare_outputs(filtered_files, llm)

    # Save results as JSON; the relative path resolves against the current
    # working directory
    save_results(results, r"comparison_results_ICL.json")
