# LLM-Performance-Based Routing Heuristic

### Model Training

In [24]:
import time
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

# Configuration
exclude_percentile = 80  # Rows above this percentile of query_complexity_score are dropped
train_ratio = 0.90       # Share of the remaining rows used for training; the rest is held out

# One labelled dataset exists per t-threshold and a separate model is trained on each.
# The paths differ only in the threshold value, so they are generated from a single
# template instead of being written out seven times.
discrepancy_thresholds = ["0", "-0.005", "-0.01", "-0.015", "-0.02", "-0.025", "-0.03"]
dataset_path_template = (
    "../../datasets/generated/step_13/"
    "train_data_with_query_and_response_features_discrepancies_with_scores_model_label_"
    "dt{threshold}_tt3_rt3.jsonl"
)

# List of (t-threshold tag, dataset path) pairs consumed by the training loop.
file_names = [
    (f"t{threshold}", dataset_path_template.format(threshold=threshold))
    for threshold in discrepancy_thresholds
]

# Local import: only needed to create the model output directory below.
import os

# Numeric query features passed through the scaler; the raw 'query' text is
# vectorised separately with TF-IDF. Defined once so X and the transformer agree.
numeric_feature_columns = [
    'query_chars_count', 'query_words_count', 'query_unique_word_count',
    'query_readability_score', 'query_special_tokens_count', 'query_keywords_count',
    'query_contains_url',
]
model_output_dir = "trained_models"

# Create the output directory up front: joblib.dump does not create missing folders,
# and failing after the fit would throw away a finished training run.
os.makedirs(model_output_dir, exist_ok=True)

# Loop over each dataset
for t_threshold, file_path in file_names:
    print(f"\n--- Training Model for t-threshold: {t_threshold} ---")

    # Load dataset
    df = pd.read_json(file_path, orient='records', lines=True)

    # Filter out the top X% most complex queries
    complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)
    filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()
    num_removed = len(df) - len(filtered_df)

    print(f"Filtered out the top {100 - exclude_percentile}% most complex queries.")
    print(f"Total queries removed: {num_removed}. Remaining for training/testing: {len(filtered_df)}.")

    # Compute dynamic train-test split
    num_train = round(len(filtered_df) * train_ratio)  # 90% Training
    num_test = len(filtered_df) - num_train            # 10% Testing

    # Positional, unshuffled split: the first num_train rows are the training set.
    # The testing cell rebuilds the same split from the same positions.
    train_df = filtered_df.iloc[:num_train]

    print(f"Final split: {num_train} queries for training, {num_test} queries for testing.")

    # Extract X and Y for training
    X_train_evaluation = train_df[['query'] + numeric_feature_columns]
    y_train_evaluation = train_df['evaluation_model_label']

    # Define model pipeline
    column_transformer = ColumnTransformer(
        transformers=[
            ('tfidf', TfidfVectorizer(max_features=5000), 'query'),
            ('scaler', StandardScaler(), numeric_feature_columns),
        ]
    )

    pipeline = Pipeline([
        ('transformer', column_transformer),
        ('clf', RandomForestClassifier(n_estimators=200, max_depth=None, n_jobs=-1, class_weight='balanced', random_state=42))
    ])

    # Train model (perf_counter is monotonic, so the interval cannot be skewed by clock changes)
    start_time = time.perf_counter()
    pipeline.fit(X_train_evaluation, y_train_evaluation)
    train_time = time.perf_counter() - start_time

    # Save model with matching name (t-threshold & percentile)
    model_filename = f"{model_output_dir}/evaluation_model_{t_threshold}_p{exclude_percentile}_rf_nest200.pkl"
    joblib.dump(pipeline, model_filename)

    print(f"Evaluation Model for {t_threshold} Trained in {train_time:.4f} seconds.")
    print(f"Saved as: {model_filename}")



--- Training Model for t-threshold: t0 ---
Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Evaluation Model for t0 Trained in 83.1312 seconds.
Saved as: approach_2_pretrained_models/evaluation_model_t0_p80_rf_nest200.pkl

--- Training Model for t-threshold: t-0.005 ---
Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Evaluation Model for t-0.005 Trained in 97.6311 seconds.
Saved as: approach_2_pretrained_models/evaluation_model_t-0.005_p80_rf_nest200.pkl

--- Training Model for t-threshold: t-0.01 ---
Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Evaluation Model for t-0.01 Trained in 116.

### Model Testing for Feature Calculation + Model Label Prediction for all Pretrained Models

In [1]:
import json
import os
import re
import time

import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from textstat import textstat

# Configuration
exclude_percentile = 80  # Exclude top X% most complex queries
train_ratio = 0.90       # 90% Training, 10% Testing
output_text_file = "discrepancy_thresholds_testing_results.txt"

# t-threshold tags; each tag has one dataset and one trained model.
threshold_tags = ["t0", "t-0.005", "t-0.01", "t-0.015", "t-0.02", "t-0.025", "t-0.03"]

# Dataset & model pairs, both as (tag, path) tuples in the same tag order.
evaluation_files = [
    (
        tag,
        "../../datasets/generated/step_13/"
        "train_data_with_query_and_response_features_discrepancies_with_scores_model_label_"
        f"d{tag}_tt3_rt3.jsonl",
    )
    for tag in threshold_tags
]

# The training cell names its files "..._p{exclude_percentile}_rf_nest200.pkl", so the
# percentile is taken from the configuration here too instead of a hard-coded "p80".
trained_models = [
    (tag, f"trained_models/evaluation_model_{tag}_p{exclude_percentile}_rf_nest200.pkl")
    for tag in threshold_tags
]

# Open text file to store results
with open(output_text_file, "w") as result_file:

    # Iterate through each dataset & corresponding trained model
    for (t_threshold, dataset_path), (model_threshold, model_path) in zip(evaluation_files, trained_models):
        # The two lists are paired purely by position; refuse to score a model
        # against a dataset that belongs to a different t-threshold.
        if model_threshold != t_threshold:
            raise ValueError(
                f"Dataset threshold {t_threshold!r} is paired with model threshold {model_threshold!r}."
            )

        print(f"\n--- Testing Model for t-threshold: {t_threshold} ---")
        result_file.write(f"\n--- Testing Model for t-threshold: {t_threshold} ---\n")

        # Load correct dataset
        df = pd.read_json(dataset_path, orient='records', lines=True)

        # Determine complexity threshold
        complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)

        # Filter out the top X% most complex queries
        filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()
        num_removed = len(df) - len(filtered_df)

        print(f"Filtered out the top {100 - exclude_percentile}% most complex queries.")
        print(f"Total queries removed: {num_removed}. Remaining for training/testing: {len(filtered_df)}.")

        # Compute Train-Test Split (same positional split as the training cell;
        # only the held-out tail is needed here)
        num_train = round(len(filtered_df) * train_ratio)
        num_test = len(filtered_df) - num_train

        print(f"Final split: {num_train} queries for training, {num_test} queries for testing.")
        result_file.write(f"Final split: {num_train} queries for training, {num_test} queries for testing.\n")

        # Load the correct model
        # NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
        pipeline_evaluation = joblib.load(model_path)

        # Prepare testing data: keep only the raw inputs so every feature is recomputed.
        # The explicit .copy() makes test_df an independent frame; feature columns are
        # assigned to it later, which would otherwise raise SettingWithCopy warnings.
        test_df = filtered_df.iloc[num_train:][['id', 'query', 'instruction', 'input']].copy()
        print(f"Loaded {len(test_df)} queries for testing.")

        def calculate_query_features(df):
            """
            Extracts features from queries, such as length, special tokens, keywords, and readability.

            The frame is modified in place (seven columns are added) and also returned.
            Every query value is converted with str() before any feature is computed,
            so a missing or non-string query no longer breaks the readability feature.

            Args:
                df (pd.DataFrame): DataFrame containing a 'query' column. The 'instruction'
                    and 'input' columns are read per row for the keyword count.

            Returns:
                pd.DataFrame: The same DataFrame with the added columns
                    query_chars_count, query_words_count, query_unique_word_count,
                    query_readability_score, query_special_tokens_count,
                    query_keywords_count and query_contains_url.
            """
            # A set gives constant-time membership tests in the per-character count below.
            special_tokens = frozenset([":", ";", "=", "+", "-", "_", "/", ".", "'", '"', "´", "`", ",", "<", ">", "[", "]", "{", "}", "(", ")", "?", "!", "*", "&", "$", "#", "@", "%", "^", "~", "|", "\\"])
            keywords = [
                "analyze", "synthesize", "interpret", "evaluate", "justify", "compare",
                "optimize", "hypothesize", "formulate", "simulate", "derive", "describe",
                "validate", "correlate", "quantify", "investigate", "predict", "forecast",
                "prove", "assess", "criticize", "argue", "solve", "reconstruct", "theorize",
                "explore", "elaborate", "deduce", "refute", "conceptualize", "identify", "outline",
                "rationalize", "articulate", "summarize", "innovate", "extrapolate", "explain", "clarify"
            ]
            # One whole-word alternation replaces a separate scan per keyword. The keywords
            # are distinct whole words, so the match count equals the sum of the
            # individual per-keyword counts.
            keyword_pattern = re.compile(r'\b(?:{})\b'.format('|'.join(re.escape(keyword) for keyword in keywords)))

            def remove_urls(text):
                """
                Removes URLs and image links from a given text.

                Args:
                    text (str): The input text.

                Returns:
                    str: The cleaned text without URLs.
                """
                return re.sub(r'!\[.*?\]\(.*?\)|http[s]?://\S+', '', text)

            def count_keywords(row):
                """Counts keyword occurrences in 'instruction', or in 'input' when 'instruction' is empty."""
                source_field = 'instruction' if row.get('instruction') else 'input'
                return len(keyword_pattern.findall(str(row.get(source_field, '')).lower()))

            # Coerce once so all query features see the same text.
            query_text = df['query'].apply(str)

            df['query_chars_count'] = query_text.apply(len)
            df['query_words_count'] = query_text.apply(lambda text: len(text.split()))
            df['query_unique_word_count'] = query_text.apply(lambda text: len(set(text.split())))
            # Readability is measured on the text with URLs and image links stripped out.
            df['query_readability_score'] = query_text.apply(lambda text: textstat.flesch_kincaid_grade(remove_urls(text)))
            df['query_special_tokens_count'] = query_text.apply(lambda text: sum(1 for char in text if char in special_tokens))
            df['query_keywords_count'] = df.apply(count_keywords, axis=1)
            df['query_contains_url'] = query_text.apply(lambda text: int(bool(re.search(r'http[s]?://', text))))

            return df

        # Apply feature calculation
        start_time = time.perf_counter()
        test_df = calculate_query_features(test_df)
        query_features_time = time.perf_counter() - start_time

        print(f"Query Features Computed in {query_features_time:.4f} seconds.")
        result_file.write(f"Query Features Computed in {query_features_time:.4f} seconds.\n")

        # Label prediction: everything except the identifier and raw prompt parts is model input
        features_to_drop = ['id', 'instruction', 'input']
        test_X_evaluation = test_df.drop(columns=features_to_drop, errors='ignore')

        start_time = time.perf_counter()
        y_pred_proba = pipeline_evaluation.predict_proba(test_X_evaluation)  # Get Probabilities
        # predict_proba columns are ordered like classes_, so the label is looked up
        # there instead of assuming column i always means label i + 1.
        model_classes = pipeline_evaluation.classes_
        y_pred_evaluation = model_classes[y_pred_proba.argmax(axis=1)]
        evaluation_test_time = time.perf_counter() - start_time

        print(f"Model Prediction Completed in {evaluation_test_time:.4f} seconds.")
        result_file.write(f"Model Prediction Completed in {evaluation_test_time:.4f} seconds.\n")

        # Store predictions and probabilities.
        # Labels 1/2/3 map to base/large/xl. A label the model never saw during training
        # has no probability column, so it is reported as 0.0 instead of raising IndexError.
        test_df['predicted_model_label'] = y_pred_evaluation
        class_column = {int(label): column for column, label in enumerate(model_classes)}
        for label, proba_name in ((1, 'proba_base'), (2, 'proba_large'), (3, 'proba_xl')):
            if label in class_column:
                test_df[proba_name] = y_pred_proba[:, class_column[label]]
            else:
                test_df[proba_name] = 0.0

        # Time summary
        total_time = evaluation_test_time + query_features_time
        avg_time_per_query = total_time / len(test_df)

        print(f"Total Feature Extraction & Prediction Time: {total_time:.4f} seconds")
        result_file.write(f"Total Inference Time (All Steps): {total_time:.4f} seconds.\n")
        print(f"Avg Time per Query: {avg_time_per_query:.6f} seconds")
        result_file.write(f"Avg Time per Query: {avg_time_per_query:.6f} seconds.\n")

        # Restore actual labels for evaluation (same positional slice test_df was taken from)
        test_df['evaluation_model_label'] = filtered_df.iloc[num_train:]['evaluation_model_label'].values

        # Model evaluation
        evaluation_accuracy = accuracy_score(test_df['evaluation_model_label'], test_df['predicted_model_label'])
        evaluation_report = classification_report(test_df['evaluation_model_label'], test_df['predicted_model_label'])

        print("\nEVALUATION MODEL EVALUATION")
        result_file.write("\nEVALUATION MODEL EVALUATION\n")
        print(f"Model Label Prediction Accuracy: {evaluation_accuracy * 100:.2f}%")
        result_file.write(f"Model Label Prediction Accuracy: {evaluation_accuracy * 100:.2f}%\n")
        print("\nClassification Report:\n", evaluation_report)
        result_file.write("\nClassification Report:\n" + evaluation_report + "\n")


        # Define output file path and make sure its folder exists before writing
        output_file_path = f'predicted_model_label/predicted_model_label_{t_threshold}.jsonl'
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        # Write DataFrame to a JSONL file including probabilities (one compact JSON object per line)
        with open(output_file_path, 'w') as output_file:
            for entry in test_df[['id', 'predicted_model_label', 'proba_base', 'proba_large', 'proba_xl']].to_dict(orient='records'):
                json.dump(entry, output_file, separators=(",", ":"))
                output_file.write('\n')

        print(f"Saved predictions for {t_threshold} to: {output_file_path}")
        result_file.write(f"Saved predictions for {t_threshold} to: {output_file_path}\n")



--- Testing Model for t-threshold: t0 ---
Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Loaded 4000 queries for testing.
Query Features Computed in 1.9887 seconds.
Model Prediction Completed in 0.3862 seconds.
Total Feature Extraction & Prediction Time: 2.3749 seconds
Avg Time per Query: 0.000594 seconds

EVALUATION MODEL EVALUATION
Model Label Prediction Accuracy: 69.42%

Classification Report:
               precision    recall  f1-score   support

           1       0.41      0.15      0.22       552
           2       0.59      0.21      0.31       754
           3       0.72      0.94      0.82      2694

    accuracy                           0.69      4000
   macro avg       0.57      0.43      0.45      4000
weighted avg       0.65      0.69      0.64      4000

Saved predictions for t0 to: predicted_model_label/predicted_model_label_t0.jsonl


### Evaluation

In [6]:
import pandas as pd
import time
import json
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report

# Configuration
exclude_percentile = 80  # Exclude top X% most complex queries
train_ratio = 0.90       # 90% Training, 10% Testing

# Define different probability thresholds
probability_thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Define compute unit consumption for each model (keyed by model label 1, 2, 3)
COMPUTE_UNITS = {1: 1.0, 2: 3.12, 3: 12.0}

# === Define Model Files and Corresponding Datasets ===
# Both lists are built from the same tag list so they stay aligned by position.
threshold_tags = ["t0", "t-0.005", "t-0.01", "t-0.015", "t-0.02", "t-0.025", "t-0.03"]

# The training cell names its files "..._p{exclude_percentile}_rf_nest200.pkl", so the
# percentile is taken from the configuration here too instead of a hard-coded "p80".
model_files = [
    f"trained_models/evaluation_model_{tag}_p{exclude_percentile}_rf_nest200.pkl"
    for tag in threshold_tags
]

dataset_files = [
    "../../datasets/generated/step_13/"
    "train_data_with_query_and_response_features_discrepancies_with_scores_model_label_"
    f"d{tag}_tt3_rt3.jsonl"
    for tag in threshold_tags
]

def confidence_based_routing(df, y_pred_proba, confidence_threshold):
    """
    Assigns each query a target model, falling back to XL when the classifier is unsure.

    For every probability row the most likely class is taken (column index + 1, giving
    labels 1, 2, 3). If that highest probability is strictly below
    ``confidence_threshold`` the query is routed to label 3 instead.

    Args:
        df (pd.DataFrame): The dataset containing query information; receives a
            'routed_to' column in place.
        y_pred_proba (numpy.ndarray): Predicted probabilities, one row per query.
        confidence_threshold (float): Minimum confidence required to accept a prediction.

    Returns:
        pd.DataFrame: The same DataFrame with the 'routed_to' column set.
    """
    xl_label = 3  # fallback target for low-confidence predictions

    df['routed_to'] = [
        xl_label if max(row_probs) < confidence_threshold else row_probs.argmax() + 1
        for row_probs in y_pred_proba
    ]
    return df

def calculate_quality_discrepancies(df):
    """
    Computes the relative quality change of the routed assignment against two references.

    Each row contributes the score column of one model: ``avg_normalized_score_t5_base``,
    ``..._large`` or ``..._xl`` for labels 1, 2 and 3. The average routed score is
    compared with the average score of the ground-truth label and with always using XL.
    Both results are percentages of the reference; a negative value means the router
    scored lower than the reference.

    Args:
        df (pd.DataFrame): The dataset with routing results. Needs the columns
            'evaluation_model_label', 'routed_to' and the three score columns.

    Returns:
        tuple: (quality_loss_to_gt, quality_loss_to_xl)
    """
    model_sizes = ('base', 'large', 'xl')

    def _total_score(label_column):
        """Sums, over all rows, the score of the model named by ``label_column``."""
        # int(): DataFrame.apply(axis=1) hands each row over as a single Series, so on an
        # all-numeric frame the integer labels arrive as floats, which cannot index a tuple.
        return df.apply(
            lambda row: row[f"avg_normalized_score_t5_{model_sizes[int(row[label_column]) - 1]}"],
            axis=1,
        ).sum()

    total_score_gt = _total_score('evaluation_model_label')
    total_score_router = _total_score('routed_to')
    total_score_xl = df['avg_normalized_score_t5_xl'].sum()

    num_queries = len(df)

    avg_score_gt = total_score_gt / num_queries
    avg_score_router = total_score_router / num_queries
    avg_score_xl = total_score_xl / num_queries

    quality_loss_to_gt = ((avg_score_router - avg_score_gt) / avg_score_gt) * 100
    quality_loss_to_xl = ((avg_score_router - avg_score_xl) / avg_score_xl) * 100

    return quality_loss_to_gt, quality_loss_to_xl

# Open file for writing results
result_file_path = "discrepancy_thresholds_evaluation_results.txt"

with open(result_file_path, "w") as result_file:
    for model_file, dataset_file in zip(model_files, dataset_files):
        # Model files are named ".../evaluation_model_<t>_p<percentile>_rf_nest200.pkl",
        # so the t-threshold tag is the fourth "_"-separated field from the end.
        t_threshold = model_file.split("_")[-4]  # Extract t-threshold

        result_file.write(f"\n--- Evaluating Model for t-threshold: {t_threshold} ---\n")

        # Load dataset
        df = pd.read_json(dataset_file, orient='records', lines=True)

        # Filter out the top X% most complex queries
        complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)
        filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()

        result_file.write(f"Filtered out the top {100 - exclude_percentile}% most complex queries.\n")

        # Split dataset (same positional split as training/testing; only the tail is evaluated)
        num_train = round(len(filtered_df) * train_ratio)
        num_test = len(filtered_df) - num_train

        test_df = filtered_df.iloc[num_train:].copy()

        result_file.write(f"Final split: {num_train} queries for training, {num_test} queries for testing.\n")

        # Load predictions and probabilities
        predictions_file = f"predicted_model_label/predicted_model_label_{t_threshold}.jsonl"
        predicted_labels_df = pd.read_json(predictions_file, orient='records', lines=True)

        # Merge with test data
        test_df = test_df.merge(predicted_labels_df, on='id', how='left')

        # Check if merge worked
        if 'predicted_model_label' not in test_df.columns:
            raise KeyError("Error: 'predicted_model_label' is missing after merging!")

        # A left merge keeps every test row, so ids without a prediction do not remove
        # the column -- they show up as NaN and would be routed from NaN probabilities.
        num_unmatched = int(test_df['predicted_model_label'].isna().sum())
        if num_unmatched:
            raise ValueError(
                f"{num_unmatched} test queries have no entry in {predictions_file}; "
                "the predictions do not belong to this test split."
            )

        # Compute average maximum probability across all queries
        avg_max_proba = test_df[['proba_base', 'proba_large', 'proba_xl']].max(axis=1).mean()

        result_file.write(f"Average Maximum Probability per Query: {avg_max_proba:.4f}\n")

        # Iterate over probability thresholds
        for threshold in probability_thresholds:
            result_file.write(f"\nEvaluating Probability Threshold: {threshold:.1f}\n")

            # Measure routing time (perf_counter: high-resolution clock for short intervals)
            start_time = time.perf_counter()
            test_df = confidence_based_routing(test_df, test_df[['proba_base', 'proba_large', 'proba_xl']].values, threshold)
            routing_time = time.perf_counter() - start_time
            avg_routing_time_per_query = routing_time / len(test_df)

            # Count queries routed to each model
            num_routed_base = (test_df['routed_to'] == 1).sum()
            num_routed_large = (test_df['routed_to'] == 2).sum()
            num_routed_xl = (test_df['routed_to'] == 3).sum()

            # Compute metrics
            accuracy = accuracy_score(test_df['evaluation_model_label'], test_df['routed_to']) * 100
            quality_loss_gt, quality_loss_xl = calculate_quality_discrepancies(test_df)

            # Define compute usage and savings: cost of the ground-truth assignment,
            # of the router's assignment, and of sending every query to XL
            compute_units_gt = sum(test_df['evaluation_model_label'].map(COMPUTE_UNITS))
            compute_units_router = sum(test_df['routed_to'].map(COMPUTE_UNITS))
            compute_units_xl = len(test_df) * COMPUTE_UNITS[3]

            compute_savings_gt = ((compute_units_gt - compute_units_router) / compute_units_gt) * 100
            compute_savings_xl = ((compute_units_xl - compute_units_router) / compute_units_xl) * 100

            # Save results. The plotting cell parses these lines by their prefixes,
            # so the wording must stay exactly as it is.
            output = (
                f"Quality Loss compared to GT: {quality_loss_gt:.2f}%\n"
                f"Quality Loss compared to XL: {quality_loss_xl:.2f}%\n"
                f"Compute Savings compared to GT: {compute_savings_gt:.2f}%\n"
                f"Compute Savings compared to XL: {compute_savings_xl:.2f}%\n"
                f"Routing Accuracy: {accuracy:.2f}%\n"
                f"Total Compute Units Used: {compute_units_router:.2f} CUs\n"
                f"Avg Routing Time per Query: {avg_routing_time_per_query:.8f} seconds\n"
                f"Number of queries routed to Base: {num_routed_base}\n"
                f"Number of queries routed to Large: {num_routed_large}\n"
                f"Number of queries routed to XL: {num_routed_xl}\n"
            )

            result_file.write(output)

print(f"\nEvaluation completed. Results saved to: {result_file_path}")



Evaluation completed. Results saved to: discrepancy_thresholds_evaluation_results.txt


### Plot Results

In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import re

# Results file written by the evaluation cell
result_file_path = "discrepancy_thresholds_evaluation_results.txt"

# One list per output column; each gains one entry per (t-threshold, probability threshold) block
t_thresholds = []
probability_thresholds = []
quality_loss_gt = []
quality_loss_xl = []
compute_savings_gt = []
compute_savings_xl = []
routing_accuracy = []

# Line prefix -> list that collects the first number found on such a line
number_pattern = r"[-+]?\d*\.\d+|\d+"
metric_sinks = {
    "Quality Loss compared to GT:": quality_loss_gt,
    "Quality Loss compared to XL:": quality_loss_xl,
    "Compute Savings compared to GT:": compute_savings_gt,
    "Compute Savings compared to XL:": compute_savings_xl,
    "Routing Accuracy:": routing_accuracy,
}

# Read and parse the result file
with open(result_file_path, "r") as file:
    current_t_threshold = None
    current_prob_threshold = None

    for raw_line in file:
        line = raw_line.strip()

        # Section header: remember which t-threshold the following lines belong to
        if line.startswith("--- Evaluating Model for t-threshold: "):
            found = re.search(r"t-?\d*\.?\d+", line)
            if found:
                current_t_threshold = found.group(0)
            continue

        # Sub-section header: remember the probability threshold
        if line.startswith("Evaluating Probability Threshold: "):
            current_prob_threshold = float(line.split(": ")[1])
            continue

        # Metric lines
        for prefix, sink in metric_sinks.items():
            if not line.startswith(prefix):
                continue
            sink.append(float(re.findall(number_pattern, line)[0]))
            if sink is routing_accuracy:
                # The thresholds in effect are recorded once per block, on its accuracy line
                t_thresholds.append(current_t_threshold)
                probability_thresholds.append(current_prob_threshold)
            break

# Create DataFrame for easier plotting (one row per parsed block)
df_results = pd.DataFrame(
    {
        "t_threshold": t_thresholds,
        "prob_threshold": probability_thresholds,
        "quality_loss_gt": quality_loss_gt,
        "quality_loss_xl": quality_loss_xl,
        "compute_savings_gt": compute_savings_gt,
        "compute_savings_xl": compute_savings_xl,
        "routing_accuracy": routing_accuracy,
    }
)

# Line colour per t-threshold (dark, clearly distinguishable colours only)
threshold_colors = {
    "t0": "blue",
    "t-0.005": "green",
    "t-0.01": "red",
    "t-0.015": "purple",
    "t-0.02": "darkorange",
    "t-0.025": "brown",
    "t-0.03": "dodgerblue",
}

# Human-readable name per metric column (not referenced by plot_metric in this cell)
metric_titles = {
    "quality_loss_gt": "Quality Loss vs GT",
    "quality_loss_xl": "Quality Loss vs XL",
    "compute_savings_gt": "Compute Savings vs GT",
    "compute_savings_xl": "Compute Savings vs XL",
    "routing_accuracy": "Routing Accuracy",
}

# Function to generate a plot for a given metric
def plot_metric(metric, ylabel, filename):
    """
    Draws one metric against the probability threshold and saves the figure as a PDF.

    One line is drawn per t-threshold found in ``df_results``, coloured via
    ``threshold_colors`` (black when a threshold has no entry). Both names are read
    from the notebook namespace.

    Args:
        metric (str): Column of ``df_results`` to plot on the y-axis.
        ylabel (str): Label for the y-axis.
        filename (str): Path of the PDF file to write.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    for tag in sorted(df_results["t_threshold"].unique()):
        rows = df_results[df_results["t_threshold"] == tag]
        ax.plot(
            rows["prob_threshold"],
            rows[metric],
            marker="o",
            label=f"{tag}",
            color=threshold_colors.get(tag, "black"),
            alpha=1.0,
        )

    ax.set_xlabel("Probability Threshold")
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True)

    # Save as PDF
    fig.savefig(filename, format="pdf", bbox_inches="tight")

    # Release the figure so repeated calls do not accumulate open figures
    plt.close(fig)

# Generate and save plots: (metric column, y-axis label, output PDF) per figure
plot_jobs = [
    ("quality_loss_gt", "Quality Loss vs GT (%)", "../../plots/quality_loss_gt.pdf"),
    ("quality_loss_xl", "Quality Loss vs XL (%)", "../../plots/quality_loss_xl.pdf"),
    ("compute_savings_gt", "Compute Savings vs GT (%)", "../../plots/compute_savings_gt.pdf"),
    ("compute_savings_xl", "Compute Savings vs XL (%)", "../../plots/compute_savings_xl.pdf"),
    ("routing_accuracy", "Routing Accuracy (%)", "../../plots/routing_accuracy.pdf"),
]

for metric_name, axis_label, pdf_path in plot_jobs:
    plot_metric(metric_name, axis_label, pdf_path)

print("Plots saved successfully!")


Plots saved successfully!
