# Historical Runtime-Informed Routing Heuristic

### Model Training

In [2]:
import os
import time

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Configuration
exclude_percentile = 80  # Removes the top X% most complex queries
train_ratio = 0.90       # 90% Training, 10% Testing

# Column groups, declared once so the frame selection, the X/y extraction and
# the ColumnTransformer below cannot drift apart.
TEXT_FEATURE = 'query'
NUMERIC_FEATURES = [
    'rt_t5_base_avg', 'rt_t5_large_avg', 'rt_t5_xl_avg',
    'r_t5_base_length_avg', 'r_t5_large_length_avg', 'r_t5_xl_length_avg',
    'r_t5_base_trash_count', 'r_t5_large_trash_count', 'r_t5_xl_trash_count',
    'r_t5_base_query_repetition_count', 'r_t5_large_query_repetition_count', 'r_t5_xl_query_repetition_count',
    'discrepancy_base_vs_large', 'discrepancy_large_vs_xl', 'discrepancy_base_vs_xl',
]
LABEL_COLUMN = 'evaluation_model_label'

# Load dataset
df = pd.read_json('../../datasets/generated/step_13/train_data_with_query_and_response_features_discrepancies_with_scores_model_label_dt0_tt3_rt3.jsonl', orient='records', lines=True)

# Filter out the top X% most complex queries
complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)
filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()
num_removed = len(df) - len(filtered_df)

print(f"Filtered out the top {100 - exclude_percentile}% most complex queries.")
print(f"Total queries removed: {num_removed}. Remaining for training/testing: {len(filtered_df)}.")

# Compute dynamic train-test split.
# The split is positional (no shuffling): the testing and evaluation cells
# recompute exactly the same slice boundaries, so the three must stay in sync.
num_train = round(len(filtered_df) * train_ratio)  # First 90% for training
num_test = len(filtered_df) - num_train            # Remaining 10% for testing

train_df = filtered_df.iloc[:num_train].copy()     # Train set (first 90%)

print(f"Final split: {num_train} queries for training, {num_test} queries for testing.")

# Keep only relevant columns for training
train_df = train_df[['id', TEXT_FEATURE] + NUMERIC_FEATURES + [LABEL_COLUMN]]

# Extract X and Y for training
X_train_historical = train_df[[TEXT_FEATURE] + NUMERIC_FEATURES]
y_train_historical = train_df[LABEL_COLUMN]

# Train model: TF-IDF on the query text, standard scaling on every numeric column
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf', TfidfVectorizer(max_features=5000), TEXT_FEATURE),
        ('scaler', StandardScaler(), NUMERIC_FEATURES),
    ]
)

# Create a pipeline with the column transformer and Random Forest Classifier
pipeline_historical = Pipeline([
    ('transformer', column_transformer),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=None, n_jobs=-1, class_weight='balanced', random_state=42))
])

start_time = time.time()
pipeline_historical.fit(X_train_historical, y_train_historical)
end_time = time.time()
historical_train_time = end_time - start_time

# Save the historical model with the percentile in the filename.
# The target directory is created first so that a fresh checkout does not lose
# the fitted model to a FileNotFoundError after the training time was spent.
historical_model_filename = f"trained_models/historical_model_p{exclude_percentile}.pkl"
os.makedirs(os.path.dirname(historical_model_filename), exist_ok=True)
joblib.dump(pipeline_historical, historical_model_filename)
print(f"Historical Model Trained in {historical_train_time:.4f} seconds.")
print(f"Saved as: {historical_model_filename}")


Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Historical Model Trained in 22.9189 seconds.
Saved as: approach_3_pretrained_models/historical_model_p80.pkl


### Model Testing

In [1]:
import pandas as pd
import time
import json
import re
import joblib
from textstat import textstat
from sklearn.metrics import accuracy_score, classification_report

# Configuration
exclude_percentile = 80  # Exclude top X% most complex queries
train_ratio = 0.90       # 90% Training, 10% Testing

# Input columns handed to the saved pipeline ('query' text plus numeric features).
# NOTE(review): this list must match the columns used in the training cell.
FEATURE_COLUMNS = [
    'query', 'rt_t5_base_avg', 'rt_t5_large_avg', 'rt_t5_xl_avg',
    'r_t5_base_length_avg', 'r_t5_large_length_avg', 'r_t5_xl_length_avg',
    'r_t5_base_trash_count', 'r_t5_large_trash_count', 'r_t5_xl_trash_count',
    'r_t5_base_query_repetition_count', 'r_t5_large_query_repetition_count', 'r_t5_xl_query_repetition_count',
    'discrepancy_base_vs_large', 'discrepancy_large_vs_xl', 'discrepancy_base_vs_xl',
]

# Load dataset
df = pd.read_json('../../datasets/generated/step_13/train_data_with_query_and_response_features_discrepancies_with_scores_model_label_dt0_tt3_rt3.jsonl', orient='records', lines=True)

# Determine complexity threshold
complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)

# Filter out the top X% most complex queries
filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()
num_removed = len(df) - len(filtered_df)

print(f"Filtered out the top {100 - exclude_percentile}% most complex queries.")
print(f"Total queries removed: {num_removed}. Remaining for training/testing: {len(filtered_df)}.")

# Compute train-test split (positional: first train_ratio share trains, the rest tests)
num_train = round(len(filtered_df) * train_ratio)
num_test = len(filtered_df) - num_train

train_df = filtered_df.iloc[:num_train].copy()
test_df = filtered_df.iloc[num_train:].copy()

print(f"Final split: {num_train} queries for training, {num_test} queries for testing.")

# Load the model that belongs to the configured percentile. The filename is
# derived from exclude_percentile so that changing the configuration above can
# never silently evaluate a model trained with a different cut-off.
# NOTE: joblib.load unpickles the file; only load model files you produced yourself.
historical_model_filename = f"trained_models/historical_model_p{exclude_percentile}.pkl"
pipeline_historical = joblib.load(historical_model_filename)

# Prepare testing data; .copy() makes the subset an independent frame so the
# column assignments further down do not write into a slice of another frame.
test_df = test_df[['id'] + FEATURE_COLUMNS].copy()
print(f"Loaded {len(test_df)} queries for testing.")

# Label prediction ('id' is only an identifier, not a model input)
test_X_historical = test_df.drop(columns=['id'], errors='ignore')

start_time = time.time()
y_pred = pipeline_historical.predict(test_X_historical)
end_time = time.time()
historical_test_time = end_time - start_time
avg_time_per_query = historical_test_time / len(test_df)

print(f"Model Prediction Completed in {historical_test_time:.4f} seconds.")
print(f"Avg Time per Query (Inference Only): {avg_time_per_query:.6f} seconds")

# Append predicted model labels to test_df
test_df['predicted_model_label'] = y_pred

# Restore actual labels for evaluation (same positional slice as test_df, so rows line up)
test_df['evaluation_model_label'] = filtered_df.iloc[num_train:]['evaluation_model_label'].values

# Model evaluation
historical_accuracy = accuracy_score(test_df['evaluation_model_label'], test_df['predicted_model_label'])
historical_report = classification_report(test_df['evaluation_model_label'], test_df['predicted_model_label'])

print("\nMODEL EVALUATION")
print(f"Model Label Prediction Accuracy: {historical_accuracy * 100:.2f}%")
print("\nClassification Report:\n", historical_report)

# Define output file path
output_file_path = 'predicted_model_label.jsonl'

# Write one compact JSON object per line containing only the query id and its predicted label
with open(output_file_path, 'w') as output_file:
    for entry in test_df[['id', 'predicted_model_label']].to_dict(orient='records'):
        json.dump(entry, output_file, separators=(",", ":"))
        output_file.write('\n')

print(f"Dataset with predicted model labels saved to: {output_file_path}")


Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.
Loaded 4000 queries for testing.
Model Prediction Completed in 0.3181 seconds.
Avg Time per Query (Inference Only): 0.000080 seconds

MODEL EVALUATION
Model Label Prediction Accuracy: 98.85%

Classification Report:
               precision    recall  f1-score   support

           1       0.99      0.97      0.98       552
           2       0.99      0.97      0.98       754
           3       0.99      1.00      0.99      2694

    accuracy                           0.99      4000
   macro avg       0.99      0.98      0.98      4000
weighted avg       0.99      0.99      0.99      4000

Dataset with predicted model labels saved to: predicted_model_label.jsonl


### Evaluation

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report

# Configuration
exclude_percentile = 80  # Exclude top X% most complex queries
train_ratio = 0.90       # 90% Training, 10% Testing

# Define compute unit consumption for each model (keyed by model label 1, 2, 3)
COMPUTE_UNITS = {1: 1.0, 2: 3.12, 3: 12.0}

# Load dataset
df = pd.read_json('../../datasets/generated/step_13/train_data_with_query_and_response_features_discrepancies_with_scores_model_label_dt0_tt3_rt3.jsonl',orient='records', lines=True)

# Determine complexity threshold
complexity_threshold = df['query_complexity_score'].quantile(exclude_percentile / 100)

# Filter out the top X% most complex queries
filtered_df = df[df['query_complexity_score'] <= complexity_threshold].copy()
num_removed = len(df) - len(filtered_df)

print(f"Filtered out the top {100 - exclude_percentile}% most complex queries.")
print(f"Total queries removed: {num_removed}. Remaining for training/testing: {len(filtered_df)}.")

# Compute dynamic train-test split
num_train = round(len(filtered_df) * train_ratio)  # First 90% for training
num_test = len(filtered_df) - num_train            # Remaining 10% for testing

train_df = filtered_df.iloc[:num_train].copy()     # Train set (first 90%)
test_df = filtered_df.iloc[num_train:].copy()      # Test set (remaining 10%)

print(f"Final split: {num_train} queries for training, {num_test} queries for testing.")

# Load predicted labels from saved file
predicted_labels_df = pd.read_json('predicted_model_label.jsonl', orient='records', lines=True)

# Attach the predicted labels to the test rows via the shared 'id' column
test_df = test_df.merge(predicted_labels_df, on='id', how='left')

# Check if merge worked
if 'predicted_model_label' not in test_df.columns:
    raise KeyError("Error: 'predicted_model_label' is missing after merging!")

# Duplicate ids on either side make the merge emit extra rows, which would
# silently distort every metric computed from test_df afterwards.
if len(test_df) != num_test:
    raise ValueError(
        f"Merge changed the number of test rows from {num_test} to {len(test_df)}; "
        "check 'id' for duplicates in the dataset or in the prediction file."
    )

# A left merge keeps test rows without a matching prediction and fills them with
# NaN, which the column check above cannot detect. Fail early with a clear message.
num_missing_predictions = int(test_df['predicted_model_label'].isna().sum())
if num_missing_predictions:
    raise ValueError(
        f"{num_missing_predictions} test queries have no predicted label; "
        "re-run the testing cell with the same configuration before evaluating."
    )
    
def calculate_quality_discrepancies(df):
    """
    Computes the relative quality change when routing based on predicted labels.

    For every row the score of the selected model is read from
    ``avg_normalized_score_t5_base`` / ``_large`` / ``_xl`` (labels 1 / 2 / 3),
    once using ``evaluation_model_label`` (ground truth) and once using
    ``predicted_model_label`` (router). The average routed score is then
    compared with the average ground-truth score and with the average score of
    always using the XL model.

    Both results are signed percentages, ``(router - reference) / reference * 100``:
    a negative value means the router scored lower than the reference, a
    positive value means it scored higher.

    Args:
        df (pd.DataFrame): The dataset with routing results. Must contain the
            two label columns and the three ``avg_normalized_score_t5_*`` columns.

    Returns:
        tuple: (quality_loss_to_gt, quality_loss_to_xl)

    Raises:
        ValueError: If ``df`` is empty, or a label column contains missing
            values or labels outside 1..3.
    """
    model_suffixes = ['base', 'large', 'xl']

    num_queries = len(df)
    if num_queries == 0:
        raise ValueError("Cannot compute quality discrepancies for an empty DataFrame.")

    def _total_routed_score(label_column):
        """Sum, over all rows, the score of the model named by ``label_column``."""
        labels = df[label_column]
        if labels.isna().any():
            raise ValueError(f"Column '{label_column}' contains missing labels.")

        # Labels may arrive as floats (e.g. after a merge that upcast the
        # column); cast explicitly instead of indexing a list with a float.
        labels = labels.astype(int)
        if not labels.between(1, len(model_suffixes)).all():
            # Without this check label 0 would silently select 'xl' through
            # negative indexing.
            raise ValueError(
                f"Column '{label_column}' contains labels outside 1..{len(model_suffixes)}."
            )

        # Pick each row's score column by column, keeping the original row order.
        routed_scores = pd.Series(float('nan'), index=df.index, dtype=float)
        for label, suffix in enumerate(model_suffixes, start=1):
            routed_scores = routed_scores.where(
                labels != label, df[f"avg_normalized_score_t5_{suffix}"]
            )
        return routed_scores.sum()

    total_score_gt = _total_routed_score('evaluation_model_label')
    total_score_router = _total_routed_score('predicted_model_label')
    total_score_xl = df['avg_normalized_score_t5_xl'].sum()

    avg_score_gt = total_score_gt / num_queries
    avg_score_router = total_score_router / num_queries
    avg_score_xl = total_score_xl / num_queries

    quality_loss_to_gt = ((avg_score_router - avg_score_gt) / avg_score_gt) * 100
    quality_loss_to_xl = ((avg_score_router - avg_score_xl) / avg_score_xl) * 100

    return quality_loss_to_gt, quality_loss_to_xl

# Headline metrics: routing accuracy in percent plus the two signed quality deltas
accuracy = 100 * accuracy_score(test_df['evaluation_model_label'], test_df['predicted_model_label'])
quality_loss_gt, quality_loss_xl = calculate_quality_discrepancies(test_df)

# How many test queries the router sent to each model (labels 1, 2, 3)
predicted_labels = test_df['predicted_model_label']
num_routed_base, num_routed_large, num_routed_xl = (
    (predicted_labels == model_label).sum() for model_label in (1, 2, 3)
)

# Total compute units under three policies: ground-truth routing, the learned
# router, and sending every query to XL
compute_units_gt = sum(test_df['evaluation_model_label'].map(COMPUTE_UNITS))
compute_units_router = sum(predicted_labels.map(COMPUTE_UNITS))
compute_units_xl = COMPUTE_UNITS[3] * len(test_df)

# Relative savings of the router against each reference policy, in percent
# (negative means the router consumed more than the reference)
compute_savings_gt = (compute_units_gt - compute_units_router) / compute_units_gt * 100
compute_savings_xl = (compute_units_xl - compute_units_router) / compute_units_xl * 100

# Print final evaluation metrics as one report
report_lines = [
    "\n--- Routing Heuristic Evaluation ---",
    f"Quality Loss compared to GT: {quality_loss_gt:.2f}%",
    f"Quality Loss compared to XL: {quality_loss_xl:.2f}%",
    f"Compute Savings compared to GT: {compute_savings_gt:.2f}%",
    f"Compute Savings compared to XL: {compute_savings_xl:.2f}%",
    f"Routing Accuracy: {accuracy:.2f}%",
    f"Total Compute Units Used: {compute_units_router:.2f} CUs",
    f"Number of queries routed to Base: {num_routed_base}",
    f"Number of queries routed to Large: {num_routed_large}",
    f"Number of queries routed to XL: {num_routed_xl}",
]
print("\n".join(report_lines))


Filtered out the top 20% most complex queries.
Total queries removed: 9996. Remaining for training/testing: 40004.
Final split: 36004 queries for training, 4000 queries for testing.

--- Routing Heuristic Evaluation ---
Quality Loss compared to GT: -0.03%
Quality Loss compared to XL: 2.98%
Compute Savings compared to GT: -0.95%
Compute Savings compared to XL: 25.90%
Routing Accuracy: 98.85%
Total Compute Units Used: 35566.20 CUs
Number of queries routed to Base: 537
Number of queries routed to Large: 735
Number of queries routed to XL: 2728
