In [1]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import re
import io

# Function to convert string percentages to floats
def convert_percentage(value):
    if isinstance(value, str):
        if '%' in value:
            return float(value.strip('%')) / 100
    return value

# Function to clean and convert cost data
def clean_cost(value):
    if isinstance(value, str):
        if value == "NULL":
            return 0
        return float(re.sub(r'[^\d.]', '', value))
    return 0 if pd.isna(value) else value

# Load and clean the Cost Comparison data
def load_cost_data():
    cost_data = pd.DataFrame([
        ['Gemini 1.5 Flash', 1000000, 0.35, 0.70],
        ['Gemini 2.0 Flash (Exp)', 1000000, 0, 0],
        ['AWS Nova Pro', 300000, 0.0008, 0.0032],
        ['AWS Nova Lite', 300000, 0.00006, 0.00024],
        ['AWS Nova Micro', 300000, 0.000035, 0.00014],
        ['Claude 3 Opus', 200000, 15.00, 75.00],
        ['Claude 3 Sonnet', 200000, 3.00, 15.00],
        ['Claude 3 Haiku', 200000, 0.25, 1.25],
        ['Claude 3.5 Sonnet', 200000, 3, 15],
        ['Claude 3.5 Haiku', 200000, 0.80, 4],
        ['Qwen2.5-72b', 131000, 0.4, 0.75],
        ['GPT-4 Turbo', 128000, 10.00, 30.00],
        ['Gemini 1.5 Pro', 128000, 7, 21],
        ['GPT4o', 128000, 5, 15],
        ['GPT-4o mini', 128000, 0.15, 0.60],
        ['OpenAI o1', 128000, 15, 60],
        ['OpenAI o3-mini', 128000, 1.10, 4.40],
        ['DeepSeek V3', 128000, 0.27, 1.1],
        ['DeepSeek R1', 128000, 0.55, 2.19],
        ['GPT 4.5', 128000, 25, 150],
        ['OpenAI o1-mini', 64000, 1.10, 4.40],
        ['GPT-4-32k', 32000, 60.00, 120.00],
        ['Gemini Pro', 32000, 0.125, 0.375],
        ['Mistral Medium', 32000, 2.7, 8.1],
        ['Mistral Large', 32000, 8.00, 24.00],
        ['GPT-3.5 Turbo', 16000, 0.5, 1.5],
        ['Mistral Small', 16000, 2.00, 6.00],
        ['GPT-4', 8000, 30.00, 60.00],
        ['GPT-3.5 Turbo Instruct', 4000, 1.5, 2.00]
    ], columns=['Models', 'Context Window', 'Input Cost / 1M tokens', 'Output Cost / 1M tokens'])
    
    # Calculate average cost
    cost_data['Average Cost / 1M tokens'] = (cost_data['Input Cost / 1M tokens'] + cost_data['Output Cost / 1M tokens']) / 2
    
    return cost_data

# Load and clean the General Model Performance data
def load_performance_data():
    perf_data = pd.DataFrame([
        ['OpenAI o1', 0.8539, 0.918, 0.757, 0.924, 0.964, 0.6673, 0.893],
        ['Claude 3.5 Sonnet', 0.845, 0.883, 0.65, 0.937, 0.783, 0.902, 0.916],
        ['GPT-4o', 0.805, 0.887, 0.536, 0.902, 0.766, 0.8359, 0.905],
        ['Llama 3.1 405b', 0.804, 0.886, 0.511, 0.89, 0.738, 0.885, 0.916],
        ['OpenAI o1-mini', 0.8007, 0.852, 0.6, 0.924, 0.9, 0.6289, 0.899],
        ['GPT-Turbo', 0.781, 0.865, 0.48, 0.871, 0.726, 0.86, 0.885],
        ['OpenAI o1-mini', 0.775, 0.852, 0.6, 0.826, 0.924, 0.522, 0.899],
        ['Claude 3 Opus', 0.767, 0.857, 0.504, 0.849, 0.601, 0.884, 0.907],
        ['DeepSeek V3', 0.7624, 0.885, 0.591, 0.826, 0.902, 0.5723, 0.798],
        ['GPT-4', 0.755, 0.864, 0.414, 0.866, 0.645, 0.883, 0.859],
        ['Llama 3.1 70b', 0.755, 0.86, 0.467, 0.805, 0.68, 0.848, 0.869],
        ['Llama 3.3 70b', 0.745, 0.86, 0.48, 0.884, 0.77, 0.775, 0.911],
        ['Gemini 1.5 Pro', 0.741, 0.859, 0.462, 0.719, 0.677, 0.8435, 0.887],
        ['Claude 3.5 Haiku', 0.683, 0.65, 0.416, 0.881, 0.694, 0.6, 0.856],
        ['Gemini 1.5 Flash', 0.667, 0.789, 0.395, 0.715, 0.549, 0.7988, 0.755],
        ['Claude 3 Haiku', 0.629, 0.752, 0.357, 0.759, 0.389, 0.7465, 0.717],
        ['Llama 3.1 8b', 0.626, 0.73, 0.328, 0.726, 0.519, 0.761, 0.689],
        ['GPT-3.5 Turbo', 0.592, 0.698, 0.308, 0.68, 0.341, 0.6441, 0.563],
        ['Gemini 2.0 Flash', 0, 0.764, 0.621, 0, 0.897, 0, 0],
        ['AWS Nova Micro', 0, 0.776, 0.4, 0.811, 0.693, 0.562, 0],
        ['AWS Nova Lite', 0, 0.805, 0.42, 0.854, 0.733, 0.666, 0],
        ['AWS Nova Pro', 0, 0.859, 0.469, 0.89, 0.766, 0.684, 0],
        ['GPT-4o mini', 0, 0.82, 0.402, 0.872, 0.702, 0, 0.87],
        ['Gemini Ultra', 0, 0.837, 0.357, 0, 0.532, 0, 0.79],
        ['OpenAI o3-mini', 0, 0.869, 0.797, 0, 0.979, 0, 0.92],
        ['Qwen2.5-72b', 0, 0.702, 0.49, 0.88, 0.85, 0.6131, 0],
        ['OpenAI o3-mini', 0, 0.869, 0.707, 0, 0.979, 0, 0.92],
        ['DeepSeek-R1', 0, 0.908, 0.715, 0, 0.973, 0, 0],
        ['Grok-2', 0, 0.875, 0.56, 0.884, 0.761, 0, 0],
        ['Grok-2 mini', 0, 0.862, 0.51, 0.857, 0.73, 0, 0]
    ], columns=['Model', 'Average', 'MMLU(General)', 'GPQA(Reasoning)', 'HumanEval(Coding)', 'Math', 'BFCL(Tool Use)', 'MGSM(MUltilingual)'])
    
    return perf_data

# Load and clean HumanEval data
def load_humaneval_data():
    humaneval_data = pd.DataFrame([
        ['Claude 3.5 Sonnet', 0.937],
        ['GPT-4o', 0.902],
        ['AWS Nova Pro', 0.89],
        ['Llama 3.1 405b', 0.89],
        ['Grok-2', 0.884],
        ['Claude 3.5 Haiku', 0.881],
        ['Qwen2.5-70b', 0.88],
        ['GPT-4o mini', 0.872],
        ['GPT-Turbo', 0.871],
        ['GPT-4', 0.866],
        ['Grok-2 mini', 0.857],
        ['AWS Nova Lite', 0.854],
        ['Claude 3 Opus', 0.849],
        ['OpenAI o1-mini', 0.826],
        ['AWS Nova Micro', 0.811],
        ['Llama 3.3 70b', 0.805],
        ['Llama 3.1 70b', 0.805],
        ['Llama 3.1 8b', 0.726],
        ['Gemini 1.5 Pro', 0.719],
        ['Gemini 1.5 Flash', 0.715],
        ['GPT-3.5 Turbo', 0.68]
    ], columns=['Model', 'HumanEval (0 shot)'])
    
    return humaneval_data

# Load and clean Reasoning Dataset
def load_reasoning_data():
    reasoning_data = pd.DataFrame([
        ['Claude 3.7 Sonnet (reasoner)', 0.848, 0, 0, 0, 0.861, 0.75, 0.932, 0.962, 0.8],
        ['Grok 3 Beta', 0.846, 0, 0, 0, 0, 0.78, 0, 0, 0.933],
        ['OpenAI o3-mini (High)', 0.797, 0.493, 0, 0, 0.795, 0, 0, 0.979, 0.873],
        ['OpenAI o1', 0.78, 0.489, 0.735, 0.542, 0.877, 0.782, 0, 0.964, 0.833],
        ['DeepSeek R1', 0.715, 0.492, 0, 0, 0, 0, 0.833, 0.973, 0.798],
        ['Claude 3.7 Sonnet', 0.68, 0.703, 0.812, 0.584, 0.832, 0.718, 0.908, 0.822, 0.233],
        ['Claude 3.5 Sonnet', 0.65, 0.49, 0.715, 0.488, 0.821, 0.704, 0.902, 0.78, 0.16]
    ], columns=['Model', 'GPQA Diamond (Reasoning)', 'SWE-bench (Agent coding)', 'Tool Use(Retail)',
                'Tool Use(Airline)', 'MMMLU(Multilingual)', 'MMMU(Visual)', 'IFEval', 'MATH 500', 'AIME 2024 (Math)'])
    
    return reasoning_data

# Load and merge all data
def prepare_combined_dataset():
    cost_df = load_cost_data()
    perf_df = load_performance_data()
    humaneval_df = load_humaneval_data()
    reasoning_df = load_reasoning_data()
    
    # Merge datasets on model name
    # First, merge cost and performance
    merged_df = pd.merge(cost_df, perf_df, left_on='Models', right_on='Model', how='outer')
    
    # Then merge HumanEval
    merged_df = pd.merge(merged_df, humaneval_df, on='Model', how='outer')
    
    # Finally, merge reasoning data
    merged_df = pd.merge(merged_df, reasoning_df, on='Model', how='outer')
    
    # Drop duplicate columns and handle missing values
    if 'Model_x' in merged_df.columns:
        merged_df.drop('Model_x', axis=1, inplace=True)
    if 'Model_y' in merged_df.columns:
        merged_df.drop('Model_y', axis=1, inplace=True)
    
    # Use model name as identifier
    if 'Models' in merged_df.columns and 'Model' in merged_df.columns:
        merged_df['Model'] = merged_df['Model'].fillna(merged_df['Models'])
        merged_df.drop('Models', axis=1, inplace=True)
    elif 'Models' in merged_df.columns and 'Model' not in merged_df.columns:
        merged_df.rename(columns={'Models': 'Model'}, inplace=True)
    
    # Fill missing values
    merged_df.fillna(0, inplace=True)
    
    # Handle duplicate HumanEval values (keep the one from HumanEval dataset if available)
    if 'HumanEval(Coding)' in merged_df.columns and 'HumanEval (0 shot)' in merged_df.columns:
        merged_df['HumanEval'] = merged_df.apply(
            lambda row: row['HumanEval (0 shot)'] if row['HumanEval (0 shot)'] > 0 else row['HumanEval(Coding)'],
            axis=1
        )
        merged_df.drop(['HumanEval(Coding)', 'HumanEval (0 shot)'], axis=1, inplace=True)
    elif 'HumanEval(Coding)' in merged_df.columns:
        merged_df.rename(columns={'HumanEval(Coding)': 'HumanEval'}, inplace=True)
    elif 'HumanEval (0 shot)' in merged_df.columns:
        merged_df.rename(columns={'HumanEval (0 shot)': 'HumanEval'}, inplace=True)
    
    return merged_df

# Build the SVM model
def build_svm_model(data):
    # Select features
    feature_cols = [col for col in data.columns if col != 'Model' and not pd.api.types.is_string_dtype(data[col])]
    X = data[feature_cols]
    
    # Standard scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Simple SVM model
    model = SVC(kernel='rbf', probability=True)
    model.fit(X_scaled, data['Model'])
    
    return model, scaler, feature_cols

# Build the neural network using TensorFlow
def build_nn_model(data):
    # Select features
    feature_cols = [col for col in data.columns if col != 'Model' and not pd.api.types.is_string_dtype(data[col])]
    X = data[feature_cols]
    
    # Standard scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Encode the target (model names)
    encoder = OneHotEncoder(sparse_output=False)
    y_encoded = encoder.fit_transform(data[['Model']])
    
    # Define model architecture
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(X_scaled.shape[1],)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(y_encoded.shape[1], activation='softmax')
    ])
    
    # Compile the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    # Train the model
    history = model.fit(X_scaled, y_encoded, epochs=200, batch_size=16, verbose=0)
    
    return model, scaler, encoder, feature_cols, history

# Function for recommending models
def recommend_models(user_inputs, svm_model, svm_scaler, tf_model, tf_scaler, tf_encoder, feature_cols, all_models_df):
    # Prepare user input
    user_vector = np.zeros(len(feature_cols))
    
    for i, feature in enumerate(feature_cols):
        if feature in user_inputs:
            user_vector[i] = user_inputs[feature]
    
    # Scale user input
    svm_user_scaled = svm_scaler.transform([user_vector])
    tf_user_scaled = tf_scaler.transform([user_vector])
    
    # Get SVM predictions
    svm_probas = svm_model.predict_proba(svm_user_scaled)[0]
    svm_indices = np.argsort(svm_probas)[::-1]
    svm_classes = svm_model.classes_
    svm_top5 = [(svm_classes[idx], svm_probas[idx]) for idx in svm_indices[:5]]
    
    # Get Neural Network predictions
    tf_probas = tf_model.predict(tf_user_scaled, verbose=0)[0]
    tf_top_indices = np.argsort(tf_probas)[::-1][:5]
    tf_classes = tf_encoder.categories_[0]
    tf_top5 = [(tf_classes[idx], tf_probas[idx]) for idx in tf_top_indices]
    
    # Combine predictions with weighted ensemble
    combined_predictions = {}
    
    for model, score in svm_top5:
        if model not in combined_predictions:
            combined_predictions[model] = 0
        combined_predictions[model] += score * 0.5
    
    for model, score in tf_top5:
        if model not in combined_predictions:
            combined_predictions[model] = 0
        combined_predictions[model] += score * 0.5
    
    # Sort combined predictions
    sorted_predictions = sorted(combined_predictions.items(), key=lambda x: x[1], reverse=True)
    top5_models = [model for model, _ in sorted_predictions[:5]]
    
    # Get details of recommended models
    recommendations = all_models_df[all_models_df['Model'].isin(top5_models)]
    
    return svm_top5, tf_top5, sorted_predictions[:5], recommendations

# Function to get user input through a simple interactive approach
def get_user_input(feature_cols):
    user_inputs = {}
    
    print("\n--- LLM Recommendation System ---\n")
    print("Please rate the importance of the following factors (0-10):")
    
    # Context window size
    if 'Context Window' in feature_cols:
        context_window = int(input("Context window size (0=small, 10=very large): "))
        user_inputs['Context Window'] = context_window * 100000 / 10  # Scale to match data
    
    # Cost vs Performance
    cost_importance = int(input("Cost sensitivity (0=cost doesn't matter, 10=very cost-sensitive): "))
    if 'Average Cost / 1M tokens' in feature_cols:
        # Inverse relationship - higher cost sensitivity means lower cost tolerance
        user_inputs['Average Cost / 1M tokens'] = (10 - cost_importance) * 5 / 10  # Scale to match data
    
    # Performance categories
    print("\nImportance of performance in different categories (0-10):")
    
    if 'MMLU(General)' in feature_cols:
        general_knowledge = int(input("General knowledge: "))
        user_inputs['MMLU(General)'] = general_knowledge / 10  # Scale to 0-1
    
    if 'GPQA(Reasoning)' in feature_cols:
        reasoning = int(input("Reasoning abilities: "))
        user_inputs['GPQA(Reasoning)'] = reasoning / 10  # Scale to 0-1
    
    if 'HumanEval' in feature_cols:
        coding = int(input("Coding capabilities: "))
        user_inputs['HumanEval'] = coding / 10  # Scale to 0-1
    
    if 'Math' in feature_cols:
        math = int(input("Math skills: "))
        user_inputs['Math'] = math / 10  # Scale to 0-1
    
    if 'BFCL(Tool Use)' in feature_cols:
        tool_use = int(input("Tool use capabilities: "))
        user_inputs['BFCL(Tool Use)'] = tool_use / 10  # Scale to 0-1
    
    if 'MGSM(MUltilingual)' in feature_cols:
        multilingual = int(input("Multilingual capabilities: "))
        user_inputs['MGSM(MUltilingual)'] = multilingual / 10  # Scale to 0-1
    
    return user_inputs

# Visualization function for recommendations
def visualize_recommendations(recommendations, user_inputs, feature_cols):
    # Prepare data for radar chart
    categories = [feat for feat in feature_cols if feat in [
        'MMLU(General)', 'GPQA(Reasoning)', 'HumanEval', 'Math', 
        'BFCL(Tool Use)', 'MGSM(MUltilingual)'
    ]]
    
    fig, axs = plt.subplots(1, 2, figsize=(14, 6))
    
    # Performance radar chart for top models
    ax = axs[0]
    top_models = recommendations['Model'].tolist()[:5]
    
    # Number of variables
    N = len(categories)
    
    # What will be the angle of each axis in the plot
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Close the loop
    
    # Draw one axis per variable and add labels
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([cat.split('(')[0] for cat in categories])
    
    # Draw y-axis lines and labels
    ax.set_ylim(0, 1)
    ax.set_yticks([0.2, 0.4, 0.6, 0.8])
    ax.set_yticklabels(['0.2', '0.4', '0.6', '0.8'])
    
    # Plot data for each model
    colors = ['b', 'g', 'r', 'c', 'm']
    for i, model in enumerate(top_models[:5]):
        model_data = recommendations[recommendations['Model'] == model]
        values = [model_data[cat].values[0] for cat in categories]
        values += values[:1]  # Close the loop
        
        # Plot values
        ax.plot(angles, values, color=colors[i], linewidth=2, label=model)
        ax.fill(angles, values, color=colors[i], alpha=0.1)
    
    # Add legend
    ax.legend(loc='upper right', bbox_to_anchor=(-0.1, 1.1))
    ax.set_title('Performance Comparison')
    
    # Cost vs Context Window chart
    ax = axs[1]
    for i, model in enumerate(top_models[:5]):
        model_data = recommendations[recommendations['Model'] == model]
        context = model_data['Context Window'].values[0] if 'Context Window' in model_data.columns else 0
        cost = model_data['Average Cost / 1M tokens'].values[0] if 'Average Cost / 1M tokens' in model_data.columns else 0
        
        ax.scatter(context, cost, s=100, color=colors[i], label=model)
    
    # Add labels and legend
    ax.set_xlabel('Context Window Size')
    ax.set_ylabel('Average Cost / 1M tokens')
    ax.set_title('Cost vs Context Window')
    ax.legend()
    
    # Use log scale for better visualization
    ax.set_xscale('log')
    ax.set_yscale('log')
    
    plt.tight_layout()
    plt.savefig('llm_recommendations.png')
    
    # Convert plot to bytes
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    
    return buf

# Main function
def main():
    # Prepare dataset
    print("Loading and preparing data...")
    all_data = prepare_combined_dataset()
    
    # Build SVM model
    print("Building SVM model...")
    svm_model, svm_scaler, feature_cols = build_svm_model(all_data)
    
    # Build Neural Network model
    print("Building Neural Network model...")
    tf_model, tf_scaler, tf_encoder, tf_feature_cols, history = build_nn_model(all_data)
    
    # Get user input
    user_inputs = get_user_input(feature_cols)
    
    # Get recommendations
    print("\nGenerating recommendations...")
    svm_top5, tf_top5, ensemble_top5, recommendations = recommend_models(
        user_inputs, svm_model, svm_scaler, tf_model, tf_scaler, tf_encoder, feature_cols, all_data
    )
    
    # Display results
    print("\n--- Model Recommendations ---\n")
    
    print("SVM Model Recommendations:")
    for model, score in svm_top5:
        print(f"- {model}: {score:.4f}")
    
    print("\nNeural Network Recommendations:")
    for model, score in tf_top5:
        print(f"- {model}: {score:.4f}")
    
    print("\nEnsemble Recommendations:")
    for model, score in ensemble_top5:
        print(f"- {model}: {score:.4f}")
    
    print("\nDetails of recommended models:")
    relevant_cols = ['Model', 'Context Window', 'Average Cost / 1M tokens', 'Average',
                     'MMLU(General)', 'GPQA(Reasoning)', 'HumanEval', 'Math', 
                     'BFCL(Tool Use)', 'MGSM(MUltilingual)']
    
    display_cols = [col for col in relevant_cols if col in recommendations.columns]
    print(recommendations[display_cols].sort_values(by='Average', ascending=False).to_string(index=False))
    
    # Visualize recommendations
    img_data = visualize_recommendations(recommendations, user_inputs, feature_cols)
    
    print("\nRecommendation visualization saved as 'llm_recommendations.png'")
    
    return svm_model, tf_model, svm_scaler, tf_scaler, tf_encoder, feature_cols, all_data

# Run the system
if __name__ == "__main__":
    svm_model, tf_model, svm_scaler, tf_scaler, tf_encoder, feature_cols, all_data = main()

# Create a simple Flask API for the LLM recommendation system
def create_flask_api():
    from flask import Flask, request, jsonify, send_file
    import io
    import base64
    
    app = Flask(__name__)
    
    # Load data and models
    all_data = prepare_combined_dataset()
    svm_model, svm_scaler, feature_cols = build_svm_model(all_data)
    tf_model, tf_scaler, tf_encoder, _, _ = build_nn_model(all_data)
    
    @app.route('/recommend', methods=['POST'])
    def recommend():
        user_inputs = request.json
        
        svm_top5, tf_top5, ensemble_top5, recommendations = recommend_models(
            user_inputs, svm_model, svm_scaler, tf_model, tf_scaler, tf_encoder, feature_cols, all_data
        )
        
        # Create visualization
        img_buffer = visualize_recommendations(recommendations, user_inputs, feature_cols)
        img_str = base64.b64encode(img_buffer.getvalue()).decode('utf-8')
        
        # Prepare recommendations data
        relevant_cols = ['Model', 'Context Window', 'Average Cost / 1M tokens', 'Average',
                         'MMLU(General)', 'GPQA(Reasoning)', 'HumanEval', 'Math', 
                         'BFCL(Tool Use)', 'MGSM(MUltilingual)']
        
        display_cols = [col for col in relevant_cols if col in recommendations.columns]
        recommendations_data = recommendations[display_cols].to_dict(orient='records')
        
        return jsonify({
            'svm_recommendations': svm_top5,
            'nn_recommendations': tf_top5,
            'ensemble_recommendations': ensemble_top5,
            'details': recommendations_data,
            'visualization': img_str
        })
    
    @app.route('/', methods=['GET'])
    def index():
        return """
        <h1>LLM Recommendation API</h1>
        <p>POST to /recommend with JSON data containing your preferences to get recommendations.</p>
        """
    
    return app

# # Code to create a simple UI using Streamlit
# def create_streamlit_ui():
#     import streamlit as st
    
#     # Define the Streamlit app
#     st.title("LLM Recommendation System")
    
#     # Load data and models
#     @st.cache_resource
#     def load_models():

Loading and preparing data...
Building SVM model...
Building Neural Network model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



--- LLM Recommendation System ---

Please rate the importance of the following factors (0-10):

Importance of performance in different categories (0-10):


ValueError: invalid literal for int() with base 10: ''