# 03 - Model Training & Optimization

Enterprise-grade ML ensemble for AI agent performance prediction and optimization.

**Course:** DATA 230 (Data Visualization) at SJSU

## Models:
1. **Performance Optimization Engine**: XGBoost + SHAP for business_value_score prediction
2. **Cost-Performance Tradeoff Analyzer**: Multi-objective optimization for Pareto frontier
3. **Failure Prediction System**: Isolation Forest for anomaly detection
4. **Agent Recommendation Engine**: Cosine similarity for task-agent matching


In [1]:
import pandas as pd
import numpy as np
import joblib
import warnings
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBRegressor
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings('ignore')

# Model version
MODEL_VERSION = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"Model Version: {MODEL_VERSION}")

# Load strategic features data
df = pd.read_csv('../data/ml/strategic_agent_features.csv')
print(f"Loaded {len(df)} records with {len(df.columns)} features")


Model Version: 20251130_140311
Loaded 5000 records with 37 features


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prepare features - encode categorical variables
df_model = df.copy()

# Encode categorical columns
categorical_cols = ['agent_type', 'model_architecture', 'deployment_environment', 'task_category', 'cost_efficiency_tier', 'strategic_importance']
label_encoders = {}

for col in categorical_cols:
    if col in df_model.columns:
        le = LabelEncoder()
        df_model[col + '_encoded'] = le.fit_transform(df_model[col].astype(str))
        label_encoders[col] = le

# Convert boolean columns
bool_cols = ['human_intervention_required', 'multimodal_capability', 'edge_compatibility']
for col in bool_cols:
    if col in df_model.columns:
        df_model[col] = df_model[col].astype(int)

print(f"Encoded {len(categorical_cols)} categorical columns")
print(f"Label encoders saved for: {list(label_encoders.keys())}")


Encoded 6 categorical columns
Label encoders saved for: ['agent_type', 'model_architecture', 'deployment_environment', 'task_category', 'cost_efficiency_tier', 'strategic_importance']


## 1. Performance Optimization Engine (XGBoost + SHAP)


In [3]:
# Define features for performance model
feature_cols = [
    'task_complexity', 'autonomy_level', 'success_rate', 'accuracy_score', 'efficiency_score',
    'execution_time_seconds', 'response_latency_ms', 'memory_usage_mb', 'cpu_usage_percent',
    'cost_per_task_cents', 'error_recovery_rate', 'privacy_compliance_score', 'bias_detection_score',
    'data_quality_score', 'performance_index', 'cost_efficiency_ratio', 'autonomous_capability_score',
    'operational_risk_index', 'scalability_potential', 'total_cost_of_ownership',
    'performance_trend_7d', 'stability_index', 'degradation_risk_score', 'seasonality_impact',
    'performance_quartile', 'human_intervention_required', 'multimodal_capability', 'edge_compatibility',
    'agent_type_encoded', 'model_architecture_encoded', 'deployment_environment_encoded', 'task_category_encoded'
]

# Filter to available columns
available_features = [col for col in feature_cols if col in df_model.columns]
print(f"Using {len(available_features)} features")

# Target variable
target = 'business_value_score'

# Prepare data
X = df_model[available_features].fillna(0)
y = df_model[target].fillna(df_model[target].mean())

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")


Using 32 features
Train: 4000, Test: 1000


In [4]:
# Optuna hyperparameter optimization for XGBoost
def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42
    }
    
    model = XGBRegressor(**params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    return scores.mean()

# Run optimization
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30, show_progress_bar=True)

print(f"Best XGBoost parameters: {study_xgb.best_params}")
print(f"Best CV score: {study_xgb.best_value:.4f}")


Best trial: 25. Best value: -2.13497e-05: 100%|██████████| 30/30 [00:25<00:00,  1.20it/s]

Best XGBoost parameters: {'n_estimators': 233, 'max_depth': 3, 'learning_rate': 0.2228998961177029, 'subsample': 0.8182589819868664, 'colsample_bytree': 0.6834066629630816, 'min_child_weight': 8}
Best CV score: -0.0000





In [5]:
# Train final XGBoost model with best parameters
best_xgb_model = XGBRegressor(**study_xgb.best_params, random_state=42)
best_xgb_model.fit(X_train, y_train)

# Evaluate
y_pred = best_xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Performance Optimization Engine Results:")
print(f"  RMSE: {rmse:.4f}")
print(f"  R² Score: {r2:.4f}")

# Calculate confidence intervals using bootstrap
n_bootstrap = 100
bootstrap_predictions = []
for _ in range(n_bootstrap):
    idx = np.random.choice(len(X_test), size=len(X_test), replace=True)
    bootstrap_predictions.append(best_xgb_model.predict(X_test.iloc[idx]))

pred_std = np.std(bootstrap_predictions, axis=0).mean()
print(f"  Prediction Std Dev (confidence): {pred_std:.4f}")


Performance Optimization Engine Results:
  RMSE: 0.0048
  R² Score: 0.9988
  Prediction Std Dev (confidence): 0.1392


In [6]:
# SHAP feature importance
import shap

explainer = shap.TreeExplainer(best_xgb_model)
shap_values = explainer.shap_values(X_test)

# Feature importance from SHAP
shap_importance = pd.DataFrame({
    'feature': available_features,
    'importance': np.abs(shap_values).mean(axis=0)
}).sort_values('importance', ascending=False)

print("Top 15 Features by SHAP Importance:")
print(shap_importance.head(15))

# Optimal configuration parameters
print("\nOptimal Configuration Parameters:")
for idx, row in shap_importance.head(10).iterrows():
    feature = row['feature']
    optimal_val = X_test[feature].iloc[np.argmax(y_pred)]
    print(f"  {feature}: {optimal_val:.4f}")


Top 15 Features by SHAP Importance:
                        feature  importance
2                  success_rate    0.045814
25  human_intervention_required    0.024791
15        cost_efficiency_ratio    0.010224
10          error_recovery_rate    0.009014
11     privacy_compliance_score    0.006834
14            performance_index    0.006581
22       degradation_risk_score    0.006304
18        scalability_potential    0.005951
19      total_cost_of_ownership    0.002092
0               task_complexity    0.001471
9           cost_per_task_cents    0.001422
17       operational_risk_index    0.001304
8             cpu_usage_percent    0.000796
24         performance_quartile    0.000761
3                accuracy_score    0.000573

Optimal Configuration Parameters:
  success_rate: 0.8118
  human_intervention_required: 0.0000
  cost_efficiency_ratio: 128.2500
  error_recovery_rate: 0.9432
  privacy_compliance_score: 0.9577
  performance_index: 0.8464
  degradation_risk_score: 0.1835
  sc

## 2. Cost-Performance Tradeoff Analyzer (Multi-objective Optimization)


In [7]:
# Multi-objective optimization using Optuna
# Objectives: Maximize performance, Minimize cost, Minimize risk

def multi_objective(trial):
    # Sample configuration parameters
    task_complexity = trial.suggest_int('task_complexity', 1, 10)
    autonomy_level = trial.suggest_int('autonomy_level', 1, 10)
    memory_usage = trial.suggest_float('memory_usage_mb', 100, 500)
    cpu_usage = trial.suggest_float('cpu_usage_percent', 20, 90)
    
    # Filter similar configurations from data
    similar = df_model[
        (df_model['task_complexity'] == task_complexity) &
        (df_model['autonomy_level'] == autonomy_level)
    ]
    
    if len(similar) == 0:
        similar = df_model
    
    # Calculate objectives
    performance = similar['performance_index'].mean()
    cost = similar['cost_per_task_cents'].mean()
    risk = similar['operational_risk_index'].mean() if 'operational_risk_index' in similar.columns else 0.5
    
    return performance, -cost, -risk  # Maximize performance, minimize cost and risk

# Run multi-objective optimization
study_mo = optuna.create_study(directions=['maximize', 'maximize', 'maximize'])
study_mo.optimize(multi_objective, n_trials=50, show_progress_bar=True)

print(f"Number of Pareto-optimal solutions: {len(study_mo.best_trials)}")


100%|██████████| 50/50 [00:00<00:00, 2155.92it/s]

Number of Pareto-optimal solutions: 1





In [8]:
# Extract Pareto frontier
pareto_frontier = []
for trial in study_mo.best_trials:
    pareto_frontier.append({
        'task_complexity': trial.params['task_complexity'],
        'autonomy_level': trial.params['autonomy_level'],
        'memory_usage_mb': trial.params['memory_usage_mb'],
        'cpu_usage_percent': trial.params['cpu_usage_percent'],
        'performance': trial.values[0],
        'cost': -trial.values[1],
        'risk': -trial.values[2]
    })

pareto_df = pd.DataFrame(pareto_frontier)
print("Pareto Frontier of Optimal Configurations:")
print(pareto_df.sort_values('performance', ascending=False).head(10))

# Save Pareto frontier
pareto_df.to_csv('../data/ml/pareto_frontier.csv', index=False)
print(f"\nPareto frontier saved with {len(pareto_df)} optimal configurations")


Pareto Frontier of Optimal Configurations:
   task_complexity  autonomy_level  memory_usage_mb  cpu_usage_percent  \
0                2               4       136.600696          59.249874   

   performance      cost      risk  
0      0.80788  0.009086  0.213881  

Pareto frontier saved with 1 optimal configurations


## 3. Failure Prediction System (Isolation Forest)


In [9]:
# Isolation Forest for anomaly detection
# Features for anomaly detection
anomaly_features = [
    'success_rate', 'accuracy_score', 'efficiency_score', 'execution_time_seconds',
    'response_latency_ms', 'memory_usage_mb', 'cpu_usage_percent', 'error_recovery_rate'
]

X_anomaly = df_model[anomaly_features].fillna(df_model[anomaly_features].mean())

# Scale features
scaler_anomaly = StandardScaler()
X_anomaly_scaled = scaler_anomaly.fit_transform(X_anomaly)

# Train Isolation Forest with Optuna optimization
def objective_iforest(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_samples': trial.suggest_float('max_samples', 0.5, 1.0),
        'contamination': trial.suggest_float('contamination', 0.01, 0.1),
        'random_state': 42
    }
    
    model = IsolationForest(**params)
    model.fit(X_anomaly_scaled)
    
    # Score based on expected anomaly rate vs actual
    predictions = model.predict(X_anomaly_scaled)
    anomaly_rate = (predictions == -1).sum() / len(predictions)
    
    # Target anomaly rate around 5%
    return -abs(anomaly_rate - 0.05)

study_iforest = optuna.create_study(direction='maximize')
study_iforest.optimize(objective_iforest, n_trials=20, show_progress_bar=True)

print(f"Best Isolation Forest parameters: {study_iforest.best_params}")


Best trial: 15. Best value: -0.0008: 100%|██████████| 20/20 [00:03<00:00,  5.82it/s]

Best Isolation Forest parameters: {'n_estimators': 73, 'max_samples': 0.8208188360782892, 'contamination': 0.049204343391846136}





In [10]:
# Train final Isolation Forest model
isolation_forest = IsolationForest(**study_iforest.best_params, random_state=42)
isolation_forest.fit(X_anomaly_scaled)

# Predict anomalies
df_model['anomaly_prediction'] = isolation_forest.predict(X_anomaly_scaled)
df_model['anomaly_score'] = -isolation_forest.score_samples(X_anomaly_scaled)

# Convert to failure probability (normalized anomaly score)
df_model['failure_probability'] = (df_model['anomaly_score'] - df_model['anomaly_score'].min()) / \
                                   (df_model['anomaly_score'].max() - df_model['anomaly_score'].min())

anomalies = df_model[df_model['anomaly_prediction'] == -1]
print(f"Failure Prediction System Results:")
print(f"  Anomalies detected: {len(anomalies)} ({len(anomalies)/len(df_model)*100:.2f}%)")
print(f"  Average failure probability: {df_model['failure_probability'].mean():.4f}")

# Identify root cause features for anomalies
print("\nRoot Cause Analysis (feature means for anomalies vs normal):")
for feature in anomaly_features:
    anomaly_mean = anomalies[feature].mean()
    normal_mean = df_model[df_model['anomaly_prediction'] == 1][feature].mean()
    diff_pct = (anomaly_mean - normal_mean) / normal_mean * 100 if normal_mean != 0 else 0
    if abs(diff_pct) > 10:
        print(f"  {feature}: {diff_pct:+.1f}% difference")


Failure Prediction System Results:
  Anomalies detected: 246 (4.92%)
  Average failure probability: 0.2209

Root Cause Analysis (feature means for anomalies vs normal):
  success_rate: +22.0% difference
  accuracy_score: +13.1% difference
  execution_time_seconds: +18.6% difference
  response_latency_ms: +164.4% difference
  memory_usage_mb: -11.9% difference
  error_recovery_rate: +18.8% difference


## 4. Agent Recommendation Engine (Cosine Similarity)


In [11]:
# Build agent capability profiles for recommendation
capability_features = [
    'task_complexity', 'autonomy_level', 'success_rate', 'accuracy_score', 'efficiency_score',
    'cost_per_task_cents', 'performance_index', 'cost_efficiency_ratio'
]

# Create agent profiles (aggregate by agent_type and model_architecture)
agent_profiles = df_model.groupby(['agent_type', 'model_architecture'])[capability_features].mean().reset_index()

# Scale capability features
scaler_rec = StandardScaler()
agent_capabilities_scaled = scaler_rec.fit_transform(agent_profiles[capability_features])

print(f"Agent Recommendation Engine:")
print(f"  Unique agent profiles: {len(agent_profiles)}")
print(f"  Capability dimensions: {len(capability_features)}")


Agent Recommendation Engine:
  Unique agent profiles: 160
  Capability dimensions: 8


In [12]:
# Recommendation function
def recommend_agent(task_requirements, top_k=5):
    """
    Find best agents for given task requirements.
    
    task_requirements: dict with keys matching capability_features
    Returns: top_k agent recommendations with similarity scores
    """
    # Create task profile vector
    task_vector = np.array([[
        task_requirements.get('task_complexity', 5),
        task_requirements.get('autonomy_level', 5),
        task_requirements.get('min_success_rate', 0.8),
        task_requirements.get('min_accuracy', 0.8),
        task_requirements.get('min_efficiency', 0.7),
        task_requirements.get('max_cost', 0.01),
        task_requirements.get('min_performance', 0.6),
        task_requirements.get('min_cost_efficiency', 50)
    ]])
    
    # Scale task vector
    task_scaled = scaler_rec.transform(task_vector)
    
    # Calculate cosine similarity
    similarities = cosine_similarity(task_scaled, agent_capabilities_scaled)[0]
    
    # Get top-k recommendations
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    recommendations = []
    for idx in top_indices:
        recommendations.append({
            'agent_type': agent_profiles.iloc[idx]['agent_type'],
            'model_architecture': agent_profiles.iloc[idx]['model_architecture'],
            'similarity_score': similarities[idx],
            'avg_performance': agent_profiles.iloc[idx]['performance_index'],
            'avg_cost': agent_profiles.iloc[idx]['cost_per_task_cents']
        })
    
    return pd.DataFrame(recommendations)

# Test recommendation
test_task = {
    'task_complexity': 7,
    'autonomy_level': 6,
    'min_success_rate': 0.85,
    'min_accuracy': 0.85,
    'min_efficiency': 0.75,
    'max_cost': 0.015,
    'min_performance': 0.7,
    'min_cost_efficiency': 60
}

print("Test Task Requirements:")
for k, v in test_task.items():
    print(f"  {k}: {v}")

print("\nTop 5 Agent Recommendations:")
recommendations = recommend_agent(test_task, top_k=5)
print(recommendations)


Test Task Requirements:
  task_complexity: 7
  autonomy_level: 6
  min_success_rate: 0.85
  min_accuracy: 0.85
  min_efficiency: 0.75
  max_cost: 0.015
  min_performance: 0.7
  min_cost_efficiency: 60

Top 5 Agent Recommendations:
            agent_type model_architecture  similarity_score  avg_performance  \
0    Translation Agent             GPT-4o          0.918448         0.572384   
1    Financial Advisor             GPT-4o          0.915257         0.574657   
2       Code Assistant        InstructGPT          0.910040         0.581674   
3  Marketing Assistant        InstructGPT          0.876312         0.572018   
4   Document Processor         Claude-3.5          0.866306         0.581190   

   avg_cost  
0  0.019164  
1  0.019323  
2  0.018562  
3  0.019051  
4  0.019530  


## Save All Models


In [13]:
# Save all models with versioning
import os

models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# 1. Performance Optimization Engine
joblib.dump(best_xgb_model, f'{models_dir}/xgb_performance_model_{MODEL_VERSION}.joblib')
joblib.dump(study_xgb.best_params, f'{models_dir}/xgb_best_params_{MODEL_VERSION}.joblib')

# 2. Isolation Forest (Failure Prediction)
joblib.dump(isolation_forest, f'{models_dir}/isolation_forest_{MODEL_VERSION}.joblib')
joblib.dump(scaler_anomaly, f'{models_dir}/scaler_anomaly_{MODEL_VERSION}.joblib')

# 3. Recommendation Engine components
joblib.dump(scaler_rec, f'{models_dir}/scaler_recommendation_{MODEL_VERSION}.joblib')
joblib.dump(agent_profiles, f'{models_dir}/agent_profiles_{MODEL_VERSION}.joblib')
joblib.dump(agent_capabilities_scaled, f'{models_dir}/agent_capabilities_{MODEL_VERSION}.joblib')

# 4. Label encoders
joblib.dump(label_encoders, f'{models_dir}/label_encoders_{MODEL_VERSION}.joblib')

# 5. Feature lists
model_metadata = {
    'version': MODEL_VERSION,
    'performance_features': available_features,
    'anomaly_features': anomaly_features,
    'capability_features': capability_features,
    'xgb_params': study_xgb.best_params,
    'iforest_params': study_iforest.best_params
}
joblib.dump(model_metadata, f'{models_dir}/model_metadata_{MODEL_VERSION}.joblib')

# Create latest symlinks
joblib.dump(best_xgb_model, f'{models_dir}/xgb_performance_model_latest.joblib')
joblib.dump(isolation_forest, f'{models_dir}/isolation_forest_latest.joblib')
joblib.dump(model_metadata, f'{models_dir}/model_metadata_latest.joblib')

print(f"All models saved to {models_dir}/ with version {MODEL_VERSION}")
print(f"\nSaved files:")
for f in os.listdir(models_dir):
    if f.endswith('.joblib'):
        print(f"  {f}")


All models saved to ../models/ with version 20251130_140311

Saved files:
  isolation_forest_latest.joblib
  xgb_best_params_20251130_140311.joblib
  xgb_performance_model_latest.joblib
  agent_profiles_20251130_140311.joblib
  scaler_anomaly_20251130_140311.joblib
  xgb_performance_model_20251130_140311.joblib
  model_metadata_latest.joblib
  scaler_recommendation_20251130_140311.joblib
  agent_capabilities_20251130_140311.joblib
  isolation_forest_20251130_140311.joblib
  label_encoders_20251130_140311.joblib
  model_metadata_20251130_140311.joblib


In [14]:
# Model Summary Report
print("=" * 80)
print("MODEL TRAINING SUMMARY REPORT")
print("=" * 80)

print(f"\nModel Version: {MODEL_VERSION}")

print("\n### 1. PERFORMANCE OPTIMIZATION ENGINE (XGBoost)")
print("-" * 40)
print(f"Target: business_value_score")
print(f"Features: {len(available_features)}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")
print(f"Top 3 SHAP Features: {', '.join(shap_importance.head(3)['feature'].tolist())}")

print("\n### 2. COST-PERFORMANCE TRADEOFF ANALYZER")
print("-" * 40)
print(f"Pareto-optimal configurations: {len(pareto_df)}")
print(f"Performance range: {pareto_df['performance'].min():.4f} - {pareto_df['performance'].max():.4f}")
print(f"Cost range: {pareto_df['cost'].min():.4f} - {pareto_df['cost'].max():.4f}")

print("\n### 3. FAILURE PREDICTION SYSTEM (Isolation Forest)")
print("-" * 40)
print(f"Anomalies detected: {len(anomalies)} ({len(anomalies)/len(df_model)*100:.2f}%)")
print(f"Contamination rate: {study_iforest.best_params['contamination']:.4f}")

print("\n### 4. AGENT RECOMMENDATION ENGINE")
print("-" * 40)
print(f"Agent profiles: {len(agent_profiles)}")
print(f"Capability dimensions: {len(capability_features)}")

print("\n### MODELS SAVED")
print("-" * 40)
print(f"Location: models/")
print(f"Version: {MODEL_VERSION}")
print("Files: xgb_performance_model, isolation_forest, scalers, encoders, metadata")

print("\n" + "=" * 80)


MODEL TRAINING SUMMARY REPORT

Model Version: 20251130_140311

### 1. PERFORMANCE OPTIMIZATION ENGINE (XGBoost)
----------------------------------------
Target: business_value_score
Features: 32
RMSE: 0.0048
R² Score: 0.9988
Top 3 SHAP Features: success_rate, human_intervention_required, cost_efficiency_ratio

### 2. COST-PERFORMANCE TRADEOFF ANALYZER
----------------------------------------
Pareto-optimal configurations: 1
Performance range: 0.8079 - 0.8079
Cost range: 0.0091 - 0.0091

### 3. FAILURE PREDICTION SYSTEM (Isolation Forest)
----------------------------------------
Anomalies detected: 246 (4.92%)
Contamination rate: 0.0492

### 4. AGENT RECOMMENDATION ENGINE
----------------------------------------
Agent profiles: 160
Capability dimensions: 8

### MODELS SAVED
----------------------------------------
Location: models/
Version: 20251130_140311
Files: xgb_performance_model, isolation_forest, scalers, encoders, metadata

