Predictive Modeling for Performance Index

In [34]:
from pathlib import Path

# Anchor every path at the repository root: when the kernel was started from a
# folder named "notebook" (any casing), step up one level.
_start_dir = Path.cwd()
REPO_ROOT = _start_dir.parent if _start_dir.name.lower() == "notebook" else _start_dir

# Project locations derived from the root
OUT_DIR = REPO_ROOT / "outputs"
MODEL_DIR = REPO_ROOT / "models"
DATA_PATH = REPO_ROOT.joinpath("data", "cleaned_data.csv")
OUT_PATH = OUT_DIR.joinpath("ml_results.csv")

# Create the writable folders up front (no error if they already exist)
for _folder in (OUT_DIR, MODEL_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# Fail fast when the input dataset is missing
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Could not find dataset at {DATA_PATH}.\n")

# Echo the resolved locations so the run is easy to audit
for _label, _location in (
    ("Repo root:", REPO_ROOT),
    ("DATA_PATH:", DATA_PATH),
    ("OUT_PATH :", OUT_PATH),
    ("MODEL_DIR:", MODEL_DIR),
):
    print(_label, _location)


Repo root: /Users/sayanibrahmachari/TG03
DATA_PATH: /Users/sayanibrahmachari/TG03/data/cleaned_data.csv
OUT_PATH : /Users/sayanibrahmachari/TG03/outputs/ml_results.csv
MODEL_DIR: /Users/sayanibrahmachari/TG03/models


In [35]:
import os

# Create the project folders under the repository root resolved in the setup
# cell (DATA_PATH / OUT_DIR / MODEL_DIR) instead of relative to the current
# working directory. Relative "data" and "outputs/models" folders end up inside
# the notebook folder when the kernel starts there, and no cell reads from or
# writes to them.
for _folder in (DATA_PATH.parent, OUT_DIR, MODEL_DIR):
    os.makedirs(_folder, exist_ok=True)

Imports & paths

In [36]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import joblib

Load & quick sanity check

In [37]:
from pathlib import Path
import pandas as pd
# Load the cleaned dataset from the path resolved in the setup cell
df = pd.read_csv(DATA_PATH)

# Plain-text summary: (rows, columns) first, then the dtype of every column
for _summary in (df.shape, df.dtypes):
    print(_summary)

# Rich preview of the first rows (last expression -> cell output)
df.head()

(5000, 26)
agent_id                        object
agent_type                      object
model_architecture              object
deployment_environment          object
task_category                   object
task_complexity                  int64
autonomy_level                   int64
success_rate                   float64
accuracy_score                 float64
efficiency_score               float64
execution_time_seconds         float64
response_latency_ms            float64
memory_usage_mb                float64
cpu_usage_percent              float64
cost_per_task_cents            float64
human_intervention_required       bool
error_recovery_rate            float64
multimodal_capability             bool
edge_compatibility                bool
privacy_compliance_score       float64
bias_detection_score           float64
timestamp                       object
data_quality_score             float64
performance_index              float64
cost_efficiency_ratio          float64
autonomous_cap

Unnamed: 0,agent_id,agent_type,model_architecture,deployment_environment,task_category,task_complexity,autonomy_level,success_rate,accuracy_score,efficiency_score,...,error_recovery_rate,multimodal_capability,edge_compatibility,privacy_compliance_score,bias_detection_score,timestamp,data_quality_score,performance_index,cost_efficiency_ratio,autonomous_capability_score
0,AG_01012,Project Manager,PaLM-2,Server,Text Processing,5,3,0.4788,0.6455,0.6573,...,0.4999,False,False,0.939,0.8061,2024-12-24 04:16:15,0.951,0.58236,50.203448,64.993
1,AG_00758,Marketing Assistant,Mixtral-8x7B,Hybrid,Decision Making,6,5,0.4833,0.566,0.5844,...,0.558,False,False,0.8281,0.7816,2024-12-24 04:16:15,0.7822,0.53844,69.030769,89.06
2,AG_00966,QA Tester,Mixtral-8x7B,Server,Communication,2,4,0.8116,0.8395,0.765,...,0.9196,False,False,0.745,0.8214,2024-12-24 04:16:15,0.7621,0.80599,127.934921,124.372
3,AG_00480,Code Assistant,CodeT5+,Hybrid,Creative Writing,8,6,0.3574,0.4888,0.4742,...,0.3809,True,False,0.9653,0.8684,2024-12-24 04:16:15,0.8117,0.43186,21.066341,86.663
4,AG_01050,QA Tester,Falcon-180B,Edge,Planning & Scheduling,3,4,0.5706,0.7137,0.7209,...,0.6717,False,True,0.9042,0.8417,2024-12-24 04:16:15,0.7762,0.65862,57.271304,87.019


Target detection & feature split
The column performance_index is our target variable — we’ll predict it using other columns.
We also remove any id-like columns that don’t contribute to prediction, and detect which columns are numeric vs. categorical.

In [38]:
import re

TARGET_COL = "performance_index"

# The target is only usable for regression when it exists and is numeric.
has_target = TARGET_COL in df.columns and np.issubdtype(df[TARGET_COL].dtype, np.number)
print("Has numeric target:", has_target)

if has_target:
    y = df[TARGET_COL].values
    X = df.drop(columns=[TARGET_COL])
else:
    # Keep `y` defined so later cells see an explicit None instead of hitting a
    # NameError when no numeric target is available.
    y = None
    X = df.copy()

# Drop ID-like columns that don't help learning.
# Match whole name tokens rather than substrings: a plain `"id" in name` test
# would also discard columns such as "valid_ratio", "width" or "hybrid_mode".
ID_TOKENS = {"id", "uuid", "uid"}


def _is_id_like(column_name):
    """Return True when one token of the column name is an ID marker.

    The name is split into runs of letters/digits, also breaking at camelCase
    boundaries ("agent_id" -> agent, id; "userId" -> user, Id), and every token
    is compared case-insensitively against ID_TOKENS.
    """
    tokens = re.findall(r"[A-Z]+(?![a-z])|[A-Z]?[a-z]+|\d+", str(column_name))
    return any(tok.lower() in ID_TOKENS for tok in tokens)


id_like = [c for c in X.columns if _is_id_like(c)]
if id_like:
    X = X.drop(columns=id_like)
    print("Dropped ID-like columns:", id_like)

# Everything that is not a numeric dtype (strings, booleans, timestamps stored
# as text) is routed to the categorical branch of the preprocessor.
num_cols = [c for c in X.columns if np.issubdtype(X[c].dtype, np.number)]
cat_cols = [c for c in X.columns if c not in num_cols]
print(f"Numeric: {len(num_cols)} | Categorical: {len(cat_cols)}")


Has numeric target: True
Dropped ID-like columns: ['agent_id']
Numeric: 16 | Categorical: 8


Preprocessing pipeline
Machine learning models require consistent numeric input.
For numeric columns, we fill missing values with the median and scale them.
For categorical columns, we fill missing values with the most frequent value and one-hot encode them (convert text categories into numeric flags).
Combining these steps ensures the model gets clean, standardized data.

In [39]:
# Numeric branch: fill gaps with the column median, then standardise.
_numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_tf = Pipeline(steps=_numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps transform from failing on categories
# that were not present during fit.
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_tf = Pipeline(steps=_categorical_steps)

# Route each column list to its branch; columns in neither list are not passed on.
_branches = [
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols),
]
preprocessor = ColumnTransformer(transformers=_branches)


Regression Modeling
Since we have a numeric target (performance_index), we train three regression models:
Linear Regression — simple and interpretable
Ridge Regression — regularized version that prevents overfitting
Random Forest — ensemble model capturing complex relationships
For each model, we print:
R² (coefficient of determination) — the share of the target's variance explained by the predictions (1.0 = perfect)
RMSE — Root Mean Squared Error, smaller is better
We then pick the best model (highest R²) and save it for reuse.

In [40]:
from sklearn.base import clone

# Outputs of this cell, reused by later cells:
#   results_reg : {model name: {"r2": ..., "rmse": ...}} on the hold-out split
#   best_model  : fitted Pipeline with the highest hold-out R² (None if skipped)
#   best_name   : key of that pipeline in `candidates`
results_reg = {}
best_model = None
best_name = None

if has_target:
    # Train/validation split (80/20, fixed seed for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42
    )

    # Candidate regressors
    candidates = {
        "Linear": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "RandomForest": RandomForestRegressor(n_estimators=300, random_state=42),
    }

    # Train & evaluate each model (R² target ≥ 0.70 per TG-03)
    best_r2 = float("-inf")
    for name, estimator in candidates.items():
        # Each pipeline gets its OWN copy of the preprocessor. Reusing the shared
        # `preprocessor` object would mean that any later fit of that object
        # (e.g. on the full dataset for clustering) also changes the transformer
        # inside the already-trained model.
        pipe = Pipeline(steps=[
            ("prep", clone(preprocessor)),   # impute + encode + scale
            ("model", estimator),
        ])
        pipe.fit(X_train, y_train)

        # Predictions + metrics on the hold-out split
        pred = pipe.predict(X_test)
        r2 = r2_score(y_test, pred)
        mse = mean_squared_error(y_test, pred)  # compatible with older sklearn
        rmse = mse ** 0.5                        # RMSE = sqrt(MSE)

        results_reg[name] = {"r2": r2, "rmse": rmse}
        print(f"{name}: R²={r2:.3f} | RMSE={rmse:.4f}")

        # Track the best by R² (the first candidate always seeds the comparison)
        if best_model is None or r2 > best_r2:
            best_r2 = r2
            best_model = pipe
            best_name = name

    # Persist the best model
    print("\nBest:", best_name, results_reg[best_name])
    joblib.dump(best_model, MODEL_DIR / f"{best_name}_regressor.joblib")

else:
    print("No numeric target detected; skipping regression.")

Linear: R²=1.000 | RMSE=0.0000
Ridge: R²=1.000 | RMSE=0.0003
RandomForest: R²=0.999 | RMSE=0.0044

Best: Linear {'r2': 0.999999999952692, 'rmse': 9.245902433500903e-07}


A. Is any feature basically the target?
We observed R² ≈ 1.00. That can be legitimate (if the target is a formula of your features), but it can also be caused by leakage.
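This check confirms whether any single numeric feature is identical to the target (within an absolute tolerance of 1e-12) or virtually a copy of it (|correlation| > 0.9999). It does not detect a target that is an exact combination of several features.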
In our run, leaky_cols == [], so no single feature is a copy of the target.


In [41]:
target = "performance_index"

# Restrict the scan to feature columns with a numeric dtype
num_feat_cols = [c for c in X.columns if np.issubdtype(df[c].dtype, np.number)]

# Target as floats; it is the same for every feature, so convert it once
t = df[target].astype(float)

leaky_cols = []
for c in num_feat_cols:
    s = df[c].astype(float)

    # Element-wise equality with the target within an absolute tolerance of 1e-12
    same = np.allclose(s.values, t.values, rtol=0, atol=1e-12)

    # Pearson correlation with the target; constant columns are skipped (NaN)
    corr = np.corrcoef(s.values, t.values)[0, 1] if s.nunique(dropna=True) > 1 else np.nan

    # Record the column together with the reason it was flagged
    if same:
        leaky_cols.append((c, "exact"))
    elif pd.notna(corr) and abs(corr) > 0.9999:
        leaky_cols.append((c, f"corr={corr:.6f}"))

leaky_cols


[]

Duplicate rows

In [28]:
# Count rows that repeat an earlier row once the target column is removed.
# NOTE(review): the comparison still includes agent_id — confirm that is intended.
_rows_without_target = df.drop(columns=["performance_index"])
dup_count = _rows_without_target.duplicated().sum()
print("Duplicate rows (excluding target):", dup_count)

Duplicate rows (excluding target): 0


Cross Validation
To check that our model generalizes well, we use 5-fold cross-validation.
This splits the data into five parts, trains on four, and tests on one — repeating five times.
A stable R² (close to 1.0 across folds) means the model is consistent.

In [29]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

# R² scorer and a shuffled 5-fold splitter with a fixed seed
scorer = make_scorer(r2_score)
cv = KFold(n_splits=5, shuffle=True, random_state=42)


def cv_r2(estimator):
    """Cross-validate `estimator` behind the shared preprocessor.

    Returns a tuple (mean R², standard deviation of R²) over the folds of `cv`,
    computed on the full feature matrix `X` and target `y`.
    """
    fold_scores = cross_val_score(
        Pipeline([("prep", preprocessor), ("model", estimator)]),
        X,
        y,
        scoring=scorer,
        cv=cv,
    )
    return fold_scores.mean(), fold_scores.std()

# Same three candidates as the hold-out evaluation, scored with 5-fold CV
for name, est in (
    ("Linear", LinearRegression()),
    ("Ridge", Ridge(alpha=1.0)),
    ("RandomForest", RandomForestRegressor(n_estimators=300, random_state=42)),
):
    mean_r2, std_r2 = cv_r2(est)
    print(f"{name} CV R²: {mean_r2:.3f} ± {std_r2:.3f}")

Linear CV R²: 1.000 ± 0.000
Ridge CV R²: 1.000 ± 0.000
RandomForest CV R²: 0.999 ± 0.000


Clustering
Even though we can predict performance, it’s also useful to see natural groupings in the data.
We use KMeans clustering to group the records into clusters using all preprocessed feature columns (cost, latency, and performance characteristics among them).
We test values of k (number of clusters) from 2 to 6.
For each, we calculate the Silhouette Score — a measure of cluster quality.
The best k (highest silhouette) is chosen automatically.
Cluster labels are later added to the dataset for visualization.

In [32]:
# --- clean clustering run ---

from scipy import sparse
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from pathlib import Path

# 1) Prepare features
# Fit a COPY of the preprocessor on the full dataset. Fitting the shared
# `preprocessor` object here would also refit the transformer embedded in the
# already-trained regression pipeline and change its predictions.
cluster_pipe = Pipeline(steps=[("prep", clone(preprocessor))])
X_prepared = cluster_pipe.fit_transform(X)

# 2) Force dense + sanitize
# The one-hot branch can make the transformer return a sparse matrix.
if sparse.issparse(X_prepared):
    X_prepared = X_prepared.toarray()

# Replace NaN/Inf with 0 (you could also drop offending cols/rows if desired)
X_prepared = np.nan_to_num(X_prepared, copy=False, nan=0.0, posinf=0.0, neginf=0.0)

# 3) Try several k, pick best by silhouette
best_k = None
best_sil = -1
best_labels = None
n_samples = X_prepared.shape[0]

for k in [2, 3, 4, 5, 6]:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)  # use int n_init for compat
    labels = km.fit_predict(X_prepared)

    # silhouette requires > k samples and >1 unique label; otherwise score as -1
    if n_samples > k and len(set(labels)) > 1:
        sil = silhouette_score(X_prepared, labels)
    else:
        sil = -1
    print(f"k={k} → silhouette={sil:.3f}")

    if sil > best_sil:
        best_sil, best_k, best_labels = sil, k, labels

print(f"\n Best k = {best_k} with silhouette = {best_sil:.3f}")

# 4) Build Tableau output
df_results = df.copy()

# Only attach labels when some k produced a valid clustering; otherwise the
# column would be filled with None.
if best_labels is not None:
    df_results["cluster"] = best_labels

if has_target and (best_model is not None):
    preds_all = best_model.predict(X)
    df_results["predicted_performance"] = preds_all

# Write to the repo-anchored OUT_PATH from the setup cell rather than a path
# relative to the current working directory, so the file lands in one place no
# matter where the kernel was started.
out_path = OUT_PATH
out_path.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(out_path, index=False)
print(f" Saved → {out_path.resolve()}")

# quick cluster sizes
if best_labels is not None:
    print("\nCluster sizes:")
    print(df_results['cluster'].value_counts().sort_index().to_string())


  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b


k=2 → silhouette=0.229


  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight


k=3 → silhouette=0.138


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight


k=4 → silhouette=0.098


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight
  current_pot = closest_dist_sq @ sample_weight


k=5 → silhouette=0.085
k=6 → silhouette=0.084

 Best k = 2 with silhouette = 0.229
 Saved → /Users/sayanibrahmachari/TG03/Notebook/outputs/ml_results.csv

Cluster sizes:
cluster
0    2542
1    2458


  ret = a @ b
  ret = a @ b
  ret = a @ b


Build ml_results.csv
We create a new dataset containing:
The original data
Predicted performance values
Cluster labels
This combined file (ml_results.csv) is exported to the outputs/ folder and will be used in Tableau for dashboards.

In [33]:
# Collect the derived columns first, then attach them to a copy of the data
_extra_columns = {}

# cluster labels (only when the clustering step produced some)
if best_labels is not None:
    _extra_columns["cluster"] = best_labels

# predictions from the selected regression pipeline, when regression ran
_regression_ran = has_target and best_model is not None
if _regression_ran:
    preds_all = best_model.predict(X)
    _extra_columns["predicted_performance"] = preds_all
    print(f"Best regression model: {best_name}  | R²={results_reg[best_name]['r2']:.3f} (target ≥ 0.70)")

# `assign` returns a new frame, so `df` itself is left untouched
df_results = df.assign(**_extra_columns)

# save next to the other outputs, creating the folder if necessary
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH.resolve())

Best regression model: Linear  | R²=1.000 (target ≥ 0.70)
Saved: /Users/sayanibrahmachari/TG03/Notebook/outputs/ml_results.csv


Explanation – Predictive Modeling & Clustering
Regression Results -
Best model: Linear Regression
Achieved R² = 1.00 (cross-validated)
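Indicates performance_index is a deterministic function of other features (the five sample rows displayed after loading satisfy performance_index = 0.4 × success_rate + 0.3 × accuracy_score + 0.3 × efficiency_score exactly)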
Acceptance criterion (≥ 0.70) met 
Clustering Results -
Best k = 2, Silhouette = 0.229
Roughly equal cluster sizes (≈ 2500 records each)
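Suggests two major AI model groups with different cost–performance patterns, although a silhouette of 0.229 indicates weak, overlapping separation, so the split is best treated as exploratory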
Exports -
Predictions (predicted_performance) and cluster labels (cluster) saved to outputs/ml_results.csv