# Classical Baseline: CatBoost

**Classical machine learning baseline using CatBoost gradient boosting for comparison with quantum models.**

In [None]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    roc_curve,
    roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
import time

# One seed for the whole notebook: it is reused below for NumPy sampling,
# the sklearn splits and the CatBoost model so runs are repeatable.
SEED = 123
np.random.seed(seed=SEED)

print("Imports loaded successfully!")

In [None]:
# Download and load dataset
# The value returned by kagglehub is used as the directory that contains
# neo.csv; the CSV is read into the DataFrame used by every later cell.
path = kagglehub.dataset_download("sameepvani/nasa-nearest-earth-objects")
csv_file = f"{path}/neo.csv"
df = pd.read_csv(csv_file)

print(f"Dataset shape: {df.shape}")
# Keep this as the last expression so the notebook renders the preview.
df.head()

In [None]:
# Predictor columns passed to the model (they are standardised later on)
# and the label column that is compared against True/False when sampling.
features = [
    'est_diameter_min',
    'est_diameter_max',
    'relative_velocity',
    'miss_distance',
    'absolute_magnitude',
]
target = 'hazardous'

print(f"Features: {features}")
print(f"Target: {target}")

In [None]:
# Create balanced dataset
# Undersample both classes to the same number of rows so the subset is 50/50.
N = 8840  # requested rows per class
np.random.seed(SEED)  # re-seed so this cell is reproducible when re-run alone

# Element-wise comparison on the label column: `==` is required here, not `is`.
df_true = df[df[target] == True]
df_false = df[df[target] == False]

# Sampling without replacement raises ValueError when more rows are requested
# than a class contains, so cap N at the smaller class size. When both classes
# have at least N rows this is a no-op and the sampled rows are unchanged.
N = min(N, len(df_true), len(df_false))

# Draw order (positive class first, then negative) is kept fixed because both
# draws consume the same global NumPy random stream.
df_subset_true = df_true.iloc[np.random.choice(len(df_true), size=N, replace=False)]
df_subset_false = df_false.iloc[np.random.choice(len(df_false), size=N, replace=False)]

df_subset = pd.concat([df_subset_true, df_subset_false])

print(f"Balanced dataset shape: {df_subset.shape}")
print("Class distribution:")
print(df_subset[target].value_counts())

In [None]:
# Two-stage stratified split with a shared configuration:
#   1) balanced data -> 80% train+validation / 20% test
#   2) train+validation -> 80% train / 20% validation
# i.e. roughly 64% / 16% / 20% of the balanced data overall.
_split_opts = {"test_size": 0.2, "random_state": SEED}

# Stage 1: hold out the test set, keeping the class ratio via stratify.
train_df, test_df = train_test_split(
    df_subset, stratify=df_subset[target], **_split_opts
)

# Stage 2: carve the validation set out of the remaining training rows.
train_split_df, val_df = train_test_split(
    train_df, stratify=train_df[target], **_split_opts
)

print(f"Train shape: {train_split_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Extract features and labels
# Feature matrices are the selected columns as NumPy arrays; labels are cast
# to integers with a vectorised astype() instead of a per-row Python lambda.
X_train = train_split_df[features].to_numpy()
Y_train = train_split_df[target].astype(int).to_numpy()

X_val = val_df[features].to_numpy()
Y_val = val_df[target].astype(int).to_numpy()

X_test = test_df[features].to_numpy()
Y_test = test_df[target].astype(int).to_numpy()

# Scale features
# The scaler is fitted on the training split only; validation and test are
# transformed with those training statistics so no information leaks from them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"X_train shape: {X_train_scaled.shape}")
print(f"X_val shape: {X_val_scaled.shape}")
print(f"X_test shape: {X_test_scaled.shape}")
print("\nClass balance in training set:")
print(f"  Class 0: {(Y_train == 0).sum()}")
print(f"  Class 1: {(Y_train == 1).sum()}")

In [None]:
# ============================================
# CATBOOST CLASSIFIER
# ============================================

print("=" * 60)
print("Training CatBoost Classifier")
print("=" * 60)

# Create CatBoost classifier (seeded so the baseline is repeatable)
catboost_model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=4,
    verbose=False,
    random_seed=SEED
)

# Train on scaled data, timing only the fit call.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted, which would distort the reported training time.
# NOTE(review): the validation split is passed as eval_set during fitting, so
# the validation metrics below may not be a fully independent estimate - confirm.
start_time = time.perf_counter()
catboost_model.fit(X_train_scaled, Y_train, eval_set=(X_val_scaled, Y_val))
train_time = time.perf_counter() - start_time

print(f"\nTraining completed in {train_time:.2f}s")

# Validation predictions: hard labels for accuracy, column 1 of
# predict_proba (class 1 score) for ROC-AUC.
val_pred_class = catboost_model.predict(X_val_scaled)
val_pred_proba = catboost_model.predict_proba(X_val_scaled)[:, 1]
val_accuracy = accuracy_score(Y_val, val_pred_class)
val_roc_auc = roc_auc_score(Y_val, val_pred_proba)

print("\nValidation Results:")
print(f"  Accuracy: {val_accuracy:.4f}")
print(f"  ROC-AUC: {val_roc_auc:.4f}")

# Test predictions: same metrics on the held-out test split.
test_pred_class = catboost_model.predict(X_test_scaled)
test_pred_proba = catboost_model.predict_proba(X_test_scaled)[:, 1]
test_accuracy = accuracy_score(Y_test, test_pred_class)
test_roc_auc = roc_auc_score(Y_test, test_pred_proba)

print("\nTest Results:")
print(f"  Accuracy: {test_accuracy:.4f}")
print(f"  ROC-AUC: {test_roc_auc:.4f}")

print(f"\n{'='*60}")

In [None]:
# ============================================
# DETAILED EVALUATION
# ============================================

# Text output first: test-set confusion matrix, then the per-class report.
cm = confusion_matrix(Y_test, test_pred_class)
print("Confusion Matrix:")
print(cm)
print()

print("Classification Report:")
print(classification_report(Y_test, test_pred_class))

# ROC points are computed before plotting; thresholds are kept but not drawn.
fpr, tpr, thresholds = roc_curve(Y_test, test_pred_proba)

# Shared text styling for both panels.
label_style = {'fontsize': 12}
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# One figure, two panels: heatmap on the left, ROC curve on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
heatmap_ax, roc_ax = axes

# Left panel: confusion matrix with integer cell annotations.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=heatmap_ax)
heatmap_ax.set_xlabel('Predicted', **label_style)
heatmap_ax.set_ylabel('True', **label_style)
heatmap_ax.set_title('Confusion Matrix', **title_style)

# Right panel: model ROC curve against the diagonal "random" reference line.
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {test_roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', **label_style)
roc_ax.set_ylabel('True Positive Rate', **label_style)
roc_ax.set_title('ROC Curve', **title_style)
roc_ax.legend(loc="lower right", fontsize=10)
roc_ax.grid(alpha=0.3)

# Save to disk before show(), then display inline.
plt.tight_layout()
plt.savefig('catboost_evaluation.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ============================================
# FEATURE IMPORTANCE
# ============================================

# Pair each input column name with the importance score reported by the model.
feature_names = features
feature_importance = catboost_model.get_feature_importance()

# Table of (feature, importance), most important first.
fi_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values(by='Importance', ascending=False)

print("Feature Importance:")
print(fi_df)
print()

# Horizontal bar chart drawn on the new figure's current axes.
plt.figure(figsize=(10, 6))
importance_ax = plt.gca()
importance_ax.barh(fi_df['Feature'], fi_df['Importance'], color='steelblue', edgecolor='navy')
importance_ax.set_xlabel('Importance', fontsize=12)
importance_ax.set_ylabel('Feature', fontsize=12)
importance_ax.set_title('CatBoost Feature Importance', fontsize=14, fontweight='bold')
# barh draws the first row at the bottom; flip so the top-ranked feature is on top.
importance_ax.invert_yaxis()
importance_ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('catboost_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ============================================
# SUMMARY
# ============================================

# Read the hyperparameters back from the estimator instead of repeating them
# as literals, so the summary cannot drift from the model that was trained.
# A parameter that was not set explicitly on the estimator is shown as "default".
model_params = catboost_model.get_params()

print("=" * 60)
print("CATBOOST CLASSICAL BASELINE SUMMARY")
print("=" * 60)
print("\nModel Configuration:")
print(f"  Iterations: {model_params.get('iterations', 'default')}")
print(f"  Learning Rate: {model_params.get('learning_rate', 'default')}")
print(f"  Depth: {model_params.get('depth', 'default')}")
print(f"  Features: {len(features)}")
print("\nDataset:")
print(f"  Training samples: {len(X_train_scaled)}")
print(f"  Validation samples: {len(X_val_scaled)}")
print(f"  Test samples: {len(X_test_scaled)}")
print("\nPerformance:")
print(f"  Validation Accuracy: {val_accuracy:.4f}")
print(f"  Validation ROC-AUC: {val_roc_auc:.4f}")
print(f"  Test Accuracy: {test_accuracy:.4f}")
print(f"  Test ROC-AUC: {test_roc_auc:.4f}")
print(f"  Training Time: {train_time:.2f}s")
print(f"\n{'='*60}")