In [9]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score


In [10]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Separate features and target
X = train_df.drop('target', axis=1, errors='ignore')  # Assuming 'target' is the column name, adjust if different
y = train_df['target'] if 'target' in train_df.columns else None


In [15]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

# Initialize CatBoost model
model = CatBoostClassifier(
    iterations=10000,
    learning_rate=0.03,
    depth=8,
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=42,
    early_stopping_rounds=500,
    verbose=100
)

# Train the model
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    plot=True
)

# Evaluate the model on validation set
val_preds = model.predict(X_val)
val_probs = model.predict_proba(X_val)[:, 1]

print("\nValidation Metrics:")
print(f"Accuracy: {accuracy_score(y_val, val_preds):.4f}")
print(f"ROC AUC: {roc_auc_score(y_val, val_probs):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, val_preds))

# Make predictions on test data
test_preds = model.predict_proba(test_df)[:, 1]

# Create submission dataframe
submission = pd.DataFrame({
    'id': test_df.index,
    'prediction': test_preds
})

# Save predictions to file
submission.to_csv('catboost_submission.csv', index=False)
print("\nPredictions saved to 'catboost_submission.csv'")

Training set shape: (1030, 16)
Validation set shape: (258, 16)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.8572816	test: 0.8372093	best: 0.8372093 (0)	total: 4.71ms	remaining: 47.1s
100:	learn: 0.9310680	test: 0.8604651	best: 0.8643411 (7)	total: 349ms	remaining: 34.3s
200:	learn: 0.9640777	test: 0.8643411	best: 0.8643411 (7)	total: 957ms	remaining: 46.7s
300:	learn: 0.9825243	test: 0.8604651	best: 0.8643411 (7)	total: 1.44s	remaining: 46.3s
400:	learn: 0.9932039	test: 0.8527132	best: 0.8643411 (7)	total: 1.93s	remaining: 46.2s
500:	learn: 0.9980583	test: 0.8565891	best: 0.8643411 (7)	total: 2.49s	remaining: 47.3s
Stopped by overfitting detector  (500 iterations wait)

bestTest = 0.8643410853
bestIteration = 7

Shrink model to first 8 iterations.

Validation Metrics:
Accuracy: 0.8643
ROC AUC: 0.9256

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       116
           1       0.86      0.89      0.88       142

    accuracy                           0.86       258
   macro avg       0.86      0.86      0.8