In [20]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import (
    f1_score, precision_score, recall_score, accuracy_score,
    confusion_matrix, classification_report
)
import warnings
warnings.filterwarnings('ignore')

In [21]:
train = pd.read_csv("../data_processed/ispu_train_ready.csv")
test = pd.read_csv("../data_processed/ispu_preprocessed_2023_2025.csv")
sample_submission = pd.read_csv("../sample_submission.csv")

In [22]:
X_train = train.drop(columns=["kategori_encoded"])
y_train = train["kategori_encoded"]

X_test = test.drop(columns=["tanggal", "stasiun_id", "kategori", "kategori_encoded"], errors='ignore')

In [23]:
from sklearn.model_selection import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [24]:
train_pool = Pool(X_tr, y_tr)
val_pool = Pool(X_val, y_val)

In [None]:
# Improved model with better hyperparameters
model = CatBoostClassifier(
    iterations=1500,
    depth=7,
    learning_rate=0.03,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    colsample_bylevel=0.8,
    l2_leaf_reg=5,
    loss_function="MultiClass",
    eval_metric="MultiClass",
    verbose=0,
    random_state=42,
    early_stopping_rounds=100
)

model.fit(train_pool, eval_set=val_pool, verbose=False)
print("âœ“ Model training completed!")

CatBoostError: catboost/private/libs/options/catboost_options.cpp:794: Error: default bootstrap type (bayesian) doesn't support 'subsample' option

In [None]:
val_preds = model.predict(val_pool)

print("\n" + "="*70)
print("MODEL EVALUATION ON VALIDATION SET")
print("="*70)

# Calculate all metrics
accuracy = accuracy_score(y_val, val_preds)
precision_macro = precision_score(y_val, val_preds, average="macro", zero_division=0)
recall_macro = recall_score(y_val, val_preds, average="macro", zero_division=0)
f1_macro = f1_score(y_val, val_preds, average="macro", zero_division=0)

precision_weighted = precision_score(y_val, val_preds, average="weighted", zero_division=0)
recall_weighted = recall_score(y_val, val_preds, average="weighted", zero_division=0)
f1_weighted = f1_score(y_val, val_preds, average="weighted", zero_division=0)

print(f"\nðŸ“Š Overall Accuracy: {accuracy:.4f}")
print(f"\nðŸ“ˆ Macro Metrics (unweighted average):") 
print(f"   Precision: {precision_macro:.4f}")
print(f"   Recall:    {recall_macro:.4f}")
print(f"   F1-Score:  {f1_macro:.4f}")
print(f"\nðŸ“Š Weighted Metrics (weighted by support):")
print(f"   Precision: {precision_weighted:.4f}")
print(f"   Recall:    {recall_weighted:.4f}")
print(f"   F1-Score:  {f1_weighted:.4f}")

print(f"\nðŸ“‹ Classification Report:\n{classification_report(y_val, val_preds, zero_division=0)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_val, val_preds)}")

F1 Macro: 1.0
F1 Weighted: 1.0


In [None]:
print("\n" + "="*70)
print("GENERATING PREDICTIONS & SUBMISSION")
print("="*70)

# Filter test data to match sample_submission
print(f"Sample submission shape: {sample_submission.shape}")
print(f"Full test data shape: {X_test.shape}")

# Use only the data needed for submission
if len(X_test) > len(sample_submission):
    X_test_submission = X_test.iloc[:len(sample_submission)].reset_index(drop=True)
else:
    X_test_submission = X_test.reset_index(drop=True)

# Predict on test set
test_preds = model.predict(X_test_submission)

print(f"\nTest set for submission shape: {X_test_submission.shape}")
print(f"Predictions shape: {test_preds.shape}")
print(f"\nPrediction distribution:")
print(pd.Series(test_preds).value_counts().sort_index())

# Create submission
submission = sample_submission.copy()
submission["category"] = test_preds

submission_path = "../hasil submission/submission_catboost.csv"
submission.to_csv(submission_path, index=False)
print(f"\nâœ“ Submission saved to: {submission_path}")
print(f"\nSubmission preview (first 10 rows):")
print(submission.head(10))
print(f"\nTotal submissions: {len(submission)}")

# Feature importance
print("\n" + "="*70)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*70)
feature_importance = model.get_feature_importance(val_pool)
feature_names = X_val.columns
feature_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)
print("\nTop 15 Most Important Features:")
print(feature_df.head(15).to_string(index=False))

ValueError: Length of values (4831) does not match length of index (455)