# Tabular classification with CatBoost (pandas)

This notebook trains a CatBoost classifier on a tabular CSV dataset located in `Dangue_Dataset/dataset.csv`. It includes EDA, preprocessing, training, evaluation, and model export.

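The target column is auto-detected from common names (`target`, `label`, `class`, `diagnosis`), falling back to the last column of the CSV. If that guess is wrong for your data, set `TARGET_COLUMN` explicitly in the detection cell below.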

In [1]:
# Install (optional) - run in notebook kernel if needed
!pip install -r ../requirements.txt

Collecting pandas (from -r ../requirements.txt (line 9))
  Downloading pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (79 kB)
Collecting seaborn (from -r ../requirements.txt (line 10))
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting catboost (from -r ../requirements.txt (line 11))
  Downloading catboost-1.2.8-cp313-cp313-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost->-r ../requirements.txt (line 11))
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost->-r ../requirements.txt (line 11))
  Downloading plotly-6.5.2-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost->-r ../requirements.txt (line 11))
  Downloading narwhals-2.15.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m1.3 MB/s[0m  [33m0:00:07[0m et

In [2]:
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from catboost import CatBoostClassifier, Pool
import joblib

# Relative location of the input CSV (one directory above the working directory).
DATA_CSV = Path('..', 'Dangue_Dataset', 'dataset.csv')

# Echo the resolved path and whether it is present, so a wrong working
# directory is obvious before the load cell runs.
for label, value in (('data path:', DATA_CSV), ('exists:', DATA_CSV.exists())):
    print(label, value)

data path: ../Dangue_Dataset/dataset.csv
exists: True


In [None]:
# Read the raw table from disk into a DataFrame
df = pd.read_csv(DATA_CSV)
print('rows,cols:', df.shape)

# Preview of the first rows
head_preview = df.head()
display(head_preview)

# Number of missing values in each column
missing_per_column = df.isna().sum()
display(missing_per_column)

In [None]:
# Choose the target column: the first well-known name present in the frame,
# otherwise the last column (with a warning so the guess is visible).
common_targets = ['target','label','class','diagnosis']
TARGET_COLUMN = next(
    (candidate for candidate in common_targets if candidate in df.columns),
    None,
)
if TARGET_COLUMN is not None:
    print('Using detected target column:', TARGET_COLUMN)
else:
    TARGET_COLUMN = df.columns[-1]
    print(f'Warning: no common target found; using last column: {TARGET_COLUMN}')

In [None]:
# Separate features (everything except the target column) and the target itself
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Basic EDA: how many rows fall into each target class.
# The newline must be written as the escape sequence `\n`; a raw line break
# inside a single-quoted string literal is a SyntaxError.
print('target value counts:\n', y.value_counts())
ax = sns.countplot(x=y)
ax.set_title('Target distribution')
plt.show()

In [None]:
# Identify categorical columns automatically.
# A column is treated as categorical when it is either
#   * text-like (object, pandas string dtype, or category), or
#   * numeric of any integer/float width (bool excluded) with few distinct values.
# Dtypes are checked per column through pandas.api.types so the result does not
# depend on which dtype aliases `select_dtypes` happens to match in the
# installed pandas version.

# Numeric columns with at most this many distinct (non-missing) values are
# handled as categorical.
MAX_CATEGORICAL_CARDINALITY = 10


def _is_text_like(dtype):
    """Return True for object, string and categorical dtypes."""
    return (
        isinstance(dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )


def _is_low_cardinality_numeric(column):
    """Return True for a non-bool numeric column with few distinct values."""
    dtype = column.dtype
    if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
        return False
    # nunique() ignores missing values
    return column.nunique() <= MAX_CATEGORICAL_CARDINALITY


text_cols = [c for c in X.columns if _is_text_like(X[c].dtype)]
low_card_numeric_cols = [
    c for c in X.columns
    if c not in text_cols and _is_low_cardinality_numeric(X[c])
]
# Text-like columns first, then low-cardinality numerics (both in frame order)
cat_cols = text_cols + low_card_numeric_cols
num_cols = [c for c in X.columns if c not in cat_cols]
print('categorical cols:', cat_cols)
print('numerical cols:', num_cols[:10])

In [None]:
# Simple preprocessing: median for numerical gaps, most frequent value for categorical gaps.
# NOTE(review): the fill values are computed on the full frame, i.e. before the
# train/val/test split further down, so held-out rows influence them. Move the
# imputation after the split if a strict evaluation is needed.
for c in num_cols:
    if X[c].isnull().any():
        X[c] = X[c].fillna(X[c].median())
for c in cat_cols:
    if X[c].isnull().any():
        modes = X[c].mode()
        # mode() is empty when the column has no non-missing value at all;
        # there is nothing to impute from, so such a column is left as-is
        # instead of failing on `.iloc[0]`.
        if not modes.empty:
            X[c] = X[c].fillna(modes.iloc[0])

# Convert categorical columns to string (CatBoost can accept them)
for c in cat_cols:
    X[c] = X[c].astype(str)

# Encode the target when it is not already numeric.
# Always start from the raw column in `df`: the encoded `y` is a plain integer
# array, so checking `y` itself would reset `y_names` to None when this cell is
# run a second time. Checking "not numeric" (rather than "== object") also
# covers string and categorical dtypes.
y_raw = df[TARGET_COLUMN]
if pd.api.types.is_numeric_dtype(y_raw.dtype):
    y = y_raw
    y_names = None
else:
    # codes follow order of first appearance; y_names[i] is the label for code i
    y, y_names = pd.factorize(y_raw)
    print('encoded target classes:', list(y_names))

In [None]:
# Two-stage stratified split: 70% train, then the remaining 30% is halved
# into validation and test (15% each of the full data).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)
print('train/val/test sizes:', len(X_train), len(X_val), len(X_test))

In [None]:
# Build one CatBoost Pool per split. Categorical features are identified by
# their positional index in the feature frame (empty list when there are none).
cat_feature_indices = list(map(X.columns.get_loc, cat_cols))
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_feature_indices)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_feature_indices)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_feature_indices)
print('cat feature indices:', cat_feature_indices)

In [None]:
# Hyperparameters: modest defaults, accuracy as the tracked metric, and
# early stopping after 50 rounds without improvement on the eval set.
catboost_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 6,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'early_stopping_rounds': 50,
    'verbose': 100,  # log every 100 iterations
}
model = CatBoostClassifier(**catboost_params)

# Fit on the training pool, validating on the validation pool and keeping the best iteration
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

In [None]:
# Evaluation on the held-out test set.
# CatBoost may return class predictions as an (n, 1) column (multiclass);
# flatten so the sklearn metrics always receive a 1-D array.
preds = np.ravel(model.predict(X_test))

# When the target was factorized, integer code i stands for y_names[i].
# Report with the original class names so the output is readable.
if y_names is not None:
    class_ids = np.arange(len(y_names))
    class_labels = [str(name) for name in y_names]
else:
    class_ids = None
    class_labels = None

print('Test accuracy:', accuracy_score(y_test, preds))
print(classification_report(y_test, preds, labels=class_ids, target_names=class_labels))

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, preds, labels=class_ids)
tick_labels = class_labels if class_labels is not None else 'auto'
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
ax.set_ylabel('True')
ax.set_xlabel('Pred')
plt.show()

In [None]:
# Rank features by the importance CatBoost computes on the training pool
fi = model.get_feature_importance(train_pool)
fi_df = (
    pd.DataFrame({'feature': X.columns, 'importance': fi})
    .sort_values('importance', ascending=False)
)

# Show the 20 most important features as a table and as a bar chart
top_features = fi_df.head(20)
display(top_features)
plt.figure(figsize=(8,6))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 20 feature importances')
plt.show()

In [None]:
# Persist the trained model under ../models (directory is created if absent)
models_dir = Path('..', 'models')
models_dir.mkdir(exist_ok=True)
model_file = models_dir.joinpath('catboost_model.cbm')
model.save_model(str(model_file))
print('Saved model to', model_file)

# The class names only exist when the target was encoded; store them next to
# the model so integer predictions can be mapped back to labels.
if y_names is not None:
    joblib.dump(y_names, models_dir.joinpath('cat_names.joblib'))

## Notes
- CatBoost works well on mixed numeric and categorical tabular data with minimal encoding.
- Tune `iterations`, `learning_rate`, and `depth` for better performance.
- For larger datasets, consider more advanced CV (k-fold) and hyperparameter search.