In [3]:
! python3 -m pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-macosx_11_0_universal2.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp310-cp310-macosx_11_0_universal2.whl (27.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.1/27.1 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.7 graphviz-0.20.3


In [8]:
import pandas as pd
import numpy as np
import optuna

from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_val_score

In [9]:
def submission(filename: str, df: pd.DataFrame, y_pred: np.array) -> None:
    """
    Saves the dataframe as a csv file.

    Args:
        df (pd.DataFrame): The dataframe to save.
        filename (str): The name of the file to save.
    Returns:
        None
    """
    sub_file = pd.DataFrame({"Id": df.index, "Risk": y_pred})
    sub_file["Risk"] = sub_file["Risk"].apply(lambda x: "Risk" if x == 1 else "No Risk")
    sub_file = sub_file.to_csv(filename, index=False)

In [10]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')

In [11]:
remainders = ['CheckingStatus', 'ExistingSavings', 'EmploymentDuration', 'InstallmentPercent', 'CurrentResidenceDuration', 'ExistingCreditsCount', 'Telephone']
to_scale = ['LoanDuration', 'LoanAmount', 'Age']
to_1hot = ['CreditHistory', 'LoanPurpose', 'OthersOnLoan', 'OwnsProperty', 'InstallmentPlans', 'Housing']

In [12]:
# First we do a preprocessing step
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), to_scale),
        ('1hot', OneHotEncoder(handle_unknown='ignore'), to_1hot),
    ],
    remainder='passthrough'
)

# Then we create the model
model = CatBoostClassifier()

# Last we create the pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

Learning rate set to 0.018562
0:	learn: 0.6830156	total: 4.31ms	remaining: 4.31s
1:	learn: 0.6741407	total: 5.99ms	remaining: 2.99s
2:	learn: 0.6654088	total: 7.82ms	remaining: 2.6s
3:	learn: 0.6573283	total: 9.51ms	remaining: 2.37s
4:	learn: 0.6491528	total: 11.3ms	remaining: 2.25s
5:	learn: 0.6415919	total: 12.9ms	remaining: 2.14s
6:	learn: 0.6346849	total: 14.4ms	remaining: 2.04s
7:	learn: 0.6274530	total: 16.2ms	remaining: 2.01s
8:	learn: 0.6199575	total: 18ms	remaining: 1.98s
9:	learn: 0.6137469	total: 20.5ms	remaining: 2.03s
10:	learn: 0.6074287	total: 22.2ms	remaining: 2s
11:	learn: 0.6009292	total: 24ms	remaining: 1.98s
12:	learn: 0.5953762	total: 26.3ms	remaining: 1.99s
13:	learn: 0.5887192	total: 27.9ms	remaining: 1.96s
14:	learn: 0.5834488	total: 29.3ms	remaining: 1.92s
15:	learn: 0.5791385	total: 30.7ms	remaining: 1.89s
16:	learn: 0.5742766	total: 32.5ms	remaining: 1.88s
17:	learn: 0.5696792	total: 35.8ms	remaining: 1.95s
18:	learn: 0.5637982	total: 37.5ms	remaining: 1.94s


In [14]:
def objective_cat(trial):
    # Hyperparameter space for CatBoost
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_strength': trial.suggest_int('random_strength', 1, 20),
        'eval_metric': 'Accuracy',  # Evaluation metric
        'loss_function': 'Logloss',  # Binary classification loss function
        'cat_features': [],  # You can add categorical features if needed
        'verbose': 0  # Suppress output during training
    }

    # Create the pipeline
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("model", CatBoostClassifier(**param))  # Use CatBoostClassifier
    ])
    
    # Cross-validation
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy").mean()
    return score

In [15]:
# Find the best pipeline thanks to optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective_cat, n_trials=50)

[I 2025-03-31 13:04:18,177] A new study created in memory with name: no-name-0333b602-5ad6-452f-a49c-9081715eba67
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2025-03-31 13:04:25,796] Trial 0 finished with value: 0.7758186397984888 and parameters: {'iterations': 930, 'learning_rate': 0.00022315762499488387, 'depth': 6, 'l2_leaf_reg': 7.130367576491766, 'border_count': 188, 'bagging_temperature': 0.4313067842404793, 'random_strength': 14}. Best is trial 0 with value: 0.7758186397984888.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 0.1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2025-03-31 13:04:39,056] Trial 1 finished with value: 0.7823677581863981 and parameters: {'iterations': 644, 'learning_rate': 0.009685251536196768, 'depth': 9, 'l2_leaf_reg': 2.135040675377773, 'border_count': 223, 'bagging_temperature': 0.40395449785572557, 'random

In [16]:
# Get the best parameters & model
best_params = study.best_params
best_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", CatBoostClassifier(**best_params))
])

In [17]:
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

0:	learn: 0.6872401	total: 2.88ms	remaining: 2.06s
1:	learn: 0.6826882	total: 5.88ms	remaining: 2.1s
2:	learn: 0.6789455	total: 7.34ms	remaining: 1.75s
3:	learn: 0.6731314	total: 9.43ms	remaining: 1.68s
4:	learn: 0.6682340	total: 11.3ms	remaining: 1.61s
5:	learn: 0.6633224	total: 13.4ms	remaining: 1.58s
6:	learn: 0.6586050	total: 15.1ms	remaining: 1.53s
7:	learn: 0.6557026	total: 16.5ms	remaining: 1.46s
8:	learn: 0.6511736	total: 18.9ms	remaining: 1.49s
9:	learn: 0.6474191	total: 20.4ms	remaining: 1.44s
10:	learn: 0.6445245	total: 21.5ms	remaining: 1.38s
11:	learn: 0.6407555	total: 23.1ms	remaining: 1.36s
12:	learn: 0.6366916	total: 25ms	remaining: 1.35s
13:	learn: 0.6323216	total: 26.7ms	remaining: 1.34s
14:	learn: 0.6284682	total: 28.2ms	remaining: 1.32s
15:	learn: 0.6248267	total: 29.7ms	remaining: 1.3s
16:	learn: 0.6210501	total: 31.2ms	remaining: 1.28s
17:	learn: 0.6175067	total: 32.6ms	remaining: 1.27s
18:	learn: 0.6146897	total: 34.6ms	remaining: 1.27s
19:	learn: 0.6113636	total

In [18]:
submission("cat_1hot_accuracy.csv", X_test, y_pred)