In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Downloading plotly-5.24.1-py3-none-any.whl.metadata (7.3 kB)
Collecting tenacity>=6.2.0 (from plotly->catboost)
  Downloading tenacity-9.0.0-py3-none-any.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0mm
[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m144.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[

In [49]:
pip install --upgrade catboost

Note: you may need to restart the kernel to use updated packages.


In [4]:
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # for classification
import pandas as pd

In [5]:
data_test = pd.read_csv('test.csv')

In [6]:
data_train = pd.read_csv('train.csv')

In [7]:
X = data_train.copy()
del X['loan_status']
y = data_train.copy()
y = y['loan_status']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
model = CatBoostClassifier(iterations = 1000, learning_rate = 0.1, depth = 6, boosting_type = 'Plain', cat_features = [3,5,6,10], verbose = 100)
model.fit(X_train, y_train)

0:	learn: 0.5748657	total: 230ms	remaining: 3m 49s
100:	learn: 0.1574936	total: 8.74s	remaining: 1m 17s
200:	learn: 0.1426522	total: 17.4s	remaining: 1m 9s
300:	learn: 0.1331126	total: 26.3s	remaining: 1m 1s
400:	learn: 0.1255825	total: 34.2s	remaining: 51.1s
500:	learn: 0.1189316	total: 42.5s	remaining: 42.4s
600:	learn: 0.1138131	total: 51.6s	remaining: 34.3s
700:	learn: 0.1086440	total: 1m	remaining: 25.6s
800:	learn: 0.1041060	total: 1m 8s	remaining: 17.1s
900:	learn: 0.1001076	total: 1m 17s	remaining: 8.54s
999:	learn: 0.0959233	total: 1m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f525df9ddd0>

In [10]:
y_pred = model.predict(X_test)

In [11]:
accuracy = accuracy_score(y_test,y_pred)

In [12]:
print(accuracy)

0.9533634580953193


In [20]:
import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Assuming X and y are defined and represent your features and target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features by column index or names
cat_features_indices = [3,5,6,10]  # Example indices of categorical features; update as needed
# OR
# cat_features = ['feature_name1', 'feature_name2']  # List of categorical feature names

# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),  # Increased upper limit
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),  # Lower bound added
        "depth": trial.suggest_int("depth", 3, 12),  # Wider range for depth
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),  # Increased upper limit
        "random_strength": trial.suggest_float("random_strength", 1, 5),  # Increased upper limit
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 100),  # Increased upper limit
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bernoulli"]),
        "task_type": 'CPU',
        "verbose": False
    }

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    roc_auc_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in skf.split(X_train, y_train):
        X_train_fold = X_train.iloc[train_index]  # Use iloc for indexing
        X_val = X_train.iloc[val_index]  # Use iloc for indexing
        y_train_fold = y_train.iloc[train_index]  # Use iloc for indexing
        y_val = y_train.iloc[val_index]  # Use iloc for indexing

        model = CatBoostClassifier(cat_features=cat_features_indices, **params)

        # Fit the model
        model.fit(X_train_fold, y_train_fold, verbose=False)

        # Predict and calculate ROC AUC score
        y_pred = model.predict_proba(X_val)[:, 1]  # Get probabilities for the positive class
        roc = roc_auc_score(y_val, y_pred)  # Calculate ROC AUC score
        roc_auc_scores.append(roc)

    # Return the mean ROC AUC score across folds
    return np.mean(roc_auc_scores)

# Create a study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=60)

print("Best parameters:", study.best_params)
print("Best ROC AUC:", study.best_value)

# Fit the final model with the best parameters
best_params = study.best_params
best_params['verbose'] = 200  # Adjust verbosity

# Fit the model with the best parameters
model = CatBoostClassifier(cat_features=cat_features_indices, **best_params)
model.fit(X_train, y_train)  # Fit with the best parameters

[I 2024-10-08 20:24:17,806] A new study created in memory with name: no-name-7feadd60-100d-456a-afef-5f557855c1b9
[I 2024-10-08 20:28:28,630] Trial 0 finished with value: 0.8911594793120459 and parameters: {'iterations': 515, 'learning_rate': 0.0002425569404595565, 'depth': 3, 'l2_leaf_reg': 8.938112769532939, 'random_strength': 1.7988645906332996, 'od_type': 'Iter', 'od_wait': 54, 'bootstrap_type': 'Bernoulli'}. Best is trial 0 with value: 0.8911594793120459.
[I 2024-10-08 20:35:50,271] Trial 1 finished with value: 0.9073619593607228 and parameters: {'iterations': 645, 'learning_rate': 0.0003896968949439135, 'depth': 6, 'l2_leaf_reg': 7.437007196894844, 'random_strength': 3.5421510382368937, 'od_type': 'IncToDec', 'od_wait': 13, 'bootstrap_type': 'Bernoulli'}. Best is trial 1 with value: 0.9073619593607228.
[I 2024-10-08 20:55:46,829] Trial 2 finished with value: 0.9461311091980453 and parameters: {'iterations': 1361, 'learning_rate': 0.0154140208141927, 'depth': 7, 'l2_leaf_reg': 3.4

KeyboardInterrupt: 

In [22]:
import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Assuming X and y are defined and represent your features and target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical features by column index or names
cat_features_indices = [3,5,6,10]  # Example indices of categorical features; update as needed
# OR
# cat_features = ['feature_name1', 'feature_name2']  # List of categorical feature names

# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune
    params = {
        "iterations": trial.suggest_int("iterations", 100, 2000),  # Increased upper limit
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.15, log=True),  # Lower bound added
        "depth": trial.suggest_int("depth", 2, 3),  # Wider range for depth
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 7, 10),  # Increased upper limit
        "random_strength": trial.suggest_float("random_strength", 1, 1.4),  # Increased upper limit
        "od_type": trial.suggest_categorical("od_type", ["IncToDec"]),
        "od_wait": trial.suggest_int("od_wait", 50, 80),  # Increased upper limit
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bernoulli"]),
        "task_type": 'CPU',
        "verbose": False
    }

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    roc_auc_scores = []

    # Perform k-fold cross-validation
    for train_index, val_index in skf.split(X_train, y_train):
        X_train_fold = X_train.iloc[train_index]  # Use iloc for indexing
        X_val = X_train.iloc[val_index]  # Use iloc for indexing
        y_train_fold = y_train.iloc[train_index]  # Use iloc for indexing
        y_val = y_train.iloc[val_index]  # Use iloc for indexing

        model = CatBoostClassifier(cat_features=cat_features_indices, **params)

        # Fit the model
        model.fit(X_train_fold, y_train_fold, verbose=False)

        # Predict and calculate ROC AUC score
        y_pred = model.predict_proba(X_val)[:, 1]  # Get probabilities for the positive class
        roc = roc_auc_score(y_val, y_pred)  # Calculate ROC AUC score
        roc_auc_scores.append(roc)

    # Return the mean ROC AUC score across folds
    return np.mean(roc_auc_scores)

# Create a study and optimize the objective function
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print("Best parameters:", study.best_params)
print("Best ROC AUC:", study.best_value)

# Fit the final model with the best parameters
best_params = study.best_params
best_params['verbose'] = 200  # Adjust verbosity

# Fit the model with the best parameters
model = CatBoostClassifier(cat_features=cat_features_indices, **best_params)
model.fit(X_train, y_train)  # Fit with the best parameters

[I 2024-10-09 10:42:48,347] A new study created in memory with name: no-name-42782609-4a5c-46e5-9097-6f8f6d5cede5
[I 2024-10-09 10:49:16,864] Trial 0 finished with value: 0.9474517399662628 and parameters: {'iterations': 1691, 'learning_rate': 0.06265899533809129, 'depth': 2, 'l2_leaf_reg': 9.929668205682312, 'random_strength': 1.2831222181750548, 'od_type': 'IncToDec', 'od_wait': 80, 'bootstrap_type': 'Bernoulli'}. Best is trial 0 with value: 0.9474517399662628.
[I 2024-10-09 10:54:36,432] Trial 1 finished with value: 0.9272798567745774 and parameters: {'iterations': 1474, 'learning_rate': 0.011072398816736283, 'depth': 2, 'l2_leaf_reg': 9.321360285845373, 'random_strength': 1.282154686529962, 'od_type': 'IncToDec', 'od_wait': 64, 'bootstrap_type': 'Bernoulli'}. Best is trial 0 with value: 0.9474517399662628.
[I 2024-10-09 11:00:26,098] Trial 2 finished with value: 0.9512774349701398 and parameters: {'iterations': 1327, 'learning_rate': 0.06119826651686255, 'depth': 3, 'l2_leaf_reg': 

KeyboardInterrupt: 

In [16]:
X_ = data_test.copy()
y_ = model.predict(X_)
solution = {}
solution['id'] = X_['id']
solution['loan_status'] = y_

In [19]:
df = pd.DataFrame(solution)
df.to_csv('solution', index = False)