In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 553, done.[K
remote: Counting objects: 100% (284/284), done.[K
remote: Compressing objects: 100% (182/182), done.[K
remote: Total 553 (delta 179), reused 147 (delta 100), pack-reused 269 (from 1)[K
Receiving objects: 100% (553/553), 178.44 KiB | 22.30 MiB/s, done.
Resolving deltas: 100% (281/281), done.
Collecting pynvml
  Downloading pynvml-12.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nvidia-ml-py<13.0.0a0,>=12.0.0 (from pynvml)
  Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)
Downloading pynvml-12.0.0-py3-none-any.whl (26 kB)
Downloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.5/40.5 kB 3.7 MB/s eta 0:00:00
Installing collected packages: nvidia-ml-py, pynvml
Successfully installed nvidia-ml-py-12.560.30 pynvml-12.0.0
Installing RAPIDS remaining 24.10.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com


In [None]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

# Memory reduction function
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print(f'Mem. usage decreased to {end_mem:.2f} Mb ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    return df

# Combined preprocessing and UID detection
def preprocess_and_detect_uid(train_df, test_df, train_identity=None, test_identity=None):
    print("Starting preprocessing...")

    # Memory reduction
    train_df = reduce_mem_usage(train_df)
    test_df = reduce_mem_usage(test_df)

    # Merge with identity datasets if provided
    if train_identity is not None and test_identity is not None:
        train_df = pd.merge(train_df, train_identity, on="TransactionID", how="left")
        test_df = pd.merge(test_df, test_identity, on="TransactionID", how="left")

    # Add full address feature
    train_df['full_addr'] = train_df['addr1'].astype(str) + '_' + train_df['addr2'].astype(str)
    test_df['full_addr'] = test_df['addr1'].astype(str) + '_' + test_df['addr2'].astype(str)

    # Add time-based UID features
    for col in ['D1', 'D2', 'D3', 'D5', 'D10', 'D11', 'D15']:
        if col in train_df.columns:
            new_col = 'uid_td_' + col
            train_df[new_col] = train_df['TransactionDT'] / (24 * 60 * 60)
            train_df[new_col] = np.floor(train_df[new_col] - train_df[col]) + 1000

            test_df[new_col] = test_df['TransactionDT'] / (24 * 60 * 60)
            test_df[new_col] = np.floor(test_df[new_col] - test_df[col]) + 1000

    # Add normalized day feature
    train_df['DT_day'] = np.floor(train_df['TransactionDT'] / (24 * 60 * 60)) + 1000
    test_df['DT_day'] = np.floor(test_df['TransactionDT'] / (24 * 60 * 60)) + 1000

    # Round transaction amounts for feature creation
    train_df['TransactionAmt_fix'] = np.round(train_df['TransactionAmt'], 2)
    test_df['TransactionAmt_fix'] = np.round(test_df['TransactionAmt'], 2)

    # UID detection
    train_df['uid'] = train_df['card1'].astype(str) + '_' + train_df['card2'].astype(str) + '_' + train_df['addr1'].astype(str)
    test_df['uid'] = test_df['card1'].astype(str) + '_' + test_df['card2'].astype(str) + '_' + test_df['addr1'].astype(str)

    # Frequency encoding for UID
    uid_freq = pd.concat([train_df['uid'], test_df['uid']]).value_counts()
    train_df['uid_count'] = train_df['uid'].map(uid_freq)
    test_df['uid_count'] = test_df['uid'].map(uid_freq)

    print("Preprocessing and UID detection complete.")
    return train_df, test_df





In [None]:
train_df = pd.read_csv('drive/MyDrive/DSC_final_project/train_transaction.csv')
train_identity = pd.read_csv('drive/MyDrive/DSC_final_project/train_identity.csv')
test_df = pd.read_csv('drive/MyDrive/DSC_final_project/test_transaction.csv')
test_identity = pd.read_csv('drive/MyDrive/DSC_final_project/test_identity.csv')

In [None]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Preprocessing and UID detection complete.


In [None]:
train_processed['isFraud']

Unnamed: 0,isFraud
0,0
1,0
2,0
3,0
4,0
...,...
590535,0
590536,0
590537,0
590538,0


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import scipy.sparse as sp

def preprocess_for_baseline_with_onehot_sparse(input_train_df, input_test_df, target_column='IsFraud'):
    # Work on local copies to avoid modifying global variables
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column from training data
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns that exist in the datasets
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(include=['object']).columns

    # Ensure the columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values
    train_df[numeric_columns] = train_df[numeric_columns].fillna(train_df[numeric_columns].median())
    test_df[numeric_columns] = test_df[numeric_columns].fillna(test_df[numeric_columns].median())
    train_df[categorical_columns] = train_df[categorical_columns].fillna('missing')
    test_df[categorical_columns] = test_df[categorical_columns].fillna('missing')

    # OneHotEncoder for categorical columns
    if categorical_columns:
        ohe = OneHotEncoder(handle_unknown='ignore')  # Use sparse matrices
        train_categorical_sparse = ohe.fit_transform(train_df[categorical_columns])
        test_categorical_sparse = ohe.transform(test_df[categorical_columns])

        # Save memory by retaining sparse matrices
        train_df = train_df.drop(columns=categorical_columns)
        test_df = test_df.drop(columns=categorical_columns)

    # Standardize numeric columns
    if numeric_columns:
        scaler = StandardScaler()
        train_df[numeric_columns] = scaler.fit_transform(train_df[numeric_columns])
        test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

    # Convert numeric columns to sparse format and concatenate
    if numeric_columns:
        train_numeric_sparse = sp.csr_matrix(train_df[numeric_columns].values.astype(np.float32))
        test_numeric_sparse = sp.csr_matrix(test_df[numeric_columns].values.astype(np.float32))

        train_sparse = sp.hstack([train_numeric_sparse, train_categorical_sparse], format='csr')
        test_sparse = sp.hstack([test_numeric_sparse, test_categorical_sparse], format='csr')
    else:
        train_sparse = train_categorical_sparse
        test_sparse = test_categorical_sparse

    return train_sparse, test_sparse


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
prepocessed_train, prepocessed_test = preprocess_for_baseline_with_onehot_sparse(train_processed, test_processed)

In [None]:
import optuna
import cupy as cp
import cupyx.scipy.sparse as cps
from cuml.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Target variable
y_train = train_processed['isFraud']

# Objective function for Optuna
def objective(trial):
    # Define hyperparameters to tune
    param = {
        "C": trial.suggest_loguniform("C", 1e-4, 1e2),  # Regularization strength
        "solver": "qn",  # Use quasi-Newton solver (cuML only supports this)
        "penalty": "l2",  # L2 regularization only
        "class_weight": "balanced",  # Handle imbalanced data
        "max_iter": 10000,
        "tol": 1e-4
    }

    # Logistic Regression using cuML (GPU)
    model = LogisticRegression(**param)

    # Stratified K-Fold Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    log_losses = []

    for train_idx, val_idx in cv.split(prepocessed_train, y_train):
        # Convert to GPU sparse matrices
        X_train = cps.csr_matrix(prepocessed_train[train_idx])
        X_val = cps.csr_matrix(prepocessed_train[val_idx])
        y_train_fold = cp.array(y_train.iloc[train_idx])
        y_val_fold = cp.array(y_train.iloc[val_idx])

        # Train the model
        model.fit(X_train, y_train_fold)

        # Predict probabilities
        y_val_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate log loss
        log_losses.append(log_loss(y_val_fold.get(), y_val_pred_proba.get()))
        print(np.mean(log_losses))
    # Return mean log loss
    return np.mean(log_losses)

# Optuna study
study = optuna.create_study(direction="minimize", study_name="GPU Logistic Regression Tuning")
study.optimize(objective, n_trials=20)

# Best trial parameters
print("Best trial:")
print(study.best_trial.params)


[I 2024-12-06 22:48:29,737] A new study created in memory with name: GPU Logistic Regression Tuning


0.34131738696261177
0.34239190083931176
0.3419132316833835
0.3426304954744556


[I 2024-12-06 22:49:01,933] Trial 0 finished with value: 0.34279517543590854 and parameters: {'C': 1.01974693075021}. Best is trial 0 with value: 0.34279517543590854.


0.34279517543590854
0.5423706705689512
0.5436765141459714
0.5418103523528651
0.5421215506057802


[I 2024-12-06 22:49:04,745] Trial 1 finished with value: 0.542082529078044 and parameters: {'C': 0.0003798170803813578}. Best is trial 0 with value: 0.34279517543590854.


0.542082529078044
0.524332550162172
0.5255865391604699
0.5238089754980525
0.5241377068823354


[I 2024-12-06 22:49:08,835] Trial 2 finished with value: 0.5240172889562312 and parameters: {'C': 0.0015206657059152293}. Best is trial 0 with value: 0.34279517543590854.


0.5240172889562312
0.5530617154143186
0.5543585477337452
0.5525429869472152
0.5528422895021006


[I 2024-12-06 22:49:10,834] Trial 3 finished with value: 0.5528319916233693 and parameters: {'C': 0.00012380435805592387}. Best is trial 0 with value: 0.34279517543590854.


0.5528319916233693
0.3323631435453769
0.33329342072318413
0.33281221895952334
0.3334859214261799


[I 2024-12-06 22:49:51,893] Trial 4 finished with value: 0.3337574715007993 and parameters: {'C': 3.2630018283171065}. Best is trial 4 with value: 0.3337574715007993.


0.3337574715007993
0.459134745275502
0.46049079493125533
0.45902838284708086
0.4594165649148779


[I 2024-12-06 22:50:00,067] Trial 5 finished with value: 0.4591334236035164 and parameters: {'C': 0.018725874984155462}. Best is trial 4 with value: 0.3337574715007993.


0.4591334236035164
0.5435457388690567
0.5448655813372146
0.5430198807671651
0.5433453850074158


[I 2024-12-06 22:50:02,705] Trial 6 finished with value: 0.5433286683091199 and parameters: {'C': 0.0003339021920351738}. Best is trial 4 with value: 0.3337574715007993.


0.5433286683091199
0.4029764542959232
0.40401890991743944
0.40304004648185604
0.4036594683626384


[I 2024-12-06 22:50:16,589] Trial 7 finished with value: 0.40346242556842765 and parameters: {'C': 0.08780659878691065}. Best is trial 4 with value: 0.3337574715007993.


0.40346242556842765
0.44020551772909233
0.44134749653267913
0.43987112851762494
0.44048108915888656


[I 2024-12-06 22:50:26,828] Trial 8 finished with value: 0.44022465727124027 and parameters: {'C': 0.03179939398625025}. Best is trial 4 with value: 0.3337574715007993.


0.44022465727124027
0.5421813597768301
0.5435136789509618
0.5416640691871586
0.5419964967591778


[I 2024-12-06 22:50:29,698] Trial 9 finished with value: 0.5419718725590539 and parameters: {'C': 0.0003839211814373898}. Best is trial 4 with value: 0.3337574715007993.


0.5419718725590539
0.33213982623007443
0.33245041836116185
0.3321308975505203
0.3328353203993377


[I 2024-12-06 22:51:29,612] Trial 10 finished with value: 0.33314724297784404 and parameters: {'C': 40.864817467513994}. Best is trial 10 with value: 0.33314724297784404.


0.33314724297784404
0.3320909269962667
0.33244488823492835
0.3324241051195297
0.3331939808200488


[I 2024-12-06 22:52:32,239] Trial 11 finished with value: 0.3335528535729252 and parameters: {'C': 56.958407270148115}. Best is trial 10 with value: 0.33314724297784404.


0.3335528535729252
0.3322659734684514
0.3330359640542784
0.33268020786901703
0.3332435467895212


[I 2024-12-06 22:53:40,853] Trial 12 finished with value: 0.33378008942094745 and parameters: {'C': 78.56796146426251}. Best is trial 10 with value: 0.33314724297784404.


0.33378008942094745
0.33155518042842486
0.33205988317720786
0.3317900519920378
0.33247565169674076


[I 2024-12-06 22:54:35,997] Trial 13 finished with value: 0.3329529804087489 and parameters: {'C': 42.85890095129514}. Best is trial 13 with value: 0.3329529804087489.


0.3329529804087489
0.33095166285936584
0.33180816948140246
0.33149857885943124
0.3322780237950773


[I 2024-12-06 22:55:22,972] Trial 14 finished with value: 0.33258721210939185 and parameters: {'C': 4.485989414870252}. Best is trial 14 with value: 0.33258721210939185.


0.33258721210939185
0.33100999481912097
0.33208499033060634
0.33169108653888135
0.3324039419569283


[I 2024-12-06 22:56:06,486] Trial 15 finished with value: 0.3327363756026286 and parameters: {'C': 4.373708550421695}. Best is trial 14 with value: 0.33258721210939185.


0.3327363756026286
0.33304114828307774
0.33402101642539
0.33381573982657425
0.33466190273304497


[I 2024-12-06 22:56:45,657] Trial 16 finished with value: 0.3349150317571402 and parameters: {'C': 2.521610413748298}. Best is trial 14 with value: 0.33258721210939185.


0.3349150317571402
0.33027674084955
0.3309702902125936
0.33071998484902454
0.3314977431849833


[I 2024-12-06 22:57:32,568] Trial 17 finished with value: 0.3318781557968956 and parameters: {'C': 7.523470284010317}. Best is trial 17 with value: 0.3318781557968956.


0.3318781557968956
0.3678279997668923
0.36903353472062717
0.3685479855929628
0.3692728424075452


[I 2024-12-06 22:57:53,701] Trial 18 finished with value: 0.36926760198165354 and parameters: {'C': 0.26361324769032213}. Best is trial 17 with value: 0.3318781557968956.


0.36926760198165354
0.33057453024747585
0.3310828940586367
0.3307358527172471
0.33150242241198435


[I 2024-12-06 22:58:46,346] Trial 19 finished with value: 0.33191341496367394 and parameters: {'C': 12.121541808000616}. Best is trial 17 with value: 0.3318781557968956.


0.33191341496367394
Best trial:
{'C': 7.523470284010317}


In [None]:
# Cross-validation with Best Parameters
# best_params = study.best_trial.params

import optuna
import cupy as cp
import cupyx.scipy.sparse as cps
from cuml.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
best_params = {'C': 7.523470284010317}
model = LogisticRegression(**best_params)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

auc_scores = []
f1_scores = []
precision_scores = []
recall_scores = []
log_losses = []

for train_idx, val_idx in cv.split(prepocessed_train, y_train):
    X_train = cps.csr_matrix(prepocessed_train[train_idx])
    X_val = cps.csr_matrix(prepocessed_train[val_idx])
    y_train_fold = cp.array(y_train.iloc[train_idx])
    y_val_fold = cp.array(y_train.iloc[val_idx])

    # Train with best parameters
    model.fit(X_train, y_train_fold)

    # Predict probabilities and classes
    y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    y_val_pred = (y_val_pred_proba > 0.5).astype(int)  # Threshold for binary classification

    # Metrics
    log_losses.append(log_loss(y_val_fold.get(), y_val_pred_proba.get()))
    auc_scores.append(roc_auc_score(y_val_fold.get(), y_val_pred_proba.get()))
    f1_scores.append(f1_score(y_val_fold.get(), y_val_pred.get()))
    precision_scores.append(precision_score(y_val_fold.get(), y_val_pred.get()))
    recall_scores.append(recall_score(y_val_fold.get(), y_val_pred.get()))

# Print Final Metrics
print("Final Cross-Validation Metrics with Best Parameters:")
print(f"Mean Log Loss: {np.mean(log_losses):.4f}")
print(f"Mean AUC Score: {np.mean(auc_scores):.4f}")
print(f"Mean F1 Score: {np.mean(f1_scores):.4f}")
print(f"Mean Precision: {np.mean(precision_scores):.4f}")
print(f"Mean Recall: {np.mean(recall_scores):.4f}")


[W] [16:29:33.442725] L-BFGS: max iterations reached
[W] [16:29:33.444571] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [16:29:40.874539] L-BFGS: max iterations reached
[W] [16:29:40.876043] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [16:29:46.355304] L-BFGS: max iterations reached
[W] [16:29:46.356845] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the input data.
[W] [16:29:52.252562] L-BFGS: max iterations reached
[W] [16:29:52.253673] Maximum iterations reached before solver is converged. To increase model accuracy you can increase the number of iterations (max_iter) or improve the scaling of the inpu

In [None]:
from scipy.sparse import save_npz, load_npz

# Save sparse matrices
save_npz('drive/MyDrive/DSC_final_project/train_sparse_BASELINE_LOGISTIC.npz', prepocessed_train)
save_npz('drive/MyDrive/DSC_final_project/test_sparse_BASELINE_LOGISTIC.npz', prepocessed_test)

# Load sparse matrices
train_sparse = load_npz('drive/MyDrive/DSC_final_project/train_sparse_BASELINE_LOGISTIC.npz')
test_sparse = load_npz('drive/MyDrive/DSC_final_project/test_sparse_BASELINE_LOGISTIC.npz')


In [None]:
predicted_proba = model.predict_proba(prepocessed_test)[:, 1]

In [None]:
predicted_proba

array([0.00148378, 0.01873477, 0.00662038, ..., 0.05206139, 0.01982539,
       0.0039593 ])

In [None]:
predicted_proba_test_logistic_Regression_baseline_for_Stacking = model.predict_proba(prepocessed_test)[:, 1]

In [None]:
predicted_proba_train_logistic_Regression_baseline_for_Stacking = model.predict_proba(prepocessed_train)[:, 1]

In [None]:
submission = pd.DataFrame({
    "TransactionID": test_df["TransactionID"],  # Replace with your test set's TransactionID column
    "isFraud": predicted_proba  # Convert CuPy array to NumPy
})

In [None]:
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.001484
1,3663550,0.018735
2,3663551,0.00662
3,3663552,0.001732
4,3663553,0.01355


In [None]:
submission = pd.DataFrame({
    "TransactionID": test_df["TransactionID"],  # Replace with your test set's TransactionID column
    "isFraud": predicted_proba  # Convert CuPy array to NumPy
})

# Save to CSV
submission.to_csv("drive/MyDrive/DSC_final_project/log_reg_initial_submission.csv", index=False)

## Preprocessing for Advanced models

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import scipy.sparse as sp

def preprocess_for_isolation_forest(input_train_df, input_test_df,target_column='IsFraud'):
    # Work on local copies to avoid modifying global variables
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns that exist in the datasets
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(include=['object']).columns

    # Ensure the columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values
    numeric_imputer = SimpleImputer(strategy='median')
    train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
    test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
    test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

    # OneHotEncoder for categorical columns
    if categorical_columns:
        ohe = OneHotEncoder(handle_unknown='ignore')  # Use sparse matrices
        train_categorical_sparse = ohe.fit_transform(train_df[categorical_columns])
        test_categorical_sparse = ohe.transform(test_df[categorical_columns])

        # Drop original categorical columns
        train_df = train_df.drop(columns=categorical_columns)
        test_df = test_df.drop(columns=categorical_columns)

    # Standardize numeric columns
    if numeric_columns:
        scaler = StandardScaler()
        train_df[numeric_columns] = scaler.fit_transform(train_df[numeric_columns])
        test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

    # Convert numeric columns to sparse format and concatenate
    if numeric_columns:
        train_numeric_sparse = sp.csr_matrix(train_df[numeric_columns].values.astype(np.float32))
        test_numeric_sparse = sp.csr_matrix(test_df[numeric_columns].values.astype(np.float32))

        train_sparse = sp.hstack([train_numeric_sparse, train_categorical_sparse], format='csr')
        test_sparse = sp.hstack([test_numeric_sparse, test_categorical_sparse], format='csr')
    else:
        train_sparse = train_categorical_sparse
        test_sparse = test_categorical_sparse

    return train_sparse, test_sparse


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np



def preprocess_for_dbscan_optimized(input_train_df, input_test_df, target_column='isFraud'):
    # Work on local copies to avoid modifying global variables
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns that exist in the datasets
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(include=['object']).columns

    # Ensure the columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values
    numeric_imputer = SimpleImputer(strategy='median')
    train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
    test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
    test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

    # Handle specific case for id_12
    if 'id_12' in categorical_columns:
        train_df['id_12'] = train_df['id_12'].fillna('missing').astype(str)
        test_df['id_12'] = test_df['id_12'].fillna('missing').astype(str)

        mapping = {'NotFound': 0, 'Found': 1, 'missing': -1}
        train_df['id_12'] = train_df['id_12'].map(mapping)
        test_df['id_12'] = test_df['id_12'].map(mapping)

        if train_df['id_12'].isnull().any() or test_df['id_12'].isnull().any():
            raise ValueError("Mapping failed! Check values in id_12 for unmapped categories.")

    # Frequency Encoding for other categorical columns
    for col in categorical_columns:
        if col != 'id_12':  # Skip id_12 as it's already encoded
            combined = pd.concat([train_df[col], test_df[col]])
            freq_encoding = combined.value_counts(normalize=True)  # Frequency proportions
            train_df[col] = train_df[col].map(freq_encoding).fillna(0)  # Encode train
            test_df[col] = test_df[col].map(freq_encoding).fillna(0)    # Encode test

    # Align column order between train and test
    all_columns = train_df.columns
    test_df = test_df[all_columns]

    # Verify all columns are numeric
    for col in train_df.columns:
        if not pd.api.types.is_numeric_dtype(train_df[col]):
            raise ValueError(f"Column {col} is not numeric after preprocessing! Values: {train_df[col].unique()}")

    # Standardize numeric columns
    scaler = StandardScaler()
    train_df = scaler.fit_transform(train_df)
    test_df = scaler.transform(test_df)

    return train_df, test_df


In [None]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (69.4% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Preprocessing and UID detection complete.


In [None]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)
test_processed.columns = test_processed.columns.str.replace('-', '_')
train_processed.replace([np.inf, -np.inf], np.nan, inplace=True)
test_processed.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
# Check for NaN values in numeric columns
print("NaN values in numeric train columns:", train_processed.select_dtypes(include=[np.number]).isna().any().any())
print("NaN values in numeric test columns:", test_processed.select_dtypes(include=[np.number]).isna().any().any())

# Check for Infinity values in numeric columns
print("Infinity values in numeric train columns:", np.isinf(train_processed.select_dtypes(include=[np.number])).any().any())
print("Infinity values in numeric test columns:", np.isinf(test_processed.select_dtypes(include=[np.number])).any().any())

# Check max/min values for extreme ranges in numeric columns
print("Max value in numeric train columns:", train_processed.select_dtypes(include=[np.number]).max().max())
print("Min value in numeric train columns:", train_processed.select_dtypes(include=[np.number]).min().min())
print("Max value in numeric test columns:", test_processed.select_dtypes(include=[np.number]).max().max())
print("Min value in numeric test columns:", test_processed.select_dtypes(include=[np.number]).min().min())


NaN values in numeric train columns: True
NaN values in numeric test columns: True
Infinity values in numeric train columns: True
Infinity values in numeric test columns: True
Max value in numeric train columns: inf
Min value in numeric train columns: -660.0
Max value in numeric test columns: inf
Min value in numeric test columns: -720.0


In [None]:
filled_preprocessed_train, filled_preprocessed_test =preprocess_for_dbscan_optimized(train_processed, test_processed)

In [None]:
import numpy as np

def verify_preprocessed_data(train_data, test_data):
    print("=== Verification of Preprocessed Data ===")

    # Check for NaN values
    print("NaN values in train data:", np.isnan(train_data).any())
    print("NaN values in test data:", np.isnan(test_data).any())

    # Check for Infinity values
    print("Infinity values in train data:", np.isinf(train_data).any())
    print("Infinity values in test data:", np.isinf(test_data).any())

    # Check for extreme values
    print("Max value in train data:", np.max(train_data))
    print("Min value in train data:", np.min(train_data))
    print("Max value in test data:", np.max(test_data))
    print("Min value in test data:", np.min(test_data))

    # Check for shape consistency
    print("Shape of train data:", train_data.shape)
    print("Shape of test data:", test_data.shape)

    # Check for zero variance columns
    zero_variance_train = (train_data.var(axis=0) == 0).sum()
    zero_variance_test = (test_data.var(axis=0) == 0).sum()
    print(f"Zero-variance columns in train data: {zero_variance_train}")
    print(f"Zero-variance columns in test data: {zero_variance_test}")

    print("=== Verification Completed ===")
#verify_preprocessed_data(filled_preprocessed_train, filled_preprocessed_test)

In [None]:
import optuna
import numpy as np
from cuml.cluster import DBSCAN
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold


# Target variable
y_train = train_processed['isFraud']

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for DBSCAN
    eps = trial.suggest_float("eps", 0.1, 10.0, log=True)  # Neighborhood radius
    min_samples = trial.suggest_int("min_samples", 2, 50)  # Minimum samples per cluster

    # Initialize DBSCAN with cuML
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, verbose=False)

    # StratifiedKFold for cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    log_losses = []

    for train_idx, val_idx in cv.split(filled_preprocessed_train, y_train):
        X_train, X_val = filled_preprocessed_train[train_idx], filled_preprocessed_train[val_idx]
        y_val = y_train.iloc[val_idx]

        # Fit DBSCAN
        dbscan.fit(X_train)

        # Get cluster labels (-1 indicates noise)
        cluster_labels = dbscan.labels_

        # Skip fold if DBSCAN failed to find clusters
        if len(np.unique(cluster_labels)) <= 1:
            auc_scores.append(0.5)  # Assign neutral AUC for failure cases
            log_losses.append(np.inf)  # Assign high log loss
            continue

        # Use cluster labels as predictions for simplicity
        # Map cluster labels to binary predictions (fraud/no fraud)
        cluster_to_fraud = {label: np.mean(y_train[dbscan.labels_ == label]) for label in np.unique(cluster_labels) if label != -1}
        y_pred_proba = np.array([cluster_to_fraud.get(label, 0.0) for label in cluster_labels])

        # Calculate metrics
        auc_scores.append(roc_auc_score(y_val, y_pred_proba))
        log_losses.append(log_loss(y_val, y_pred_proba + 1e-15))  # Avoid log(0)

    # Decide the metric to optimize
    metric_to_optimize = "auc"  # Change to "log_loss" if desired
    if metric_to_optimize == "auc":
        return -np.mean(auc_scores)  # Optuna minimizes, so negate AUC
    else:
        return np.mean(log_losses)
    print(np.mean(auc_scores))

# Run Optuna optimization
study = optuna.create_study(direction="maximize", study_name="DBSCAN Tuning")
study.optimize(objective, n_trials=20)

# Print the best parameters
print("Best parameters:", study.best_trial.params)

# # Re-train DBSCAN on the full dataset with the best parameters
# best_params = study.best_trial.params
# final_dbscan = DBSCAN(eps=best_params["eps"], min_samples=best_params["min_samples"], verbose=False)
# final_dbscan.fit(filled_preprocessed_train)

# # Use the trained model for predictions on the test set
# test_cluster_labels = final_dbscan.fit_predict(filled_preprocessed_test)

# # Map cluster labels to probabilities for test predictions
# test_cluster_to_fraud = {
#     label: np.mean(y_train[final_dbscan.labels_ == label]) for label in np.unique(final_dbscan.labels_) if label != -1
# }
# test_pred_proba = np.array([test_cluster_to_fraud.get(label, 0.0) for label in test_cluster_labels])

# # Save test predictions for submission
# submission = pd.DataFrame({"TransactionID": test_processed["TransactionID"], "isFraud": test_pred_proba})
# submission.to_csv("dbscan_submission.csv", index=False)

# print("Submission file created: dbscan_submission.csv")


[I 2024-12-07 02:44:04,385] A new study created in memory with name: DBSCAN Tuning


[W] [02:44:07.115778] Batch size limited by the chosen integer type (4 bytes). 7348 -> 4545. Using the larger integer type might result in better performance
[W] [03:15:20.010159] Batch size limited by the chosen integer type (4 bytes). 7348 -> 4545. Using the larger integer type might result in better performance
[W] [03:46:32.088962] Batch size limited by the chosen integer type (4 bytes). 7348 -> 4545. Using the larger integer type might result in better performance
[W] [04:17:44.200599] Batch size limited by the chosen integer type (4 bytes). 7348 -> 4545. Using the larger integer type might result in better performance


[I 2024-12-07 05:20:06,706] Trial 0 finished with value: -0.5 and parameters: {'eps': 0.1682518443131811, 'min_samples': 47}. Best is trial 0 with value: -0.5.


[W] [04:48:56.274405] Batch size limited by the chosen integer type (4 bytes). 7348 -> 4545. Using the larger integer type might result in better performance


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


# Neural network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [None]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)
test_processed.columns = test_processed.columns.str.replace('-', '_')

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (0.0% reduction)
Mem. usage decreased to 472.59 Mb (0.0% reduction)
Preprocessing and UID detection complete.


In [None]:
train_processed.replace([np.inf, -np.inf], np.nan, inplace=True)
test_processed.replace([np.inf, -np.inf], np.nan, inplace=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np


def preprocess_for_NN(input_train_df, input_test_df, target_column='isFraud'):
    # Work on local copies to avoid modifying global variables
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns that exist in the datasets
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = train_df.select_dtypes(include=['object']).columns
    print(categorical_columns)
    # Ensure numeric and categorical columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values for numeric columns
    numeric_imputer = SimpleImputer(strategy='median')
    train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
    test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    # Handle missing values for categorical columns
    categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
    test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

    # # Special case: Encoding for id_12
    # if 'id_12' in categorical_columns:
    #     train_df['id_12'] = train_df['id_12'].fillna('missing').astype(str)
    #     test_df['id_12'] = test_df['id_12'].fillna('missing').astype(str)

    #     mapping = {'NotFound': 0, 'Found': 1, 'missing': -1}
    #     train_df['id_12'] = train_df['id_12'].map(mapping)
    #     test_df['id_12'] = test_df['id_12'].map(mapping)

    #     # Check if mapping was successful
    #     if train_df['id_12'].isnull().any() or test_df['id_12'].isnull().any():
    #         raise ValueError("Mapping failed! Check values in id_12 for unmapped categories.")

    # # Frequency Encoding for remaining categorical columns
    # for col in categorical_columns:
    #     if col != 'id_12':  # Skip id_12 as it's already encoded
    #         combined = pd.concat([train_df[col], test_df[col]])
    #         freq_encoding = combined.value_counts(normalize=True)  # Frequency proportions
    #         train_df[col] = train_df[col].map(freq_encoding).fillna(0)  # Encode train
    #         test_df[col] = test_df[col].map(freq_encoding).fillna(0)    # Encode test

    # Align column order between train and test
    all_columns = train_df.columns
    test_df = test_df[all_columns]

    # Verify all columns are numeric
    for col in train_df.columns:
        if not pd.api.types.is_numeric_dtype(train_df[col]):
            raise ValueError(f"Column {col} is not numeric after preprocessing! Values: {train_df[col].unique()}")

    # Remove any infinite values
    train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Re-impute numeric columns to handle infinities replaced with NaN
    train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
    test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    # Standardize numeric columns
    scaler = StandardScaler()
    train_df = scaler.fit_transform(train_df)
    test_df = scaler.transform(test_df)

    return train_df, test_df


In [None]:
def preprocess_for_NN(input_train_df, input_test_df, target_column='isFraud'):
    print("Starting preprocessing (Method B)...")
    train_df = input_train_df.copy()
    test_df = input_test_df.copy()

    # Drop target column
    if target_column in train_df.columns:
        train_df = train_df.drop(columns=[target_column])

    # Identify numeric and categorical columns
    numeric_columns = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_columns = train_df.select_dtypes(include=['object']).columns.tolist()

    # Ensure numeric and categorical columns exist in both train and test
    numeric_columns = [col for col in numeric_columns if col in test_df.columns]
    categorical_columns = [col for col in categorical_columns if col in test_df.columns]

    # Handle missing values
    numeric_imputer = SimpleImputer(strategy='median')
    train_df[numeric_columns] = numeric_imputer.fit_transform(train_df[numeric_columns])
    test_df[numeric_columns] = numeric_imputer.transform(test_df[numeric_columns])

    categorical_imputer = SimpleImputer(strategy='constant', fill_value='missing')
    train_df[categorical_columns] = categorical_imputer.fit_transform(train_df[categorical_columns])
    test_df[categorical_columns] = categorical_imputer.transform(test_df[categorical_columns])

    # Frequency Encoding for categorical columns
    for col in categorical_columns:
        combined = pd.concat([train_df[col], test_df[col]])
        freq_encoding = combined.value_counts(normalize=True)
        train_df[col] = train_df[col].map(freq_encoding).fillna(0)
        test_df[col] = test_df[col].map(freq_encoding).fillna(0)

    # Align column order
    test_df = test_df[train_df.columns]

    # Replace infinities with NaN
    train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
    test_df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Final imputation for NaNs
    train_df = numeric_imputer.fit_transform(train_df)
    test_df = numeric_imputer.transform(test_df)

    # Standardize numeric columns
    scaler = StandardScaler()
    train_df = scaler.fit_transform(train_df)
    test_df = scaler.transform(test_df)

    print("Preprocessing (Method B) complete.")
    return train_df, test_df

In [None]:
train_df_NN, test_df_NN = preprocess_for_NN(train_processed, test_processed, target_column='isFraud')
verify_preprocessed_data(train_df_NN, test_df_NN)
# Convert to PyTorch tensors for neural network
train_tensor = torch.tensor(train_df_NN, dtype=torch.float32)
test_tensor = torch.tensor(test_df_NN, dtype=torch.float32)

Starting preprocessing (Method B)...
Preprocessing (Method B) complete.
=== Verification of Preprocessed Data ===
NaN values in train data: False
NaN values in test data: False
Infinity values in train data: False
Infinity values in test data: False
Max value in train data: 604.9377178842773
Min value in train data: -186.37770000411408
Max value in test data: 2431.967463663471
Min value in test data: -186.37770000411408
Shape of train data: (590540, 445)
Shape of test data: (506691, 445)
Zero-variance columns in train data: 0
Zero-variance columns in test data: 0
=== Verification Completed ===


In [None]:
verify_preprocessed_data(train_df_NN, test_df_NN)

=== Verification of Preprocessed Data ===
NaN values in train data: True
NaN values in test data: True
Infinity values in train data: False
Infinity values in test data: False
Max value in train data: nan
Min value in train data: nan
Max value in test data: nan
Min value in test data: nan
Shape of train data: (590540, 445)
Shape of test data: (506691, 445)
Zero-variance columns in train data: 0
Zero-variance columns in test data: 0
=== Verification Completed ===


In [None]:
import pandas as pd

# Replace with actual column names if you have them
column_names = [f"feature_{i}" for i in range(train_df_NN.shape[1])]

# Convert to DataFrame
train_df_NN_df = pd.DataFrame(train_df_NN, columns=column_names)
test_df_NN_df = pd.DataFrame(test_df_NN, columns=column_names)


print("Checking for NaN values in training dataset:")
nan_train = train_df_NN_df.isnull().sum()
nan_train = nan_train[nan_train > 0]  # Filter only columns with NaN values
print(nan_train)

print("\nChecking for NaN values in testing dataset:")
nan_test = test_df_NN_df.isnull().sum()
nan_test = nan_test[nan_test > 0]  # Filter only columns with NaN values
print(nan_test)

Checking for NaN values in training dataset:
Series([], dtype: int64)

Checking for NaN values in testing dataset:
Series([], dtype: int64)


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assume train_tensor and y_train are ready from preprocessing

# Train-Validation Split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_tensor.numpy(),  # Convert to NumPy for splitting
    y_train,               # Target labels
    test_size=0.2,         # 20% validation split
    random_state=42        # For reproducibility
)

# Target for validation

scaler = StandardScaler()
X_train_split = scaler.fit_transform(X_train_split)
X_val_split = scaler.transform(X_val_split)

# Convert to tensors
train_tensor_split = torch.tensor(X_train_split, dtype=torch.float32)
val_tensor = torch.tensor(X_val_split, dtype=torch.float32)

# Define the Autoencoder Neural Network
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Initialize the Autoencoder
input_dim = train_tensor_split.shape[1]  # Number of features
encoding_dim = 32  # Bottleneck size
model = Autoencoder(input_dim, encoding_dim)

# Define Loss Function and Optimizer
criterion = nn.MSELoss()  # Reconstruction loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training Loop
epochs = 50
batch_size = 128

for epoch in range(epochs):
    model.train()
    permutation = torch.randperm(train_tensor_split.size(0))
    epoch_loss = 0

    for i in range(0, train_tensor_split.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        batch_data = train_tensor_split[indices]

        # Forward pass
        outputs = model(batch_data)
        loss = criterion(outputs, batch_data)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_tensor_split):.4f}")



Epoch 1/50, Loss: 0.0026
Epoch 2/50, Loss: 0.0020
Epoch 3/50, Loss: 0.0019
Epoch 4/50, Loss: 0.0017
Epoch 5/50, Loss: 0.0016
Epoch 6/50, Loss: 0.0016
Epoch 7/50, Loss: 0.0015
Epoch 8/50, Loss: 0.0015
Epoch 9/50, Loss: 0.0014
Epoch 10/50, Loss: 0.0014
Epoch 11/50, Loss: 0.0015
Epoch 12/50, Loss: 0.0014
Epoch 13/50, Loss: 0.0014
Epoch 14/50, Loss: 0.0013
Epoch 15/50, Loss: 0.0013
Epoch 16/50, Loss: 0.0013
Epoch 17/50, Loss: 0.0012
Epoch 18/50, Loss: 0.0012
Epoch 19/50, Loss: 0.0013
Epoch 20/50, Loss: 0.0013
Epoch 21/50, Loss: 0.0012
Epoch 22/50, Loss: 0.0012
Epoch 23/50, Loss: 0.0012
Epoch 24/50, Loss: 0.0012
Epoch 25/50, Loss: 0.0011
Epoch 26/50, Loss: 0.0011
Epoch 27/50, Loss: 0.0011
Epoch 28/50, Loss: 0.0013
Epoch 29/50, Loss: 0.0013
Epoch 30/50, Loss: 0.0012
Epoch 31/50, Loss: 0.0012
Epoch 32/50, Loss: 0.0011
Epoch 33/50, Loss: 0.0013
Epoch 34/50, Loss: 0.0012
Epoch 35/50, Loss: 0.0013
Epoch 36/50, Loss: 0.0013
Epoch 37/50, Loss: 0.0013
Epoch 38/50, Loss: 0.0012
Epoch 39/50, Loss: 0.

AttributeError: 'Series' object has no attribute 'numpy'

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Define the Autoencoder Neural Network
class Autoencoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Assume train_tensor and y_train are ready from preprocessing

# Scale the entire dataset first
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_tensor.numpy())
y = y_train.values  # Assuming y_train is a pandas Series

# Cross-validation setup
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

encoding_dim = 32  # Bottleneck size
epochs = 50
batch_size = 128

auc_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled, y)):
    print(f"Starting Fold {fold + 1}")
    X_train_split, X_val_split = X_scaled[train_idx], X_scaled[val_idx]
    y_train_split, y_val_split = y[train_idx], y[val_idx]

    # Convert to tensors
    train_tensor_split = torch.tensor(X_train_split, dtype=torch.float32)
    val_tensor = torch.tensor(X_val_split, dtype=torch.float32)

    # Initialize the Autoencoder
    input_dim = train_tensor_split.shape[1]
    model = Autoencoder(input_dim, encoding_dim)

    # Define Loss Function and Optimizer
    criterion = nn.MSELoss()  # Reconstruction loss
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        permutation = torch.randperm(train_tensor_split.size(0))
        epoch_loss = 0

        for i in range(0, train_tensor_split.size(0), batch_size):
            indices = permutation[i:i + batch_size]
            batch_data = train_tensor_split[indices]

            # Forward pass
            outputs = model(batch_data)
            loss = criterion(outputs, batch_data)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Fold {fold + 1}, Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss / len(train_tensor_split):.4f}")

    # Evaluate on validation data
    model.eval()
    with torch.no_grad():
        val_reconstructed = model(val_tensor)
        reconstruction_loss = torch.mean((val_reconstructed - val_tensor) ** 2, dim=1).numpy()

    # Anomaly Detection Threshold
    threshold = np.percentile(reconstruction_loss, 95)  # Adjust the percentile as needed
    y_pred = (reconstruction_loss > threshold).astype(int)

    # Calculate AUC
    auc = roc_auc_score(y_val_split, reconstruction_loss)
    auc_scores.append(auc)
    print(f"Fold {fold + 1}, AUC Score: {auc:.4f}")

# Final Cross-Validation AUC
print(f"Final Cross-Validation AUC: {np.mean(auc_scores):.4f}")


Starting Fold 1
Fold 1, Epoch 1/50, Loss: 0.0027
Fold 1, Epoch 2/50, Loss: 0.0020
Fold 1, Epoch 3/50, Loss: 0.0018
Fold 1, Epoch 4/50, Loss: 0.0016
Fold 1, Epoch 5/50, Loss: 0.0017
Fold 1, Epoch 6/50, Loss: 0.0015
Fold 1, Epoch 7/50, Loss: 0.0014
Fold 1, Epoch 8/50, Loss: 0.0014
Fold 1, Epoch 9/50, Loss: 0.0014
Fold 1, Epoch 10/50, Loss: 0.0013
Fold 1, Epoch 11/50, Loss: 0.0012
Fold 1, Epoch 12/50, Loss: 0.0014
Fold 1, Epoch 13/50, Loss: 0.0013
Fold 1, Epoch 14/50, Loss: 0.0014
Fold 1, Epoch 15/50, Loss: 0.0013
Fold 1, Epoch 16/50, Loss: 0.0012
Fold 1, Epoch 17/50, Loss: 0.0013
Fold 1, Epoch 18/50, Loss: 0.0012
Fold 1, Epoch 19/50, Loss: 0.0012
Fold 1, Epoch 20/50, Loss: 0.0012
Fold 1, Epoch 21/50, Loss: 0.0012
Fold 1, Epoch 22/50, Loss: 0.0011
Fold 1, Epoch 23/50, Loss: 0.0011
Fold 1, Epoch 24/50, Loss: 0.0011
Fold 1, Epoch 25/50, Loss: 0.0012
Fold 1, Epoch 26/50, Loss: 0.0011
Fold 1, Epoch 27/50, Loss: 0.0011
Fold 1, Epoch 28/50, Loss: 0.0015
Fold 1, Epoch 29/50, Loss: 0.0012
Fold 1,

In [None]:
#assert not torch.isnan(train_tensor_split).any(), "NaN values found in train_tensor_split"
assert not torch.isinf(train_tensor_split).any(), "Inf values found in train_tensor_split"


In [None]:

# Prediction for Test Set
test_tensor = torch.tensor(test_df_NN, dtype=torch.float32)  # Assuming test_tensor is ready
with torch.no_grad():
    test_reconstructed = model(test_tensor)
    test_reconstruction_loss = torch.mean((test_reconstructed - test_tensor) ** 2, dim=1).numpy()

# Submission
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    "isFraud": test_reconstruction_loss  # Use reconstruction loss directly as probabilities
})
submission.to_csv("autoencoder_submission.csv", index=False)
print("Submission file created: autoencoder_submission.csv")

Submission file created: autoencoder_submission.csv


In [None]:
torch.save(model.state_dict(), "autoencoder_model.pth")
print("Model saved as 'autoencoder_model.pth'")

Model saved as 'autoencoder_model.pth'


# Satcking Logistic Regression with Catboost

In [None]:
train_processed, test_processed = preprocess_and_detect_uid(train_df, test_df, train_identity, test_identity)
test_processed.columns = test_processed.columns.str.replace('-', '_')
train_processed.replace([np.inf, -np.inf], np.nan, inplace=True)
test_processed.replace([np.inf, -np.inf], np.nan, inplace=True)

Starting preprocessing...
Mem. usage decreased to 542.35 Mb (0.0% reduction)
Mem. usage decreased to 472.59 Mb (0.0% reduction)
Preprocessing and UID detection complete.


In [None]:
filled_train, filled_test, train_y, cat_features = preprocess_for_catboost(train_processed, test_processed)

print("Preprocessed Train Shape:", filled_train.shape)
print("Preprocessed Test Shape:", filled_test.shape)
print("Categorical Features:", cat_features)


Preprocessed Train Shape: (590540, 65)
Preprocessed Test Shape: (506691, 65)
Categorical Features: [32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]


In [None]:
filled_train['stacked_log_reg'] = predicted_proba_train_logistic_Regression_baseline_for_Stacking
filled_test['stacked_log_reg'] = predicted_proba_test_logistic_Regression_baseline_for_Stacking

In [None]:
filled_train['stacked_log_reg']

Unnamed: 0,stacked_log_reg
0,0.751780
1,0.993111
2,0.022570
3,0.209198
4,0.998900
...,...
590535,0.000249
590536,0.019864
590537,0.003043
590538,0.009289


In [None]:
import optuna
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Target variable
y_train = train_processed['isFraud']

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for CatBoost
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "task_type": "GPU",
        "verbose": False,
        "random_state": 42,
    }

    # Cross-validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, val_idx in cv.split(filled_train, y_train):
        X_train, X_val = filled_train.iloc[train_idx], filled_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Create CatBoost Pool
        train_pool = Pool(X_train, y_train_fold, cat_features=cat_features)
        val_pool = Pool(X_val, y_val_fold, cat_features=cat_features)

        # Train CatBoost model
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)

        # Predict probabilities for validation set
        y_val_pred_proba = model.predict_proba(X_val)[:, 1]  # Probabilities for class 1

        # Calculate AUC
        auc = roc_auc_score(y_val_fold, y_val_pred_proba)
        auc_scores.append(auc)
        print(np.mean(auc_scores))

    # Return the mean AUC (maximize this)
    return np.mean(auc_scores)

# Run Optuna optimization
study = optuna.create_study(direction="maximize", study_name="CatBoost AUC Tuning")
study.optimize(objective, n_trials=50)

# Print best parameters
print("Best parameters:", study.best_trial.params)



[I 2024-12-08 16:48:59,080] A new study created in memory with name: CatBoost AUC Tuning


0.9589560358013585
0.9609716040234281
0.960934019884608
0.9603944105305997


[I 2024-12-08 16:51:50,362] Trial 0 finished with value: 0.9607153943222896 and parameters: {'iterations': 557, 'depth': 6, 'learning_rate': 0.45326398619337, 'l2_leaf_reg': 0.011476656803977863, 'bagging_temperature': 0.2240223010411081, 'random_strength': 5.536505896148168, 'border_count': 169}. Best is trial 0 with value: 0.9607153943222896.


0.9607153943222896
0.9351211470888625
0.9362380964563046
0.936397239262857
0.9368187776196131


[I 2024-12-08 16:53:00,744] Trial 1 finished with value: 0.9368149481176452 and parameters: {'iterations': 246, 'depth': 3, 'learning_rate': 0.10021798811709581, 'l2_leaf_reg': 0.03275453222778487, 'bagging_temperature': 0.45006997858307973, 'random_strength': 0.23060323305153618, 'border_count': 78}. Best is trial 0 with value: 0.9607153943222896.


0.9368149481176452
0.9232052920772505
0.9248984123183897
0.9248846627104422
0.9243448131661138


[I 2024-12-08 16:56:36,097] Trial 2 finished with value: 0.9246500286753486 and parameters: {'iterations': 819, 'depth': 5, 'learning_rate': 0.011074769878687456, 'l2_leaf_reg': 0.007856346140730587, 'bagging_temperature': 0.1971282954921686, 'random_strength': 7.005262862977317, 'border_count': 63}. Best is trial 0 with value: 0.9607153943222896.


0.9246500286753486
0.9251152522234092
0.9274986576461028
0.9284331599597464
0.928108175149361


[I 2024-12-08 16:59:28,034] Trial 3 finished with value: 0.9282543077972626 and parameters: {'iterations': 531, 'depth': 6, 'learning_rate': 0.013743783505078619, 'l2_leaf_reg': 2.406158886967847, 'bagging_temperature': 0.4497631024516763, 'random_strength': 1.128842105704374, 'border_count': 187}. Best is trial 0 with value: 0.9607153943222896.


0.9282543077972626
0.8400198559150478
0.8409985646805893
0.8412165311187475
0.8400385454161737


[I 2024-12-08 17:01:59,666] Trial 4 finished with value: 0.8393083063848386 and parameters: {'iterations': 942, 'depth': 3, 'learning_rate': 0.0013470601532050242, 'l2_leaf_reg': 8.970868022466693, 'bagging_temperature': 0.12303945242142, 'random_strength': 6.773371381715878, 'border_count': 151}. Best is trial 0 with value: 0.9607153943222896.


0.8393083063848386
0.8840595648828363
0.8853338460981681
0.8857519967842039
0.8857968696528696


[I 2024-12-08 17:05:09,366] Trial 5 finished with value: 0.8851511105071206 and parameters: {'iterations': 632, 'depth': 6, 'learning_rate': 0.0033595620328419375, 'l2_leaf_reg': 0.4024775430218668, 'bagging_temperature': 0.37807872532860376, 'random_strength': 7.3410929343302245, 'border_count': 219}. Best is trial 0 with value: 0.9607153943222896.


0.8851511105071206
0.7951942717632645
0.7966324885162896
0.7953189989229207
0.7939969768653938


[I 2024-12-08 17:06:31,868] Trial 6 finished with value: 0.7939494019528979 and parameters: {'iterations': 194, 'depth': 10, 'learning_rate': 0.0021691837265927975, 'l2_leaf_reg': 4.028040389185629, 'bagging_temperature': 0.5155271583818841, 'random_strength': 8.345421088979464, 'border_count': 91}. Best is trial 0 with value: 0.9607153943222896.


0.7939494019528979
0.9533090136212505
0.9556281402986897
0.9562606814134352
0.9573568128872851


[I 2024-12-08 17:08:19,015] Trial 7 finished with value: 0.9557609168851968 and parameters: {'iterations': 260, 'depth': 6, 'learning_rate': 0.22487206594873302, 'l2_leaf_reg': 0.050230426232164255, 'bagging_temperature': 0.479635907888903, 'random_strength': 7.590868289458409, 'border_count': 203}. Best is trial 0 with value: 0.9607153943222896.


0.9557609168851968
0.9423625048931006
0.9408393712716544
0.9418028321971548
0.9424495235323482


[I 2024-12-08 17:09:49,331] Trial 8 finished with value: 0.9428026076531196 and parameters: {'iterations': 419, 'depth': 3, 'learning_rate': 0.15549053173319347, 'l2_leaf_reg': 0.08010279019685321, 'bagging_temperature': 0.9367390654666601, 'random_strength': 4.98816417995036, 'border_count': 90}. Best is trial 0 with value: 0.9607153943222896.


0.9428026076531196
0.916897952939397
0.9180445801594004
0.9185172894803652
0.9187956065354475


[I 2024-12-08 17:11:54,476] Trial 9 finished with value: 0.9178141704073859 and parameters: {'iterations': 484, 'depth': 4, 'learning_rate': 0.017538746417508595, 'l2_leaf_reg': 0.19775587554425011, 'bagging_temperature': 0.6225774523253825, 'random_strength': 5.977274871728312, 'border_count': 210}. Best is trial 0 with value: 0.9607153943222896.


0.9178141704073859
0.948202958915078
0.9520872120986595
0.9535071603916272
0.9547168013259055


[I 2024-12-08 17:13:57,272] Trial 10 finished with value: 0.9550468587512064 and parameters: {'iterations': 711, 'depth': 8, 'learning_rate': 0.47608760908172953, 'l2_leaf_reg': 0.0011227384379080036, 'bagging_temperature': 0.014166907026949671, 'random_strength': 2.684386865016411, 'border_count': 252}. Best is trial 0 with value: 0.9607153943222896.


0.9550468587512064
0.9509633745094415
0.9563638285018616
0.9572941291896232
0.957894348163755


[I 2024-12-08 17:16:44,915] Trial 11 finished with value: 0.9583442390670454 and parameters: {'iterations': 354, 'depth': 8, 'learning_rate': 0.4053044508341824, 'l2_leaf_reg': 0.009855163875325148, 'bagging_temperature': 0.7438710256082182, 'random_strength': 9.336458841544943, 'border_count': 142}. Best is trial 0 with value: 0.9607153943222896.


0.9583442390670454
0.9464144376516895
0.9478787379177533
0.9475571690995949
0.9486069276234104


[I 2024-12-08 17:19:36,846] Trial 12 finished with value: 0.9492486048318369 and parameters: {'iterations': 376, 'depth': 8, 'learning_rate': 0.0547421155848247, 'l2_leaf_reg': 0.007499195228695075, 'bagging_temperature': 0.8090326966259089, 'random_strength': 9.68970695212713, 'border_count': 137}. Best is trial 0 with value: 0.9607153943222896.


0.9492486048318369
0.954177673807247
0.9572403605621427
0.9578095310567205
0.959100622552939


[I 2024-12-08 17:22:06,942] Trial 13 finished with value: 0.9572082113999281 and parameters: {'iterations': 348, 'depth': 8, 'learning_rate': 0.3510401514951259, 'l2_leaf_reg': 0.004970117258893852, 'bagging_temperature': 0.7039998964767211, 'random_strength': 4.052675727539883, 'border_count': 142}. Best is trial 0 with value: 0.9607153943222896.


0.9572082113999281
0.9605628292544528
0.9613570530099541
0.9619997460582077
0.9629926999993936


[I 2024-12-08 17:28:59,126] Trial 14 finished with value: 0.9631851406560117 and parameters: {'iterations': 632, 'depth': 10, 'learning_rate': 0.05429520798747817, 'l2_leaf_reg': 0.0014656139685710419, 'bagging_temperature': 0.2723350262445067, 'random_strength': 9.426751987733903, 'border_count': 165}. Best is trial 14 with value: 0.9631851406560117.


0.9631851406560117
0.9641912627145023
0.9647598409374546
0.9652479217728139
0.965369902289994


[I 2024-12-08 17:36:07,746] Trial 15 finished with value: 0.9654900794387944 and parameters: {'iterations': 681, 'depth': 10, 'learning_rate': 0.09537110666645382, 'l2_leaf_reg': 0.0015253830003535153, 'bagging_temperature': 0.2768519593024324, 'random_strength': 3.4921816840353954, 'border_count': 181}. Best is trial 15 with value: 0.9654900794387944.


0.9654900794387944
0.9601459867300404
0.9611684035347603
0.9620226144452876
0.9626267742770575


[I 2024-12-08 17:44:19,705] Trial 16 finished with value: 0.9629648148362406 and parameters: {'iterations': 754, 'depth': 10, 'learning_rate': 0.040825436209500174, 'l2_leaf_reg': 0.0013281223670987272, 'bagging_temperature': 0.3125887034310189, 'random_strength': 3.03895670479022, 'border_count': 243}. Best is trial 15 with value: 0.9654900794387944.


0.9629648148362406
0.9634562338186242
0.96474633840901
0.9649892735507594
0.9652678956417158


[I 2024-12-08 17:51:51,676] Trial 17 finished with value: 0.9657562588202413 and parameters: {'iterations': 878, 'depth': 9, 'learning_rate': 0.06359196495368959, 'l2_leaf_reg': 0.0029006968059049503, 'bagging_temperature': 0.016661862679708983, 'random_strength': 2.345499793318164, 'border_count': 115}. Best is trial 17 with value: 0.9657562588202413.


0.9657562588202413
0.9247225577576121
0.9260873477439688
0.9270360006461873
0.927381558372254


[I 2024-12-08 17:59:17,548] Trial 18 finished with value: 0.9277057578549324 and parameters: {'iterations': 950, 'depth': 9, 'learning_rate': 0.005349627375283845, 'l2_leaf_reg': 0.003526717014914897, 'bagging_temperature': 0.03756900453274524, 'random_strength': 2.0789028145949553, 'border_count': 118}. Best is trial 17 with value: 0.9657562588202413.


0.9277057578549324
0.966465993910778
0.9670484653627934
0.9670134861714496
0.9674064263380574


[I 2024-12-08 18:06:26,806] Trial 19 finished with value: 0.9676572286920144 and parameters: {'iterations': 836, 'depth': 9, 'learning_rate': 0.09715448909982823, 'l2_leaf_reg': 0.023104380057350743, 'bagging_temperature': 0.1206129266989493, 'random_strength': 3.4127914471868745, 'border_count': 116}. Best is trial 19 with value: 0.9676572286920144.


0.9676572286920144
0.958277373963208
0.9599613026537881
0.9603224262587776
0.9609442751790607


[I 2024-12-08 18:13:26,445] Trial 20 finished with value: 0.9614162352741873 and parameters: {'iterations': 850, 'depth': 9, 'learning_rate': 0.034557694570186856, 'l2_leaf_reg': 0.02191982562443513, 'bagging_temperature': 0.12818271258391242, 'random_strength': 4.40163587244976, 'border_count': 36}. Best is trial 19 with value: 0.9676572286920144.


0.9614162352741873
0.9650016458688234
0.965994616268631
0.9662392241569974
0.9668844045772375


[I 2024-12-08 18:20:39,975] Trial 21 finished with value: 0.9670010562880599 and parameters: {'iterations': 858, 'depth': 9, 'learning_rate': 0.10324502910932085, 'l2_leaf_reg': 0.002959069485749281, 'bagging_temperature': 0.12371133060122114, 'random_strength': 3.2709349425401055, 'border_count': 110}. Best is trial 19 with value: 0.9676572286920144.


0.9670010562880599
0.964158101999497
0.9649335700019317
0.9656844566806596
0.9664312104517867


[I 2024-12-08 18:28:04,928] Trial 22 finished with value: 0.9669047824652669 and parameters: {'iterations': 868, 'depth': 9, 'learning_rate': 0.10847727105658277, 'l2_leaf_reg': 0.0030965380159980863, 'bagging_temperature': 0.12478028957403095, 'random_strength': 1.942694663767941, 'border_count': 118}. Best is trial 19 with value: 0.9676572286920144.


0.9669047824652669
0.9628669511171014
0.9648310047194257
0.9650585180031644
0.9658624551198971


[I 2024-12-08 18:36:05,450] Trial 23 finished with value: 0.9662772192292618 and parameters: {'iterations': 985, 'depth': 9, 'learning_rate': 0.1529664844633978, 'l2_leaf_reg': 0.018783234903305645, 'bagging_temperature': 0.131762779946309, 'random_strength': 1.4389077513400073, 'border_count': 113}. Best is trial 19 with value: 0.9676572286920144.


0.9662772192292618
0.9516659816240929
0.9527614047676415
0.9531564674273098
0.9538413172955406


[I 2024-12-08 18:40:47,661] Trial 24 finished with value: 0.9539296557236602 and parameters: {'iterations': 793, 'depth': 7, 'learning_rate': 0.027737784187345516, 'l2_leaf_reg': 0.003242451805081241, 'bagging_temperature': 0.10136087604827448, 'random_strength': 3.7359744885271806, 'border_count': 102}. Best is trial 19 with value: 0.9676572286920144.


0.9539296557236602
0.9652157137104865
0.9670009692263128
0.9669539034069908
0.967373151325005


[I 2024-12-08 18:46:07,883] Trial 25 finished with value: 0.9676341645018747 and parameters: {'iterations': 894, 'depth': 7, 'learning_rate': 0.21165095489636612, 'l2_leaf_reg': 0.17437264112415893, 'bagging_temperature': 0.3491785350882037, 'random_strength': 0.2897003949569701, 'border_count': 65}. Best is trial 19 with value: 0.9676572286920144.


0.9676341645018747
0.9662596838121046
0.9670717582329623
0.9671994846687829
0.9671235813278882


[I 2024-12-08 18:51:23,025] Trial 26 finished with value: 0.9675956315320079 and parameters: {'iterations': 884, 'depth': 7, 'learning_rate': 0.26396446404401397, 'l2_leaf_reg': 0.5398385891326517, 'bagging_temperature': 0.3479964412671651, 'random_strength': 0.10488155867429533, 'border_count': 53}. Best is trial 19 with value: 0.9676572286920144.


0.9675956315320079
0.966430170486122
0.9667933766103916
0.9669833334950146
0.967353614424856


[I 2024-12-08 18:56:48,001] Trial 27 finished with value: 0.9676497353412434 and parameters: {'iterations': 914, 'depth': 7, 'learning_rate': 0.14985488280644285, 'l2_leaf_reg': 0.5809187825660769, 'bagging_temperature': 0.34443250076214577, 'random_strength': 0.5401654037942478, 'border_count': 34}. Best is trial 19 with value: 0.9676572286920144.


0.9676497353412434
0.9644396081653304
0.9658698304192843
0.9663560175241196
0.9667345201633138


[I 2024-12-08 19:02:39,459] Trial 28 finished with value: 0.9670671926717264 and parameters: {'iterations': 988, 'depth': 7, 'learning_rate': 0.21394258508034997, 'l2_leaf_reg': 0.7776902617279754, 'bagging_temperature': 0.5667661569372334, 'random_strength': 1.038674759432599, 'border_count': 34}. Best is trial 19 with value: 0.9676572286920144.


0.9670671926717264
0.9596326135489811
0.9607346175560747
0.9612905019706887
0.9619226460722714


[I 2024-12-08 19:06:06,384] Trial 29 finished with value: 0.9624608874659584 and parameters: {'iterations': 748, 'depth': 5, 'learning_rate': 0.14670515865932426, 'l2_leaf_reg': 0.17780439098919631, 'bagging_temperature': 0.2098879118954324, 'random_strength': 0.585750295488522, 'border_count': 54}. Best is trial 19 with value: 0.9676572286920144.


0.9624608874659584
0.9558803821388392
0.9560960871010293
0.9565743923691254
0.9573864139244389


[I 2024-12-08 19:10:08,910] Trial 30 finished with value: 0.9580235780287841 and parameters: {'iterations': 918, 'depth': 5, 'learning_rate': 0.07188572347240821, 'l2_leaf_reg': 1.1218788250941272, 'bagging_temperature': 0.3951852962886201, 'random_strength': 1.6292619771044619, 'border_count': 75}. Best is trial 19 with value: 0.9676572286920144.


0.9580235780287841
0.965401708438678
0.966235594677088
0.9665328588014025
0.9669157055821402


[I 2024-12-08 19:15:34,559] Trial 31 finished with value: 0.9672622714921246 and parameters: {'iterations': 912, 'depth': 7, 'learning_rate': 0.2860218378374551, 'l2_leaf_reg': 0.36675357722812907, 'bagging_temperature': 0.3569698032420829, 'random_strength': 0.8225511432977115, 'border_count': 50}. Best is trial 19 with value: 0.9676572286920144.


0.9672622714921246
0.964020142213666
0.964896300029469
0.9652694845347155
0.9655910143298858


[I 2024-12-08 19:20:25,323] Trial 32 finished with value: 0.9659182566624288 and parameters: {'iterations': 798, 'depth': 7, 'learning_rate': 0.24123273471724313, 'l2_leaf_reg': 0.1083764986112296, 'bagging_temperature': 0.30965478086324416, 'random_strength': 0.09632389972573757, 'border_count': 68}. Best is trial 19 with value: 0.9676572286920144.


0.9659182566624288
0.9630777171138626
0.9642918044226086
0.9651497656913238
0.9655135885722443


[I 2024-12-08 19:24:39,144] Trial 33 finished with value: 0.966026651949185 and parameters: {'iterations': 815, 'depth': 6, 'learning_rate': 0.17485868898639167, 'l2_leaf_reg': 0.8245455919052341, 'bagging_temperature': 0.21993224483960344, 'random_strength': 0.05266536214208162, 'border_count': 49}. Best is trial 19 with value: 0.9676572286920144.


0.966026651949185
0.9662281740195505
0.9675171473830091
0.9670683536907866
0.9675256479312475


[I 2024-12-08 19:30:22,647] Trial 34 finished with value: 0.9674853730151727 and parameters: {'iterations': 989, 'depth': 7, 'learning_rate': 0.3051234507911496, 'l2_leaf_reg': 0.35944271204059935, 'bagging_temperature': 0.39381593852670677, 'random_strength': 0.5806630029710743, 'border_count': 85}. Best is trial 19 with value: 0.9676572286920144.


0.9674853730151727
0.9265832832557062
0.9283570714960389
0.9293299824780205
0.9292809523949478


[I 2024-12-08 19:31:10,573] Trial 35 finished with value: 0.9299974991657287 and parameters: {'iterations': 103, 'depth': 8, 'learning_rate': 0.49754447978246097, 'l2_leaf_reg': 1.7238042608677429, 'bagging_temperature': 0.43991560561836635, 'random_strength': 1.5907338754274927, 'border_count': 65}. Best is trial 19 with value: 0.9676572286920144.


0.9299974991657287
0.9589962822118222
0.960235926437121
0.9604933924560003
0.9611443541984104


[I 2024-12-08 19:34:36,976] Trial 36 finished with value: 0.9614875125226112 and parameters: {'iterations': 636, 'depth': 6, 'learning_rate': 0.11511330092299564, 'l2_leaf_reg': 0.042350440097539015, 'bagging_temperature': 0.5866464667748734, 'random_strength': 0.013917095256119102, 'border_count': 42}. Best is trial 19 with value: 0.9676572286920144.


0.9614875125226112
0.9631461767804108
0.9637996946980432
0.9643969818266124
0.9648787076788945


[I 2024-12-08 19:39:59,303] Trial 37 finished with value: 0.9652192906965272 and parameters: {'iterations': 904, 'depth': 7, 'learning_rate': 0.08323089215533275, 'l2_leaf_reg': 0.21413522175469377, 'bagging_temperature': 0.26300913412471494, 'random_strength': 0.7629794519822716, 'border_count': 57}. Best is trial 19 with value: 0.9676572286920144.


0.9652192906965272
0.9100964550221031
0.9115980698514611
0.9126016586175622
0.9128280016462776


[I 2024-12-08 19:43:15,933] Trial 38 finished with value: 0.9128311167748151 and parameters: {'iterations': 738, 'depth': 5, 'learning_rate': 0.007742858822296521, 'l2_leaf_reg': 0.08270950002064431, 'bagging_temperature': 0.33907427059238554, 'random_strength': 6.048673005224808, 'border_count': 74}. Best is trial 19 with value: 0.9676572286920144.


0.9128311167748151
0.955777674590347
0.9564707058239397
0.9579084638259975
0.9584614553184269


[I 2024-12-08 19:46:41,833] Trial 39 finished with value: 0.9590168070775809 and parameters: {'iterations': 940, 'depth': 4, 'learning_rate': 0.1812382930386544, 'l2_leaf_reg': 4.172893914171999, 'bagging_temperature': 0.49756044922755016, 'random_strength': 2.5353870420931433, 'border_count': 129}. Best is trial 19 with value: 0.9676572286920144.


0.9590168070775809
0.9640572874379694
0.9641250405521761
0.9643233356394215
0.9649696520150375


[I 2024-12-08 19:49:49,512] Trial 40 finished with value: 0.9651574400041177 and parameters: {'iterations': 550, 'depth': 6, 'learning_rate': 0.3128714943220283, 'l2_leaf_reg': 0.5086172272735641, 'bagging_temperature': 0.4198718414659204, 'random_strength': 1.1238141064902507, 'border_count': 97}. Best is trial 19 with value: 0.9676572286920144.


0.9651574400041177
0.9646788188658253
0.9660710645949749
0.9664473336135861
0.9672390447367818


[I 2024-12-08 19:55:38,444] Trial 41 finished with value: 0.9677223623099167 and parameters: {'iterations': 977, 'depth': 7, 'learning_rate': 0.2826326158602375, 'l2_leaf_reg': 0.3072312497021844, 'bagging_temperature': 0.1810000291777522, 'random_strength': 0.6166119609138719, 'border_count': 81}. Best is trial 41 with value: 0.9677223623099167.


0.9677223623099167
0.9661209564269606
0.9671117498386745
0.9661975704066129
0.9664751715441154


[I 2024-12-08 20:01:34,936] Trial 42 finished with value: 0.9665090861183587 and parameters: {'iterations': 830, 'depth': 8, 'learning_rate': 0.2312540898815097, 'l2_leaf_reg': 0.24507667159108185, 'bagging_temperature': 0.19695482337222775, 'random_strength': 0.45134960442446037, 'border_count': 85}. Best is trial 41 with value: 0.9677223623099167.


0.9665090861183587
0.9657204288384348
0.9667972830857345
0.9669662760716111
0.9676963837508841


[I 2024-12-08 20:07:13,266] Trial 43 finished with value: 0.9680341026849497 and parameters: {'iterations': 958, 'depth': 7, 'learning_rate': 0.13855689881749275, 'l2_leaf_reg': 0.6113539009883876, 'bagging_temperature': 0.17710574014981362, 'random_strength': 1.3229705738948407, 'border_count': 62}. Best is trial 43 with value: 0.9680341026849497.


0.9680341026849497
0.9632664467463845
0.9644042794865664
0.9649921552543628
0.9653621993685517


[I 2024-12-08 20:12:05,125] Trial 44 finished with value: 0.9657651971353858 and parameters: {'iterations': 952, 'depth': 6, 'learning_rate': 0.12487116302886611, 'l2_leaf_reg': 0.12630378560991348, 'bagging_temperature': 0.17360388137699234, 'random_strength': 1.3772190256357204, 'border_count': 76}. Best is trial 43 with value: 0.9680341026849497.


0.9657651971353858
0.9503222762000607
0.9519771268909316
0.9524925134836605
0.9530072668479665


[I 2024-12-08 20:18:22,750] Trial 45 finished with value: 0.953374931497482 and parameters: {'iterations': 948, 'depth': 8, 'learning_rate': 0.020042025897397023, 'l2_leaf_reg': 2.1177197190268515, 'bagging_temperature': 0.24049424584209622, 'random_strength': 4.923249027628453, 'border_count': 64}. Best is trial 43 with value: 0.9680341026849497.


0.953374931497482
0.9635358942773639
0.9649559418590935
0.9649924349028233
0.9655259534871335


[I 2024-12-08 20:23:27,915] Trial 46 finished with value: 0.9653267146862514 and parameters: {'iterations': 985, 'depth': 6, 'learning_rate': 0.376972086710624, 'l2_leaf_reg': 0.05777031429000985, 'bagging_temperature': 0.06720516122580364, 'random_strength': 1.9605261938181289, 'border_count': 157}. Best is trial 43 with value: 0.9680341026849497.


0.9653267146862514
0.8614860456857445
0.863003278984669
0.8636583498070222
0.8625790628621515


[I 2024-12-08 20:28:16,141] Trial 47 finished with value: 0.8621720592401136 and parameters: {'iterations': 905, 'depth': 7, 'learning_rate': 0.0012805016378676494, 'l2_leaf_reg': 1.1506386347131325, 'bagging_temperature': 0.166374421779822, 'random_strength': 1.1298150756792125, 'border_count': 40}. Best is trial 43 with value: 0.9680341026849497.


0.8621720592401136
0.9617858103107048
0.9635313895416986
0.9637594880346508
0.9642038886434068


[I 2024-12-08 20:33:57,723] Trial 48 finished with value: 0.9643102869751174 and parameters: {'iterations': 797, 'depth': 8, 'learning_rate': 0.18200937937524891, 'l2_leaf_reg': 0.28310069591566106, 'bagging_temperature': 0.9797257026522773, 'random_strength': 2.8241524957938022, 'border_count': 99}. Best is trial 43 with value: 0.9680341026849497.


0.9643102869751174
0.9459060108678643
0.9472824942689315
0.9476508078333951
0.9483578692198142


[I 2024-12-08 20:36:43,067] Trial 49 finished with value: 0.9487187224402718 and parameters: {'iterations': 700, 'depth': 4, 'learning_rate': 0.04690662989318129, 'l2_leaf_reg': 0.13180821967242326, 'bagging_temperature': 0.06729213916605695, 'random_strength': 2.2040551445135046, 'border_count': 129}. Best is trial 43 with value: 0.9680341026849497.


0.9487187224402718
Best parameters: {'iterations': 958, 'depth': 7, 'learning_rate': 0.13855689881749275, 'l2_leaf_reg': 0.6113539009883876, 'bagging_temperature': 0.17710574014981362, 'random_strength': 1.3229705738948407, 'border_count': 62}


In [None]:
# Train the final model with the best parameters on the full dataset
#best_params = study.best_trial.params
best_params = {'iterations': 958, 'depth': 7, 'learning_rate': 0.13855689881749275, 'task_type': 'GPU', 'l2_leaf_reg': 0.6113539009883876, 'bagging_temperature': 0.17710574014981362, 'random_strength': 1.3229705738948407, 'border_count': 62}
final_pool = Pool(filled_train, y_train, cat_features=cat_features)
final_model = CatBoostClassifier(**best_params, verbose=100)
final_model.fit(final_pool)

# Make predictions on the test set
test_pool = Pool(filled_test, cat_features=cat_features)
test_pred_proba = final_model.predict_proba(test_pool)[:, 1]

# Create submission
submission = pd.DataFrame({
    "TransactionID": test_processed["TransactionID"],
    "isFraud": test_pred_proba
})
submission.to_csv("catboost_log_reg_stacked_auc_submission.csv", index=False)

print("Submission file created: catboost_auc_submission.csv")

0:	learn: 0.4893707	total: 79.7ms	remaining: 1m 16s
100:	learn: 0.0710812	total: 7.17s	remaining: 1m
200:	learn: 0.0635219	total: 14.8s	remaining: 55.6s
300:	learn: 0.0593398	total: 22.3s	remaining: 48.6s
400:	learn: 0.0564467	total: 29.4s	remaining: 40.9s
500:	learn: 0.0546512	total: 36.7s	remaining: 33.5s
600:	learn: 0.0525618	total: 44s	remaining: 26.1s
700:	learn: 0.0507231	total: 51.1s	remaining: 18.7s
800:	learn: 0.0490758	total: 58.3s	remaining: 11.4s
900:	learn: 0.0476297	total: 1m 5s	remaining: 4.15s
957:	learn: 0.0468720	total: 1m 9s	remaining: 0us
Submission file created: catboost_auc_submission.csv


In [None]:
filled_test['stacked_log_reg']

Unnamed: 0,stacked_log_reg
0,0.999997
1,0.999885
2,0.999993
3,1.000000
4,1.000000
...,...
506686,1.000000
506687,0.999928
506688,0.994934
506689,0.999972


In [None]:
import optuna
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Target variable
y_train = train_processed['isFraud']

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters for CatBoost
    params = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.5, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "task_type": "GPU",
        "verbose": False,
        "random_state": 42,
    }

    # Cross-validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    roc_auc_scores = []
    pr_auc_scores = []

    for train_idx, val_idx in cv.split(filled_train, y_train):
        X_train, X_val = filled_train.iloc[train_idx], filled_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Create CatBoost Pool
        train_pool = Pool(X_train, y_train_fold, cat_features=cat_features)
        val_pool = Pool(X_val, y_val_fold, cat_features=cat_features)

        # Train CatBoost model
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)

        # Predict probabilities for validation set
        y_val_pred_proba = model.predict_proba(X_val)[:, 1]  # Probabilities for class 1

        # Calculate ROC AUC
        roc_auc = roc_auc_score(y_val_fold, y_val_pred_proba)
        roc_auc_scores.append(roc_auc)

        # Calculate Precision-Recall AUC
        precision, recall, _ = precision_recall_curve(y_val_fold, y_val_pred_proba)
        pr_auc = auc(recall, precision)
        pr_auc_scores.append(pr_auc)

    # Save metrics for this trial
    trial.set_user_attr("roc_auc_scores", roc_auc_scores)
    trial.set_user_attr("pr_auc_scores", pr_auc_scores)
    print(f'ROC AUC: {np.mean(roc_auc_scores)}')
    print(f'PR AUC: {np.mean(pr_auc_scores)}')
    # Return the mean ROC AUC and PR AUC
    return np.mean(roc_auc_scores), np.mean(pr_auc_scores)

# Run Optuna multi-objective optimization
study = optuna.create_study(
    directions=["maximize", "maximize"],  # Multi-objective optimization
    study_name="CatBoost Multi-Objective Tuning"
)
study.optimize(objective, n_trials=50)

# Save metrics to DataFrame for later analysis
results = []
for trial in study.trials:
    results.append({
        "trial_number": trial.number,
        "roc_auc": np.mean(trial.user_attrs["roc_auc_scores"]),
        "pr_auc": np.mean(trial.user_attrs["pr_auc_scores"])
    })

metrics_df = pd.DataFrame(results)
metrics_df.to_csv("optuna_metrics.csv", index=False)

# Print best parameters
print("Best ROC AUC Trial:")
print(study.best_trials[0].values)
print("Best PR AUC Trial:")
print(study.best_trials[1].values)


[I 2024-12-08 21:43:24,296] A new study created in memory with name: CatBoost Multi-Objective Tuning
[I 2024-12-08 21:50:20,342] Trial 0 finished with values: [0.9644038163549361, 0.8325089024842246] and parameters: {'iterations': 630, 'depth': 10, 'learning_rate': 0.1535726401767315, 'l2_leaf_reg': 0.24527383446914192, 'bagging_temperature': 0.7758189094916633, 'random_strength': 1.9454538767507445, 'border_count': 52}.


ROC AUC: 0.9644038163549361
PR AUC: 0.8325089024842246


[I 2024-12-08 21:52:22,880] Trial 1 finished with values: [0.8938027582853069, 0.5408833680578891] and parameters: {'iterations': 679, 'depth': 3, 'learning_rate': 0.004845294259107338, 'l2_leaf_reg': 0.3721982120361202, 'bagging_temperature': 0.8158767212007675, 'random_strength': 0.45685457709192967, 'border_count': 32}.


ROC AUC: 0.8938027582853069
PR AUC: 0.5408833680578891


[I 2024-12-08 21:58:30,692] Trial 2 finished with values: [0.9407119671752021, 0.7129449967594199] and parameters: {'iterations': 966, 'depth': 8, 'learning_rate': 0.013284223756845825, 'l2_leaf_reg': 1.1370348208471195, 'bagging_temperature': 0.9228547607338592, 'random_strength': 8.826745757097349, 'border_count': 107}.


ROC AUC: 0.9407119671752021
PR AUC: 0.7129449967594199


[I 2024-12-08 22:01:47,466] Trial 3 finished with values: [0.84459198876316, 0.37821438358809806] and parameters: {'iterations': 672, 'depth': 6, 'learning_rate': 0.0017396790380804048, 'l2_leaf_reg': 0.007735682623715023, 'bagging_temperature': 0.363447840784334, 'random_strength': 6.96320355898915, 'border_count': 205}.


ROC AUC: 0.84459198876316
PR AUC: 0.37821438358809806


[I 2024-12-08 22:06:59,453] Trial 4 finished with values: [0.9627700734376609, 0.8068664554515074] and parameters: {'iterations': 886, 'depth': 7, 'learning_rate': 0.08385340887738532, 'l2_leaf_reg': 9.531440281837588, 'bagging_temperature': 0.4479534428272809, 'random_strength': 0.885400700400395, 'border_count': 47}.


ROC AUC: 0.9627700734376609
PR AUC: 0.8068664554515074


[I 2024-12-08 22:09:41,658] Trial 5 finished with values: [0.9237896694643869, 0.6607974981473744] and parameters: {'iterations': 676, 'depth': 4, 'learning_rate': 0.01460069290208518, 'l2_leaf_reg': 0.49822796096985805, 'bagging_temperature': 0.6938381690844493, 'random_strength': 4.2572741867392665, 'border_count': 173}.


ROC AUC: 0.9237896694643869
PR AUC: 0.6607974981473744


[I 2024-12-08 22:16:09,167] Trial 6 finished with values: [0.9628557204113525, 0.8090568901531491] and parameters: {'iterations': 932, 'depth': 8, 'learning_rate': 0.04529503426104619, 'l2_leaf_reg': 0.14591351690899035, 'bagging_temperature': 0.2892184731137417, 'random_strength': 1.998102662930057, 'border_count': 69}.


ROC AUC: 0.9628557204113525
PR AUC: 0.8090568901531491


[I 2024-12-08 22:20:39,374] Trial 7 finished with values: [0.8572728924526387, 0.42322957386770776] and parameters: {'iterations': 979, 'depth': 6, 'learning_rate': 0.0013852011420123845, 'l2_leaf_reg': 8.978450024455293, 'bagging_temperature': 0.9448114546942342, 'random_strength': 7.595970681926913, 'border_count': 188}.


ROC AUC: 0.8572728924526387
PR AUC: 0.42322957386770776


[I 2024-12-08 22:23:31,985] Trial 8 finished with values: [0.8037354531111396, 0.2589188832472812] and parameters: {'iterations': 567, 'depth': 6, 'learning_rate': 0.0012430840451180683, 'l2_leaf_reg': 0.06192685053253109, 'bagging_temperature': 0.5916233165024478, 'random_strength': 6.6772432252418445, 'border_count': 185}.


ROC AUC: 0.8037354531111396
PR AUC: 0.2589188832472812


[I 2024-12-08 22:24:20,001] Trial 9 finished with values: [0.8627078624438539, 0.40499000234864413] and parameters: {'iterations': 106, 'depth': 8, 'learning_rate': 0.01529156973513774, 'l2_leaf_reg': 0.01821746983474974, 'bagging_temperature': 0.5652201313547309, 'random_strength': 1.2882907746027539, 'border_count': 51}.


ROC AUC: 0.8627078624438539
PR AUC: 0.40499000234864413


[I 2024-12-08 22:26:37,393] Trial 10 finished with values: [0.9629263668187841, 0.8117741413784916] and parameters: {'iterations': 265, 'depth': 8, 'learning_rate': 0.3820441674452696, 'l2_leaf_reg': 7.464805566198541, 'bagging_temperature': 0.3643378037654279, 'random_strength': 4.666446234105621, 'border_count': 158}.


ROC AUC: 0.9629263668187841
PR AUC: 0.8117741413784916


[I 2024-12-08 22:30:03,823] Trial 11 finished with values: [0.9507515262534172, 0.7559574696418154] and parameters: {'iterations': 760, 'depth': 5, 'learning_rate': 0.04403427936543469, 'l2_leaf_reg': 0.2684416614846318, 'bagging_temperature': 0.5771698767676026, 'random_strength': 8.607712710346666, 'border_count': 91}.


ROC AUC: 0.9507515262534172
PR AUC: 0.7559574696418154


[I 2024-12-08 22:35:01,942] Trial 12 finished with values: [0.8830418528895587, 0.5237618243415256] and parameters: {'iterations': 916, 'depth': 7, 'learning_rate': 0.0017484150596376005, 'l2_leaf_reg': 0.19225657643226832, 'bagging_temperature': 0.4992885931780744, 'random_strength': 1.6151832528596033, 'border_count': 202}.


ROC AUC: 0.8830418528895587
PR AUC: 0.5237618243415256


[I 2024-12-08 22:41:58,761] Trial 13 finished with values: [0.9575597425890964, 0.7818891589054047] and parameters: {'iterations': 858, 'depth': 9, 'learning_rate': 0.023199605301906994, 'l2_leaf_reg': 1.4629430136027024, 'bagging_temperature': 0.14156532552190915, 'random_strength': 3.695025452733971, 'border_count': 32}.


ROC AUC: 0.9575597425890964
PR AUC: 0.7818891589054047


[I 2024-12-08 22:42:47,833] Trial 14 finished with values: [0.9040993130666315, 0.5494016558607171] and parameters: {'iterations': 166, 'depth': 7, 'learning_rate': 0.07304865140394949, 'l2_leaf_reg': 0.0025851086080566593, 'bagging_temperature': 0.632187230194125, 'random_strength': 9.154840359313223, 'border_count': 47}.


ROC AUC: 0.9040993130666315
PR AUC: 0.5494016558607171


[I 2024-12-08 22:44:13,071] Trial 15 finished with values: [0.9183692981650428, 0.6378371570695733] and parameters: {'iterations': 363, 'depth': 3, 'learning_rate': 0.030207928066129618, 'l2_leaf_reg': 0.7875433807510372, 'bagging_temperature': 0.5930558194025569, 'random_strength': 2.3842813760577783, 'border_count': 166}.


ROC AUC: 0.9183692981650428
PR AUC: 0.6378371570695733


[I 2024-12-08 22:51:59,918] Trial 16 finished with values: [0.8367190289917412, 0.31701107682786] and parameters: {'iterations': 818, 'depth': 10, 'learning_rate': 0.0010221702549187872, 'l2_leaf_reg': 0.0024953441189330718, 'bagging_temperature': 0.7959977008973913, 'random_strength': 9.101666511776369, 'border_count': 153}.


ROC AUC: 0.8367190289917412
PR AUC: 0.31701107682786


[I 2024-12-08 22:56:11,052] Trial 17 finished with values: [0.9338444940632525, 0.690005708189714] and parameters: {'iterations': 731, 'depth': 7, 'learning_rate': 0.013686411631919023, 'l2_leaf_reg': 0.7597016387316281, 'bagging_temperature': 0.7655186245768262, 'random_strength': 7.857731357896407, 'border_count': 69}.


ROC AUC: 0.9338444940632525
PR AUC: 0.690005708189714


[I 2024-12-08 23:01:59,916] Trial 18 finished with values: [0.9667078513937983, 0.8365462137252907] and parameters: {'iterations': 828, 'depth': 8, 'learning_rate': 0.3799307108843257, 'l2_leaf_reg': 9.22304912417145, 'bagging_temperature': 0.6096237219531504, 'random_strength': 3.5622467058641503, 'border_count': 146}.


ROC AUC: 0.9667078513937983
PR AUC: 0.8365462137252907


[I 2024-12-08 23:09:39,017] Trial 19 finished with values: [0.961982104144018, 0.8032142555182581] and parameters: {'iterations': 704, 'depth': 10, 'learning_rate': 0.0532115840769887, 'l2_leaf_reg': 2.7530137713782104, 'bagging_temperature': 0.919211889184508, 'random_strength': 1.7513052618324043, 'border_count': 159}.


ROC AUC: 0.961982104144018
PR AUC: 0.8032142555182581


[I 2024-12-08 23:15:23,991] Trial 20 finished with values: [0.9601736429516542, 0.7972592256470955] and parameters: {'iterations': 992, 'depth': 7, 'learning_rate': 0.03972729030995201, 'l2_leaf_reg': 0.42348398098672085, 'bagging_temperature': 0.09201463551983147, 'random_strength': 5.137525057134679, 'border_count': 111}.


ROC AUC: 0.9601736429516542
PR AUC: 0.7972592256470955


[I 2024-12-08 23:20:40,126] Trial 21 finished with values: [0.9669135604802722, 0.8305425966913023] and parameters: {'iterations': 886, 'depth': 7, 'learning_rate': 0.13103548213774716, 'l2_leaf_reg': 3.0254418646722065, 'bagging_temperature': 0.013723284576605543, 'random_strength': 1.6122272051199904, 'border_count': 244}.


ROC AUC: 0.9669135604802722
PR AUC: 0.8305425966913023


[I 2024-12-08 23:23:00,273] Trial 22 finished with values: [0.9423302238199149, 0.7194620518982822] and parameters: {'iterations': 811, 'depth': 3, 'learning_rate': 0.048462962325120845, 'l2_leaf_reg': 1.1126277380402383, 'bagging_temperature': 0.303102131176308, 'random_strength': 2.8932453574678285, 'border_count': 161}.


ROC AUC: 0.9423302238199149
PR AUC: 0.7194620518982822


[I 2024-12-08 23:26:33,175] Trial 23 finished with values: [0.9498129803879823, 0.7553671649195486] and parameters: {'iterations': 655, 'depth': 6, 'learning_rate': 0.037122307483569005, 'l2_leaf_reg': 0.0027279826633985634, 'bagging_temperature': 0.8719254568407071, 'random_strength': 1.8660899966037525, 'border_count': 242}.


ROC AUC: 0.9498129803879823
PR AUC: 0.7553671649195486


[I 2024-12-08 23:32:41,813] Trial 24 finished with values: [0.8458121711674631, 0.3596904864823739] and parameters: {'iterations': 839, 'depth': 9, 'learning_rate': 0.0012052593336467183, 'l2_leaf_reg': 0.01069958155234592, 'bagging_temperature': 0.8178892306684621, 'random_strength': 8.927122879951567, 'border_count': 106}.


ROC AUC: 0.8458121711674631
PR AUC: 0.3596904864823739


[I 2024-12-08 23:34:30,211] Trial 25 finished with values: [0.8941727303142141, 0.5634307184804046] and parameters: {'iterations': 547, 'depth': 3, 'learning_rate': 0.009945803322506329, 'l2_leaf_reg': 0.04195478736493394, 'bagging_temperature': 0.1487827253951517, 'random_strength': 4.4325071912933165, 'border_count': 67}.


ROC AUC: 0.8941727303142141
PR AUC: 0.5634307184804046


[I 2024-12-08 23:35:33,041] Trial 26 finished with values: [0.8682141535045395, 0.41945063031151575] and parameters: {'iterations': 166, 'depth': 9, 'learning_rate': 0.010386379225581614, 'l2_leaf_reg': 0.46344139167580123, 'bagging_temperature': 0.8845483460855965, 'random_strength': 6.657109976115459, 'border_count': 65}.


ROC AUC: 0.8682141535045395
PR AUC: 0.41945063031151575


[I 2024-12-08 23:42:10,424] Trial 27 finished with values: [0.9658168890420201, 0.8218899268998868] and parameters: {'iterations': 597, 'depth': 10, 'learning_rate': 0.07329274167515952, 'l2_leaf_reg': 1.7819423636253182, 'bagging_temperature': 0.15356976794755905, 'random_strength': 9.47912446225242, 'border_count': 162}.


ROC AUC: 0.9658168890420201
PR AUC: 0.8218899268998868


[I 2024-12-08 23:46:22,268] Trial 28 finished with values: [0.8523866740789241, 0.3963352055929327] and parameters: {'iterations': 537, 'depth': 9, 'learning_rate': 0.002068169395078207, 'l2_leaf_reg': 1.0562606983682108, 'bagging_temperature': 0.5753554653881864, 'random_strength': 7.760653746397025, 'border_count': 146}.


ROC AUC: 0.8523866740789241
PR AUC: 0.3963352055929327


[I 2024-12-08 23:49:24,412] Trial 29 finished with values: [0.9577044067715403, 0.7926632307179178] and parameters: {'iterations': 800, 'depth': 4, 'learning_rate': 0.3306344553015811, 'l2_leaf_reg': 0.1433764569046337, 'bagging_temperature': 0.8631669947138534, 'random_strength': 2.4974763273936276, 'border_count': 173}.


ROC AUC: 0.9577044067715403
PR AUC: 0.7926632307179178


[I 2024-12-08 23:52:25,266] Trial 30 finished with values: [0.934601190842044, 0.6918039745953101] and parameters: {'iterations': 560, 'depth': 6, 'learning_rate': 0.0228042022788295, 'l2_leaf_reg': 0.46758495471851286, 'bagging_temperature': 0.8762522512762608, 'random_strength': 8.746589938222849, 'border_count': 152}.


ROC AUC: 0.934601190842044
PR AUC: 0.6918039745953101


[I 2024-12-08 23:54:24,292] Trial 31 finished with values: [0.9279866388427797, 0.6680942094940209] and parameters: {'iterations': 623, 'depth': 3, 'learning_rate': 0.02851075219142329, 'l2_leaf_reg': 0.16044630668245033, 'bagging_temperature': 0.4308069293958934, 'random_strength': 3.8503302000788144, 'border_count': 178}.


ROC AUC: 0.9279866388427797
PR AUC: 0.6680942094940209


[I 2024-12-08 23:57:46,402] Trial 32 finished with values: [0.9527084536024342, 0.7631700512730306] and parameters: {'iterations': 532, 'depth': 7, 'learning_rate': 0.034862918308871935, 'l2_leaf_reg': 0.3233708366475466, 'bagging_temperature': 0.06225091380321579, 'random_strength': 2.4476160015369053, 'border_count': 54}.


ROC AUC: 0.9527084536024342
PR AUC: 0.7631700512730306


[I 2024-12-08 23:59:46,888] Trial 33 finished with values: [0.941068614357859, 0.7172674149563782] and parameters: {'iterations': 449, 'depth': 4, 'learning_rate': 0.04079019874565134, 'l2_leaf_reg': 5.507105921930276, 'bagging_temperature': 0.16670965587784803, 'random_strength': 0.8087372456231023, 'border_count': 108}.


ROC AUC: 0.941068614357859
PR AUC: 0.7172674149563782


[I 2024-12-09 00:06:20,813] Trial 34 finished with values: [0.9652361090489201, 0.8223310271899666] and parameters: {'iterations': 758, 'depth': 9, 'learning_rate': 0.06483783684193602, 'l2_leaf_reg': 0.009548342622812627, 'bagging_temperature': 0.19549717071975525, 'random_strength': 3.2986088687932846, 'border_count': 177}.


ROC AUC: 0.9652361090489201
PR AUC: 0.8223310271899666


[I 2024-12-09 00:09:52,898] Trial 35 finished with values: [0.9286058435690361, 0.6825671678234302] and parameters: {'iterations': 315, 'depth': 10, 'learning_rate': 0.017017693213917762, 'l2_leaf_reg': 0.012094905537970511, 'bagging_temperature': 0.6374269232162562, 'random_strength': 4.425378902407785, 'border_count': 47}.


ROC AUC: 0.9286058435690361
PR AUC: 0.6825671678234302


[I 2024-12-09 00:13:23,305] Trial 36 finished with values: [0.9601276730029953, 0.7984573832797619] and parameters: {'iterations': 967, 'depth': 4, 'learning_rate': 0.14251632081387228, 'l2_leaf_reg': 5.474188549904893, 'bagging_temperature': 0.04275247894611878, 'random_strength': 4.658731677087099, 'border_count': 103}.


ROC AUC: 0.9601276730029953
PR AUC: 0.7984573832797619


[I 2024-12-09 00:15:44,617] Trial 37 finished with values: [0.9305477864339983, 0.6773448312255586] and parameters: {'iterations': 819, 'depth': 3, 'learning_rate': 0.026201516736670136, 'l2_leaf_reg': 5.083654485810885, 'bagging_temperature': 0.33008491520279726, 'random_strength': 6.97247933480913, 'border_count': 140}.


ROC AUC: 0.9305477864339983
PR AUC: 0.6773448312255586


[I 2024-12-09 00:19:37,945] Trial 38 finished with values: [0.9425184728299915, 0.7199453782264047] and parameters: {'iterations': 768, 'depth': 6, 'learning_rate': 0.020725897914360678, 'l2_leaf_reg': 0.5889755410610802, 'bagging_temperature': 0.2071114787645182, 'random_strength': 9.008416205580248, 'border_count': 167}.


ROC AUC: 0.9425184728299915
PR AUC: 0.7199453782264047


[I 2024-12-09 00:20:24,497] Trial 39 finished with values: [0.8806672881139054, 0.4854416234965651] and parameters: {'iterations': 170, 'depth': 6, 'learning_rate': 0.024661074135151926, 'l2_leaf_reg': 0.003917137201461307, 'bagging_temperature': 0.8119609337122089, 'random_strength': 5.750072067474511, 'border_count': 42}.


ROC AUC: 0.8806672881139054
PR AUC: 0.4854416234965651


[I 2024-12-09 00:24:08,879] Trial 40 finished with values: [0.9246084652629885, 0.6701060538972122] and parameters: {'iterations': 550, 'depth': 8, 'learning_rate': 0.010732521135453229, 'l2_leaf_reg': 0.004519398344234004, 'bagging_temperature': 0.8093884178222195, 'random_strength': 6.319217627755586, 'border_count': 58}.


ROC AUC: 0.9246084652629885
PR AUC: 0.6701060538972122


[I 2024-12-09 00:26:12,074] Trial 41 finished with values: [0.8316523941043309, 0.3198025246569121] and parameters: {'iterations': 515, 'depth': 4, 'learning_rate': 0.0014668199404824036, 'l2_leaf_reg': 0.0983962550537702, 'bagging_temperature': 0.41953984010843814, 'random_strength': 0.12307016747270083, 'border_count': 117}.


ROC AUC: 0.8316523941043309
PR AUC: 0.3198025246569121


[I 2024-12-09 00:28:46,217] Trial 42 finished with values: [0.9003714549500275, 0.5963561485719316] and parameters: {'iterations': 645, 'depth': 4, 'learning_rate': 0.006522873979314234, 'l2_leaf_reg': 0.002948977140193092, 'bagging_temperature': 0.5433365567102146, 'random_strength': 2.7361125453949287, 'border_count': 78}.


ROC AUC: 0.9003714549500275
PR AUC: 0.5963561485719316


[I 2024-12-09 00:34:24,744] Trial 43 finished with values: [0.9598211507376437, 0.797110162523432] and parameters: {'iterations': 514, 'depth': 10, 'learning_rate': 0.05020365166879048, 'l2_leaf_reg': 0.02219184208797571, 'bagging_temperature': 0.3496931975939318, 'random_strength': 7.212460659983586, 'border_count': 170}.


ROC AUC: 0.9598211507376437
PR AUC: 0.797110162523432


[I 2024-12-09 00:35:13,597] Trial 44 finished with values: [0.7859803274596727, 0.23148465185902492] and parameters: {'iterations': 156, 'depth': 7, 'learning_rate': 0.00417918107653418, 'l2_leaf_reg': 1.7155972700452597, 'bagging_temperature': 0.1617843556143055, 'random_strength': 9.882372487498271, 'border_count': 120}.


ROC AUC: 0.7859803274596727
PR AUC: 0.23148465185902492


[I 2024-12-09 00:44:34,270] Trial 45 finished with values: [0.9643307706730733, 0.8149569918189282] and parameters: {'iterations': 876, 'depth': 10, 'learning_rate': 0.0411149898794545, 'l2_leaf_reg': 0.6447287736563039, 'bagging_temperature': 0.4645606939507445, 'random_strength': 3.5210251383613587, 'border_count': 137}.


ROC AUC: 0.9643307706730733
PR AUC: 0.8149569918189282


[I 2024-12-09 00:49:48,433] Trial 46 finished with values: [0.9612814955822355, 0.8026563707839538] and parameters: {'iterations': 878, 'depth': 7, 'learning_rate': 0.04783761282580409, 'l2_leaf_reg': 0.006737536123045816, 'bagging_temperature': 0.08337873992946798, 'random_strength': 4.397731239645272, 'border_count': 236}.


ROC AUC: 0.9612814955822355
PR AUC: 0.8026563707839538


[I 2024-12-09 00:53:18,355] Trial 47 finished with values: [0.955326458425492, 0.7760591703501679] and parameters: {'iterations': 774, 'depth': 5, 'learning_rate': 0.06525400685969639, 'l2_leaf_reg': 3.4908110247933015, 'bagging_temperature': 0.3245741369058702, 'random_strength': 6.180591204175623, 'border_count': 137}.


ROC AUC: 0.955326458425492
PR AUC: 0.7760591703501679


[I 2024-12-09 00:54:02,167] Trial 48 finished with values: [0.8605575379521021, 0.39863332067479806] and parameters: {'iterations': 176, 'depth': 3, 'learning_rate': 0.01326700333907043, 'l2_leaf_reg': 2.256086049670873, 'bagging_temperature': 0.8732120292014017, 'random_strength': 4.885625452841248, 'border_count': 53}.


ROC AUC: 0.8605575379521021
PR AUC: 0.39863332067479806


[I 2024-12-09 00:54:46,077] Trial 49 finished with values: [0.7709100028733026, 0.18291348708084565] and parameters: {'iterations': 147, 'depth': 4, 'learning_rate': 0.00358964153892542, 'l2_leaf_reg': 0.006846231219828737, 'bagging_temperature': 0.15211588131822706, 'random_strength': 7.5419443447312045, 'border_count': 175}.


ROC AUC: 0.7709100028733026
PR AUC: 0.18291348708084565
Best ROC AUC Trial:
[0.9667078513937983, 0.8365462137252907]
Best PR AUC Trial:
[0.9669135604802722, 0.8305425966913023]
