In [None]:
from google.colab import files
uploaded = files.upload()

Saving sample_submission.parquet to sample_submission.parquet


In [None]:
from google.colab import files
uploaded = files.upload()

Saving train.parquet to train.parquet


In [None]:
from google.colab import files
uploaded = files.upload()

Saving test.parquet to test.parquet


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocess_data(train, test):
    from sklearn.preprocessing import LabelEncoder

    y = train['target']
    X_train = train.drop(columns=['target']).copy()
    X_test = test.copy()

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(exclude=[np.number, 'datetime']).columns.tolist()
    date_cols = X_train.select_dtypes(include=['datetime']).columns.tolist()

    for col in numeric_cols:
        median_val = X_train[col].median()
        X_train[col] = X_train[col].fillna(median_val)
        X_test[col] = X_test[col].fillna(median_val)

    for col in categorical_cols:
        mode_val = X_train[col].mode()[0]
        X_train[col] = X_train[col].fillna(mode_val)
        X_test[col] = X_test[col].fillna(mode_val)

    for col in date_cols:
        for df in [X_train, X_test]:
            df[col + '_year'] = df[col].dt.year
            df[col + '_month'] = df[col].dt.month
            df[col + '_day'] = df[col].dt.day
            df[col + '_weekday'] = df[col].dt.weekday
        X_train.drop(columns=[col], inplace=True)
        X_test.drop(columns=[col], inplace=True)

    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    X_train_classical = X_train.copy()
    X_test_classical = X_test.copy()
    for col in categorical_cols:
        le = LabelEncoder()
        X_train_classical[col] = le.fit_transform(X_train_classical[col].astype(str))
        X_test_classical[col] = X_test_classical[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)

    X_test_classical = X_test_classical.reindex(columns=X_train_classical.columns, fill_value=0)

    X_train_dl = pd.get_dummies(X_train, columns=categorical_cols, drop_first=True)
    X_test_dl = pd.get_dummies(X_test, columns=categorical_cols, drop_first=True)
    X_test_dl = X_test_dl.reindex(columns=X_train_dl.columns, fill_value=0)

    return X_train_classical, X_test_classical, X_train_dl, X_test_dl, y

import pandas as pd

train = pd.read_parquet('train.parquet')
sample = pd.read_parquet('sample_submission.parquet')
test = pd.read_parquet('test.parquet')

X_train_classical, X_test_classical, X_train_dl, X_test_dl, y = preprocess_data(train, test)

print("Shapes:")
print("Classical:", X_train_classical.shape, X_test_classical.shape)
print("Deep Learning:", X_train_dl.shape, X_test_dl.shape)
print("Target:", y.shape)

Shapes:
Classical: (1639424, 9) (409856, 9)
Deep Learning: (1639424, 9) (409856, 9)
Target: (1639424,)


In [None]:
X_train_classical, X_test_classical, X_train_dl, X_test_dl, y = preprocess_data(train, test)

print("Classical ML Data Shape:", X_train_classical.shape, X_test_classical.shape)
print("Deep Learning Data Shape:", X_train_dl.shape, X_test_dl.shape)

Classical ML Data Shape: (1639424, 9) (409856, 9)
Deep Learning Data Shape: (1639424, 9) (409856, 9)


In [None]:
y = y.astype(int)
print(y.unique(), y.dtype)

[0 1] int64


In [None]:
from sklearn.model_selection import train_test_split

X_tr_class, X_val_class, y_tr, y_val = train_test_split(
    X_train_classical, y, test_size=0.2, stratify=y, random_state=42
)

X_tr_dl, X_val_dl, _, _ = train_test_split(
    X_train_dl, y, test_size=0.2, stratify=y, random_state=42
)

print("Classical ML shapes:")
print("Train:", X_tr_class.shape, "Validation:", X_val_class.shape)

print("\nDeep Learning shapes:")
print("Train:", X_tr_dl.shape, "Validation:", X_val_dl.shape)

print("\nTarget shapes:")
print("y_train:", y_tr.shape, "y_val:", y_val.shape)

Classical ML shapes:
Train: (1311539, 9) Validation: (327885, 9)

Deep Learning shapes:
Train: (1311539, 9) Validation: (327885, 9)

Target shapes:
y_train: (1311539,) y_val: (327885,)


In [None]:
corrs = pd.DataFrame(X_tr_class, columns=X_train_classical.columns).corrwith(y_tr)
print(corrs.sort_values(ascending=False))

X5              0.097707
X1              0.096114
X4              0.043767
X3              0.018893
Date_month      0.002580
Date_day       -0.002035
Date_weekday   -0.008026
Date_year      -0.033194
X2             -0.162628
dtype: float64


In [None]:
print(y.value_counts(normalize=True))

target
0    0.991437
1    0.008563
Name: proportion, dtype: float64


In [None]:
print(len(X_tr_class), len(X_val_class))
print(set(X_tr_class.index).intersection(set(X_val_class.index)))

1311539 327885
set()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

clf = LogisticRegression(class_weight="balanced", max_iter=1000, random_state=42)
clf.fit(X_tr_class, y_tr)

y_val_probs = clf.predict_proba(X_val_class)[:, 1]
y_val_preds = clf.predict(X_val_class)

print("ROC-AUC:", roc_auc_score(y_val, y_val_probs))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_preds))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds, digits=4))


ROC-AUC: 0.5

Confusion Matrix:
 [[325077      0]
 [  2808      0]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9914    1.0000    0.9957    325077
           1     0.0000    0.0000    0.0000      2808

    accuracy                         0.9914    327885
   macro avg     0.4957    0.5000    0.4978    327885
weighted avg     0.9829    0.9914    0.9872    327885



ABNORMAL: .

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import precision_recall_curve, f1_score

prec, rec, thresh = precision_recall_curve(y_val, y_val_probs)
f1_scores = 2*prec*rec / (prec+rec + 1e-6)  # avoid div by zero

best_idx = f1_scores.argmax()
best_thresh = thresh[best_idx]

print("Best threshold:", best_thresh)
print("Best F1-score:", f1_scores[best_idx])

y_val_preds_best = (y_val_probs >= best_thresh).astype(int)

print("\nConfusion Matrix at best threshold:\n", confusion_matrix(y_val, y_val_preds_best))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds_best, digits=4))

Best threshold: 0.5
Best F1-score: 0.016982501690971482

Confusion Matrix at best threshold:
 [[     0 325077]
 [     0   2808]]

Classification Report:
               precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000    325077
           1     0.0086    1.0000    0.0170      2808

    accuracy                         0.0086    327885
   macro avg     0.0043    0.5000    0.0085    327885
weighted avg     0.0001    0.0086    0.0001    327885



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

rf.fit(X_tr_class, y_tr)

y_val_probs = rf.predict_proba(X_val_class)[:, 1]
y_val_preds = rf.predict(X_val_class)

print("ROC-AUC:", roc_auc_score(y_val, y_val_probs))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_preds))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds, digits=4))

ROC-AUC: 0.9847599272929656

Confusion Matrix:
 [[324927    150]
 [   902   1906]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9972    0.9995    0.9984    325077
           1     0.9270    0.6788    0.7837      2808

    accuracy                         0.9968    327885
   macro avg     0.9621    0.8392    0.8911    327885
weighted avg     0.9966    0.9968    0.9965    327885



In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

scale_pos_weight = (y_tr == 0).sum() / (y_tr == 1).sum()
print("scale_pos_weight:", scale_pos_weight)

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    random_state=42,
    eval_metric="auc"
)

xgb.fit(X_tr_class, y_tr)

y_val_probs = xgb.predict_proba(X_val_class)[:, 1]
y_val_preds = (y_val_probs >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_val, y_val_probs))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_preds))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds, digits=4))


scale_pos_weight: 115.78886910062333
ROC-AUC: 0.992660324299059

Confusion Matrix:
 [[315161   9916]
 [   164   2644]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9995    0.9695    0.9843    325077
           1     0.2105    0.9416    0.3441      2808

    accuracy                         0.9693    327885
   macro avg     0.6050    0.9555    0.6642    327885
weighted avg     0.9927    0.9693    0.9788    327885



In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    roc_auc_score, classification_report,
    confusion_matrix, precision_recall_curve
)
import numpy as np

dt = DecisionTreeClassifier(
    class_weight="balanced",
    max_depth=None,
    random_state=42
)
dt.fit(X_tr_class, y_tr)

y_val_probs = dt.predict_proba(X_val_class)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_probs)
print("ROC-AUC:", roc_auc)

prec, rec, thresh = precision_recall_curve(y_val, y_val_probs)
f1_scores = 2 * prec * rec / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1_scores)
best_thresh = thresh[best_idx] if best_idx < len(thresh) else 1.0
print("Best threshold:", best_thresh)
print("Best F1-score:", f1_scores[best_idx])

y_val_pred = (y_val_probs >= best_thresh).astype(int)
print("\nConfusion Matrix at best threshold:\n", confusion_matrix(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred, digits=4))

from sklearn.metrics import precision_recall_curve, f1_score

prec, rec, thresh = precision_recall_curve(y_val, y_val_probs)
f1_scores = 2 * prec * rec / (prec + rec + 1e-6)

best_idx = f1_scores.argmax()
best_thresh = thresh[best_idx]

print("Best threshold:", best_thresh)
print("Best F1-score:", f1_scores[best_idx])

y_val_preds_best = (y_val_probs >= best_thresh).astype(int)

print("\nConfusion Matrix at best threshold:\n", confusion_matrix(y_val, y_val_preds_best))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds_best, digits=4))

ROC-AUC: 0.8417468533446825
Best threshold: 1.0
Best F1-score: 0.7132271211554845

Confusion Matrix at best threshold:
 [[324412    665]
 [   883   1925]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9973    0.9980    0.9976    325077
           1     0.7432    0.6855    0.7132      2808

    accuracy                         0.9953    327885
   macro avg     0.8703    0.8417    0.8554    327885
weighted avg     0.9951    0.9953    0.9952    327885

Best threshold: 1.0
Best F1-score: 0.7132266219718205

Confusion Matrix at best threshold:
 [[324412    665]
 [   883   1925]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9973    0.9980    0.9976    325077
           1     0.7432    0.6855    0.7132      2808

    accuracy                         0.9953    327885
   macro avg     0.8703    0.8417    0.8554    327885
weighted avg     0.9951    0.9953    0.9952    327885



In [None]:
from sklearn.metrics import precision_recall_curve, f1_score

prec, rec, thresh = precision_recall_curve(y_val, y_val_probs)
f1_scores = 2 * prec * rec / (prec + rec + 1e-6)

best_idx = f1_scores.argmax()
best_thresh = thresh[best_idx]

print("Best threshold:", best_thresh)
print("Best F1-score:", f1_scores[best_idx])

# Apply best threshold
y_val_preds_best = (y_val_probs >= best_thresh).astype(int)

print("\nConfusion Matrix at best threshold:\n", confusion_matrix(y_val, y_val_preds_best))
print("\nClassification Report:\n", classification_report(y_val, y_val_preds_best, digits=4))

Best threshold: 1.0
Best F1-score: 0.7132266219718205

Confusion Matrix at best threshold:
 [[324412    665]
 [   883   1925]]

Classification Report:
               precision    recall  f1-score   support

           0     0.9973    0.9980    0.9976    325077
           1     0.7432    0.6855    0.7132      2808

    accuracy                         0.9953    327885
   macro avg     0.8703    0.8417    0.8554    327885
weighted avg     0.9951    0.9953    0.9952    327885



In [None]:
# --- Logistic Regression ---
lr_probs = clf.predict_proba(X_val_class)[:, 1]
lr_preds = clf.predict(X_val_class)

# --- Random Forest ---
rf_probs = rf.predict_proba(X_val_class)[:, 1]
rf_preds = rf.predict(X_val_class)

# --- XGBoost ---
xgb_probs = xgb.predict_proba(X_val_class)[:, 1]
xgb_preds = (xgb_probs >= 0.5).astype(int)

# --- Decision Tree ---
dt_probs = dt.predict_proba(X_val_class)[:, 1]
dt_preds = (dt_probs >= 0.5).astype(int)

import pandas as pd
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, precision_recall_curve, auc
)

# Dictionary to store results
results = {}

# Helper function to evaluate models
def evaluate_model(name, y_true, y_probs, y_preds):
    prec, rec, _ = precision_recall_curve(y_true, y_probs)
    pr_auc = auc(rec, prec)

    results[name] = {
        "ROC-AUC": roc_auc_score(y_true, y_probs),
        "PR-AUC": pr_auc,
        "Accuracy": accuracy_score(y_true, y_preds),
        "Precision": precision_score(y_true, y_preds, zero_division=0),
        "Recall": recall_score(y_true, y_preds, zero_division=0),
        "F1-Score": f1_score(y_true, y_preds, zero_division=0),
    }

# Evaluate all models
evaluate_model("Logistic Regression", y_val, lr_probs, lr_preds)
evaluate_model("Random Forest", y_val, rf_probs, rf_preds)
evaluate_model("XGBoost", y_val, xgb_probs, xgb_preds)
evaluate_model("Decision Tree", y_val, dt_probs, dt_preds)

# Convert results to a DataFrame
df_results = pd.DataFrame(results).T
print("\nModel Comparison:\n")
print(df_results.round(4))


Model Comparison:

                     ROC-AUC  PR-AUC  Accuracy  Precision  Recall  F1-Score
Logistic Regression   0.5000  0.5043    0.9914     0.0000  0.0000    0.0000
Random Forest         0.9848  0.8747    0.9968     0.9270  0.6788    0.7837
XGBoost               0.9927  0.8091    0.9693     0.2105  0.9416    0.3441
Decision Tree         0.8417  0.7156    0.9953     0.7427  0.6855    0.7130


In [None]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

rf_final = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)
rf_final.fit(X_train_classical, y)

test_preds = rf_final.predict(X_test_classical)

submission = pd.DataFrame({
    "ID": test["ID"],
    "target": test_preds
})

submission.to_csv("submission.csv", index=False)

from google.colab import files
files.download("submission.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score, f1_score, classification_report

train_data = lgb.Dataset(X_tr_class, label=y_tr)
val_data = lgb.Dataset(X_val_class, label=y_val, reference=train_data)

params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": ["binary_error", "auc"],
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42
}

lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)
y_val_pred_proba = lgb_model.predict(X_val_class, num_iteration=lgb_model.best_iteration)
y_val_pred = (y_val_pred_proba >= 0.5).astype(int)

print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation F1 Score:", f1_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred))

[LightGBM] [Info] Number of positive: 11230, number of negative: 1300309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070904 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 752
[LightGBM] [Info] Number of data points in the train set: 1311539, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008562 -> initscore=-4.751768
[LightGBM] [Info] Start training from score -4.751768
Training until validation scores don't improve for 50 rounds
[100]	training's binary_error: 0.00435671	training's auc: 0.991758	valid_1's binary_error: 0.00451073	valid_1's auc: 0.989691
[200]	training's binary_error: 0.00406393	training's auc: 0.992893	valid_1's binary_error: 0.00441008	valid_1's auc: 0.990015
Early stopping, best iteration is:
[157]	training's binary_error: 0.00419583	training's auc: 0.992743	valid_1's binary_error: 0

In [None]:
from sklearn.metrics import precision_recall_curve, f1_score

prec, rec, thresh = precision_recall_curve(y_val, y_val_pred_proba)
f1_scores = 2 * prec * rec / (prec + rec + 1e-6)

best_idx = f1_scores.argmax()
best_thresh = thresh[best_idx]

print("Best threshold:", best_thresh)
print("Best F1-score:", f1_scores[best_idx])

y_val_pred_best = (y_val_pred_proba >= best_thresh).astype(int)
print("\nClassification Report at best threshold:\n",
      classification_report(y_val, y_val_pred_best, digits=4))

Best threshold: 0.3411835764073415
Best F1-score: 0.7137939237565574

Classification Report at best threshold:
               precision    recall  f1-score   support

           0     0.9971    0.9983    0.9977    325077
           1     0.7700    0.6652    0.7138      2808

    accuracy                         0.9954    327885
   macro avg     0.8836    0.8318    0.8557    327885
weighted avg     0.9952    0.9954    0.9953    327885



In [None]:
import pandas as pd
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score,
    recall_score, f1_score, precision_recall_curve, auc, classification_report
)
import lightgbm as lgb

# Dictionary to store results
results = {}

# Helper function to evaluate models
def evaluate_model(name, y_true, y_probs, y_preds):
    prec, rec, _ = precision_recall_curve(y_true, y_probs)
    pr_auc = auc(rec, prec)

    results[name] = {
        "ROC-AUC": roc_auc_score(y_true, y_probs),
        "PR-AUC": pr_auc,
        "Accuracy": accuracy_score(y_true, y_preds),
        "Precision": precision_score(y_true, y_preds, zero_division=0),
        "Recall": recall_score(y_true, y_preds, zero_division=0),
        "F1-Score": f1_score(y_true, y_preds, zero_division=0),
    }

# --- Logistic Regression ---
lr_probs = clf.predict_proba(X_val_class)[:, 1]
lr_preds = clf.predict(X_val_class)
evaluate_model("Logistic Regression", y_val, lr_probs, lr_preds)

# --- Random Forest ---
rf_probs = rf.predict_proba(X_val_class)[:, 1]
rf_preds = rf.predict(X_val_class)
evaluate_model("Random Forest", y_val, rf_probs, rf_preds)

# --- XGBoost ---
xgb_probs = xgb.predict_proba(X_val_class)[:, 1]
xgb_preds = (xgb_probs >= 0.5).astype(int)
evaluate_model("XGBoost", y_val, xgb_probs, xgb_preds)

# --- Decision Tree ---
dt_probs = dt.predict_proba(X_val_class)[:, 1]
dt_preds = (dt_probs >= 0.5).astype(int)
evaluate_model("Decision Tree", y_val, dt_probs, dt_preds)

# --- LightGBM (default 0.5) ---
train_data = lgb.Dataset(X_tr_class, label=y_tr)
val_data = lgb.Dataset(X_val_class, label=y_val, reference=train_data)

params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": ["binary_error", "auc"],
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42
}

lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]
)

lgb_probs = lgb_model.predict(X_val_class, num_iteration=lgb_model.best_iteration)
lgb_preds = (lgb_probs >= 0.5).astype(int)
evaluate_model("LightGBM (0.5)", y_val, lgb_probs, lgb_preds)

# --- LightGBM (best F1 threshold) ---
prec, rec, thresh = precision_recall_curve(y_val, lgb_probs)
f1_scores = 2 * prec * rec / (prec + rec + 1e-6)
best_idx = f1_scores.argmax()
best_thresh = thresh[best_idx]
print("Best LightGBM threshold:", best_thresh)

lgb_preds_best = (lgb_probs >= best_thresh).astype(int)
evaluate_model("LightGBM (Best F1)", y_val, lgb_probs, lgb_preds_best)

# Convert results to DataFrame
df_results = pd.DataFrame(results).T
print("\nModel Comparison:\n")
print(df_results.round(4))


[LightGBM] [Info] Number of positive: 11230, number of negative: 1300309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066109 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 752
[LightGBM] [Info] Number of data points in the train set: 1311539, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.008562 -> initscore=-4.751768
[LightGBM] [Info] Start training from score -4.751768
Training until validation scores don't improve for 50 rounds
[100]	training's binary_error: 0.00435671	training's auc: 0.991758	valid_1's binary_error: 0.00451073	valid_1's auc: 0.989691


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    "n_estimators": [200, 500, 1000],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "class_weight": ["balanced", "balanced_subsample"]
}

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

search = RandomizedSearchCV(
    rf, param_grid, n_iter=30, cv=5,
    scoring="f1", verbose=2, n_jobs=-1, random_state=42
)

search.fit(X_train_classical, y)
best_rf = search.best_estimator_
print("Best params:", search.best_params_)


Fitting 5 folds for each of 30 candidates, totalling 150 fits


