In [1]:
!pip install --upgrade optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [3]:
import polars as pl
import pandas as pd
import numpy as np

import optuna
from xgboost import XGBClassifier, DMatrix

from sklearn.metrics import roc_auc_score, f1_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# Load the training set from a parquet file into a pandas DataFrame.
# NOTE(review): the absolute path looks like a mounted Google Drive folder (Colab) —
# confirm, and adjust it when running in another environment.
train = pd.read_parquet("/content/drive/MyDrive/Gagarin hack/train.parquet")

In [4]:
# Remove the "smpl" and "id" columns, then separate the label from the predictors.
train = train.drop(columns=["smpl", "id"])

y = train["target"]
X = train.drop(columns="target")

# Every predictor with fewer than 200 distinct values is handled as categorical;
# those columns are cast to strings in a single pass.
cat_features = [name for name, n_distinct in X.nunique().items() if n_distinct < 200]
X = X.astype({name: str for name in cat_features})

# Two stratified splits: 15% of the rows are held out as the test set, then 17.65%
# of the remainder becomes the validation set (0.1765 * 0.85 is roughly 0.15 of all
# rows, i.e. an approximate 70/15/15 train/val/test partition).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, stratify=y_temp, random_state=42
)

In [5]:
# Convert categorical features to the pandas `category` dtype for XGBoost
# (the models below are built with `enable_categorical=True`).
#
# The categorical dtype of each column is learned from the training split and then
# applied unchanged to the validation and test splits, so all three share the same
# category -> integer-code mapping.  Casting each split independently with
# `.astype('category')` can give the same value different codes in different splits
# whenever a split is missing some of the values, and the encoding has to be
# consistent between training and inference.  Values that never occur in X_train
# become NaN in X_val / X_test.
cat_dtypes = {col: X_train[col].astype('category').dtype for col in cat_features}

# `DataFrame.astype` returns new frames, which also avoids assigning columns on the
# objects returned by `train_test_split` and keeps this cell safe to re-run.
X_train = X_train.astype(cat_dtypes)
X_val = X_val.astype(cat_dtypes)
X_test = X_test.astype(cat_dtypes)

# Balanced class weights computed on the training labels, stored in a dict keyed by
# the position of each class in `np.unique(y_train)` (0, 1, ...).
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
class_weight_dict = dict(enumerate(class_weights))


In [6]:
def xgboost_objective(trial):
    """Optuna objective: fit one XGBoost classifier and score it on the validation set.

    The trial chooses whether to re-weight the positive class and samples the
    learning rate, tree depth, L1/L2 regularisation and the row/column subsampling
    ratios.  The model is fitted on the notebook-level ``X_train`` / ``y_train``
    with up to 2000 trees and early stopping (100 rounds) monitored on
    ``X_val`` / ``y_val`` with the AUC metric.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the validation set.
    """
    use_class_weights = trial.suggest_categorical('use_class_weights', [True, False])
    xgboost_params = {
        'objective': 'binary:logistic',
        'n_estimators': 2000,
        'learning_rate': trial.suggest_float('xgboost_learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('xgboost_max_depth', 1, 15),
        'reg_alpha': trial.suggest_float('xgboost_reg_alpha', 1e-2, 1e0, log=True),
        'reg_lambda': trial.suggest_float('xgboost_reg_lambda', 1e-2, 1e0, log=True),
        'subsample': trial.suggest_float('xgboost_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgboost_colsample_bytree', 0.5, 1.0),
    }

    if use_class_weights:
        # Ratio of the balanced weights of class 1 and class 0.
        xgboost_params['scale_pos_weight'] = class_weight_dict[1] / class_weight_dict[0]

    # `use_label_encoder` is intentionally not passed: it is a deprecated argument
    # that XGBoost no longer uses.
    model = XGBClassifier(
        **xgboost_params,
        enable_categorical=True,
        early_stopping_rounds=100,
        eval_metric='auc',
        verbosity=0,
    )
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)

    # Named `val_auc` rather than `auc` so the `auc` function imported from
    # sklearn.metrics is not shadowed inside this function.
    val_proba = model.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, val_proba)

    return val_auc


In [7]:
# Tune the XGBoost hyperparameters.  The TPE sampler is seeded so that rerunning the
# study proposes the same sequence of trials.
study_xgboost = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgboost.optimize(xgboost_objective, n_trials=100)

# Get best parameters: keep only the keys that start with the 'xgboost_' prefix and
# strip exactly that prefix so the names match XGBClassifier's arguments.
param_prefix = 'xgboost_'
best_xgboost_params = {
    name[len(param_prefix):]: value
    for name, value in study_xgboost.best_params.items()
    if name.startswith(param_prefix)
}

# 'use_class_weights' is a switch rather than a model argument; translate it into
# the same scale_pos_weight the objective used.
if study_xgboost.best_params.get('use_class_weights', False):
    best_xgboost_params['scale_pos_weight'] = class_weight_dict[1] / class_weight_dict[0]

[I 2024-11-14 12:00:52,591] A new study created in memory with name: no-name-d0de6adf-85e0-4e3f-b931-4dfa52636d86
[I 2024-11-14 12:03:10,822] Trial 0 finished with value: 0.7414374927272022 and parameters: {'use_class_weights': True, 'xgboost_learning_rate': 0.031129072434433882, 'xgboost_max_depth': 8, 'xgboost_reg_alpha': 0.7555057346995884, 'xgboost_reg_lambda': 0.08547786070162337, 'xgboost_subsample': 0.6493733571302808, 'xgboost_colsample_bytree': 0.8560285963580255}. Best is trial 0 with value: 0.7414374927272022.


In [8]:
# Refit with the best hyperparameters found by the study.  `best_xgboost_params`
# only holds the tuned values, so the settings that were fixed during tuning
# (objective and n_estimators=2000) are repeated here; without them the final model
# would be trained with the library-default number of trees instead of the budget
# the hyperparameters were tuned for.  The deprecated `use_label_encoder` argument
# is not passed.
xgboost_model = XGBClassifier(
    **best_xgboost_params,
    objective='binary:logistic',
    n_estimators=2000,
    enable_categorical=True,
    early_stopping_rounds=100,
    eval_metric='auc',
    verbosity=0,
)
xgboost_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)


In [9]:
# Table pairing each input feature name with its importance in the fitted model.
result = pd.DataFrame(
    dict(
        imp=xgboost_model.feature_importances_,
        names=xgboost_model.feature_names_in_,
    )
)

In [10]:
# Names of the features whose importance in the fitted model is exactly zero.
features_to_drop = result.loc[result['imp'] == 0, 'names'].tolist()

# The exact list is hard to reproduce because no random seed was set for the
# original run, but the result we obtained was as follows:

# features_to_drop = ['feature_3', 'feature_5', 'feature_10', 'feature_14', 'feature_15',
#        'feature_17', 'feature_20', 'feature_22', 'feature_23',
#        'feature_26', 'feature_36', 'feature_43', 'feature_46',
#        'feature_48', 'feature_49', 'feature_54', 'feature_55',
#        'feature_70', 'feature_77', 'feature_89', 'feature_92',
#        'feature_98', 'feature_101', 'feature_115', 'feature_125',
#        'feature_126', 'feature_130', 'feature_132', 'feature_133',
#        'feature_135', 'feature_149', 'feature_151', 'feature_153',
#        'feature_155', 'feature_156', 'feature_157', 'feature_181',
#        'feature_187', 'feature_203', 'feature_206', 'feature_207',
#        'feature_210', 'feature_231', 'feature_240', 'feature_242',
#        'feature_244', 'feature_248', 'feature_252', 'feature_255',
#        'feature_270', 'feature_287', 'feature_292', 'feature_308',
#        'feature_330', 'feature_341', 'feature_348', 'feature_350',
#        'feature_352', 'feature_368', 'feature_372', 'feature_381',
#        'feature_396', 'feature_404', 'feature_405', 'feature_406',
#        'feature_410', 'feature_411', 'feature_415'] + ['feature_4',
#        'feature_12', 'feature_25', 'feature_57',
#        'feature_60', 'feature_66', 'feature_72', 'feature_82',
#        'feature_91', 'feature_102', 'feature_105', 'feature_137',
#        'feature_142', 'feature_165', 'feature_176', 'feature_179',
#        'feature_197', 'feature_198', 'feature_200', 'feature_246',
#        'feature_249', 'feature_254', 'feature_262', 'feature_276',
#        'feature_289', 'feature_299', 'feature_305', 'feature_306',
#        'feature_312', 'feature_322', 'feature_337', 'feature_342',
#        'feature_347', 'feature_367', 'feature_393']