In [None]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import gc
import time
import json
from tqdm import tqdm

gc.enable()

### Считывание отобранных признаков

In [None]:
# Load the feature-selection JSON and take its 'features' list.
with open('/kaggle/input/maindataset/ultra_mega_last_features.json') as f:
    file = json.load(f)
features = file['features']

# Columns that the models below treat as categorical.
cat_cols = [
    'feature_133',
    'feature_143',
    'feature_166',
    'feature_201',
    'feature_209',
    'feature_251',
    'feature_253',
    'feature_31',
    'feature_382',
    'feature_392',
    'feature_423',
    'feature_434',
    'feature_449',
    'feature_490',
    'feature_80',
    'feature_83',
    'feature_92',
]

### Считывание всего датасета с отобранными признаками (для удобства мы заранее создали объединённый датасет)

In [None]:
# Read only the selected feature columns plus the label column from the parquet file.
dataset = pd.read_parquet(
    '/kaggle/input/maindataset/Rawdata.parquet',
    columns=[*features, 'target'],
)

# Split into the feature matrix and the label vector.
X = dataset.drop(columns=['target'])
y = dataset['target']

# Categorical columns are stored as integers at this stage.
X[cat_cols] = X[cat_cols].astype('int')

# The combined frame is no longer needed once X and y exist.
del dataset
gc.collect()

### Считывание тестовых данных

In [None]:
def loading_test(best_features, cat_features, test_dir='/kaggle/input/test', n_parts=10):
    """Read the test CSV shards and return them as one DataFrame.

    Parameters
    ----------
    best_features : list-like of str
        Column names to read from every shard (forwarded to ``usecols``).
    cat_features : list of str
        Columns cast to the pandas ``category`` dtype after concatenation.
    test_dir : str, default '/kaggle/input/test'
        Directory containing ``test_1.csv`` ... ``test_{n_parts}.csv``.
    n_parts : int, default 10
        Number of shards to read, starting from ``test_1.csv``.

    Returns
    -------
    pandas.DataFrame
        All shards stacked in shard order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``n_parts`` is 0 (nothing to concatenate).
    """
    # Read every shard first and concatenate once; concatenating inside the
    # loop re-copies all previously loaded rows on each iteration.
    parts = [
        pd.read_csv(f'{test_dir}/test_{i}.csv', usecols=best_features)
        for i in range(1, n_parts + 1)
    ]
    test_data = pd.concat(parts, ignore_index=True)
    del parts

    test_data[cat_features] = test_data[cat_features].astype('category')
    return test_data

# Load the test shards and put the columns in the same order as the training frame.
test = loading_test(X.columns, cat_cols)[X.columns]

# The loader returns these columns as `category`; store them as integers, as in X.
test[cat_cols] = test[cat_cols].astype('int')
gc.collect()

## Модели

In [None]:
# Collects one averaged test-set probability vector per model family.
predictions = list()

### CatBoost

Эти параметры были подобраны вручную на основе метрики на валидации.

In [None]:
# CatBoost hyperparameters shared by every seed of the ensemble.
# NOTE(review): the visible fit call passes no eval_set, so
# early_stopping_rounds presumably has no effect — confirm.
catboost_params = dict(
    # boosting schedule
    iterations=3000,
    learning_rate=0.035,
    early_stopping_rounds=150,
    # tree shape
    grow_policy='Lossguide',
    depth=7,
    max_leaves=6500,
    min_data_in_leaf=1700,
    # regularisation and randomisation
    l2_leaf_reg=4.5,
    random_strength=20,
    bootstrap_type='Bayesian',
    bagging_temperature=0.5,
    # objective and logging
    loss_function='CrossEntropy',
    eval_metric='AUC',
    verbose=500,
    # hardware
    task_type='GPU',
    gpu_ram_part=0.9,
)

##### Обучение 10 моделей CatBoost с одинаковыми параметрами и разными random_state — для уменьшения дисперсии предсказаний и большей стабильности модели

In [None]:
# Train one CatBoost model per seed on the full training set and keep the
# class-1 probability column of its test predictions.
random_states = [777, 13, 42, 333, 4378, 1, 2389, 98, 76, 121]
predictions_catboost = []
for i in random_states:
    # Build a per-seed copy of the parameters instead of writing the seed into
    # the shared catboost_params dict, so the dict keeps its declared contents
    # and re-running this cell always starts from the same configuration.
    model = CatBoostClassifier(**{**catboost_params, 'random_state': i})
    model.fit(X, y, cat_features=cat_cols)
    predictions_catboost.append(model.predict_proba(test)[:, 1])
    # Drop the fitted model before the next one is built.
    del model
gc.collect()

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 185ms	remaining: 6m 10s
500:	total: 1m 11s	remaining: 3m 34s
1000:	total: 2m 14s	remaining: 2m 14s
1500:	total: 3m 15s	remaining: 1m 5s
1999:	total: 4m 16s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 182ms	remaining: 6m 3s
500:	total: 1m 11s	remaining: 3m 32s
1000:	total: 2m 12s	remaining: 2m 12s
1500:	total: 3m 10s	remaining: 1m 3s
1999:	total: 4m 8s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 186ms	remaining: 6m 11s
500:	total: 1m 11s	remaining: 3m 32s
1000:	total: 2m 12s	remaining: 2m 11s
1500:	total: 3m 13s	remaining: 1m 4s
1999:	total: 4m 13s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 173ms	remaining: 5m 46s
500:	total: 1m 12s	remaining: 3m 37s
1000:	total: 2m 13s	remaining: 2m 13s
1500:	total: 3m 15s	remaining: 1m 4s
1999:	total: 4m 14s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 166ms	remaining: 5m 31s
500:	total: 1m 12s	remaining: 3m 37s
1000:	total: 2m 12s	remaining: 2m 12s
1500:	total: 3m 12s	remaining: 1m 4s
1999:	total: 4m 14s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 186ms	remaining: 6m 11s
500:	total: 1m 12s	remaining: 3m 37s
1000:	total: 2m 15s	remaining: 2m 15s
1500:	total: 3m 18s	remaining: 1m 6s
1999:	total: 4m 21s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 186ms	remaining: 6m 12s
500:	total: 1m 11s	remaining: 3m 33s
1000:	total: 2m 12s	remaining: 2m 12s
1500:	total: 3m 13s	remaining: 1m 4s
1999:	total: 4m 15s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 185ms	remaining: 6m 10s
500:	total: 1m 9s	remaining: 3m 28s
1000:	total: 2m 10s	remaining: 2m 10s
1500:	total: 3m 9s	remaining: 1m 3s
1999:	total: 4m 7s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 180ms	remaining: 6m
500:	total: 1m 9s	remaining: 3m 28s
1000:	total: 2m 10s	remaining: 2m 10s
1500:	total: 3m 9s	remaining: 1m 2s
1999:	total: 4m 7s	remaining: 0us


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 181ms	remaining: 6m 1s
500:	total: 1m 14s	remaining: 3m 42s
1000:	total: 2m 15s	remaining: 2m 15s
1500:	total: 3m 14s	remaining: 1m 4s
1999:	total: 4m 14s	remaining: 0us


##### Усреднение предсказаний 10 моделей

In [None]:
# Average the per-seed CatBoost probabilities element-wise into one vector.
# Each execution of this cell appends another entry to `predictions`.
predictions.append(np.asarray(predictions_catboost).mean(axis=0))

0

#### XGBoost

In [None]:
# Encode the categorical columns as pandas `category` for the XGBoost models
# (their parameters set enable_categorical=True).
# The test columns reuse the exact dtype built from the training columns, so a
# given value maps to the same integer category code in both frames; inferring
# the categories separately on each frame shifts the codes whenever the two
# sets of values differ. Test values that never occur in training become NaN.
X[cat_cols] = X[cat_cols].astype('category')
test[cat_cols] = test[cat_cols].astype({c: X[c].dtype for c in cat_cols})
predictions_xgboost = []
gc.collect()

In [None]:
# Model parameters were chosen by hand: we wanted to average the predictions
# of an "underfit", an "overfit" and a "middle" model.

# Settings identical across all three XGBoost configurations.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
# confirm whether it is still needed.
_xgb_shared = dict(
    learning_rate=0.03,
    use_label_encoder=False,
    enable_categorical=True,
    eval_metric='auc',
    tree_method='hist',
    device='cuda',
    subsample=0.8,
    sampling_method='uniform',
    random_state=42,
)

# train 0.8944 test 0.8629
xgb_params_1 = dict(
    _xgb_shared,
    n_estimators=2000,
    max_depth=8,
    max_leaves=80,
    max_bin=1000,
    reg_lambda=11,
    reg_alpha=30,
    max_delta_step=6,
    min_child_weight=5,
)

# Train ROC AUC: 0.9329 | Test ROC AUC: 0.8652
xgb_params_2 = dict(
    _xgb_shared,
    n_estimators=3000,
    max_depth=9,
    max_leaves=6500,
    max_bin=1000,
    reg_lambda=8,
    reg_alpha=20,
    max_delta_step=6,
    min_child_weight=5,
)

# Train ROC AUC: 0.9166 | Test ROC AUC: 0.8642
xgb_params_3 = dict(
    _xgb_shared,
    n_estimators=2700,
    max_depth=7,
    max_leaves=6500,
    max_bin=500,
    reg_lambda=8,
    reg_alpha=8,
)

In [None]:
# Fit the first XGBoost configuration on the full training set and keep the
# class-1 probability column of its test predictions.
model1 = XGBClassifier(**xgb_params_1).fit(X, y)
predictions_xgboost.append(model1.predict_proba(test)[:, 1])

# Drop the fitted model before the next one is trained.
del model1
gc.collect()

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




13

In [None]:
# Fit the second XGBoost configuration on the full training set and keep the
# class-1 probability column of its test predictions.
model2 = XGBClassifier(**xgb_params_2).fit(X, y, verbose=100)
predictions_xgboost.append(model2.predict_proba(test)[:, 1])

# Drop the fitted model before the next one is trained.
del model2
gc.collect()

13

In [None]:
# Fit the third XGBoost configuration on the full training set and keep the
# class-1 probability column of its test predictions.
model3 = XGBClassifier(**xgb_params_3).fit(X, y, verbose=100)
predictions_xgboost.append(model3.predict_proba(test)[:, 1])

# Drop the fitted model once its predictions are stored.
del model3
gc.collect()

13

##### Усреднение предсказаний 3 моделей

In [None]:
# Average the three XGBoost probability vectors element-wise into one vector.
# Each execution of this cell appends another entry to `predictions`.
predictions.append(np.asarray(predictions_xgboost).mean(axis=0))

### Результат

##### Итоговое предсказание: среднее с равными весами от усреднённого CatBoost (10 моделей) и усреднённого XGBoost (3 модели)

In [None]:
# Equal-weight element-wise average of every vector stored in `predictions`.
final_prediction = np.asarray(predictions).mean(axis=0)

### Итоговый Сабмит

In [None]:
# Build the submission table: 509532 consecutive ids starting at 4490468,
# paired row by row with the blended probabilities.
submission = pd.DataFrame({
    'id': np.arange(4490468, 4490468 + 509532),
    'target': final_prediction,
})
submission.to_csv('submission.csv', index=False)