In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
def list_files(root):
    """Return the paths of every file found under ``root``, sorted.

    ``os.walk`` reports directory entries in arbitrary, filesystem-dependent
    order. The order of these lists decides the row order of the datasets
    built from them, so the paths are sorted to keep the seeded splits
    further down reproducible between sessions.

    Args:
        root: Directory to scan recursively.

    Returns:
        A sorted list of file paths; empty if ``root`` does not exist
        or contains no files.
    """
    return sorted(
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(root)
        for filename in filenames
    )


# Training and test files of the competition dataset
files = list_files('/kaggle/input/alfahack/train')
files_test = list_files('/kaggle/input/alfahack/test')

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from catboost import CatBoostClassifier

import json

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

import tqdm.notebook as tqdm
import gc

### Загрузка категориальных признаков. Категориальными мы считаем признаки, значения которых в датасете были целочисленными

In [None]:
# Read the JSON file that lists the categorical features
with open('/kaggle/input/features-with-logit/features.json') as file:
    fe = json.loads(file.read())

# Only the categorical feature names are needed from that file
cat_features = fe['cat_features']

### Скачивание датасета

In [None]:
# Build a dataset from half of the rows of every input file
def create_dataset(files):
    """Load every CSV in ``files`` and keep a stratified half of each.

    For each file the ``id`` and ``smpl`` columns are dropped, the rows are
    split 50/50 with stratification on ``target``, and only the second half
    of the split is kept (the first half is discarded right away).

    The kept halves are collected in lists and concatenated once at the end.
    Concatenating onto a growing DataFrame inside the loop would re-copy all
    rows accumulated so far on every iteration.

    Args:
        files: Iterable of CSV paths; each file must contain the columns
            ``id``, ``smpl`` and ``target``.

    Returns:
        Tuple ``(X, y)``: ``X`` is a DataFrame with the feature columns,
        ``y`` is a one-column DataFrame (``target``). Both use a fresh
        0..n-1 index. Two empty DataFrames are returned when ``files``
        is empty.
    """
    feature_parts = []
    target_parts = []

    for file in files:
        df = pd.read_csv(file).drop(columns=['id', 'smpl'])
        print(file)

        # train_size is 0.5 because, per the original author, that is the largest
        # sample that fits into the RAM of a Kaggle notebook
        split = train_test_split(df.drop(columns=['target']), df['target'],
                                 random_state=42, stratify=df['target'], train_size=0.5)
        print('сплитанулось')

        # Keep only the second ("test") half of the split
        X_half, y_half = split[1], split[3]

        # Release the source frame and the discarded half before loading the next file
        del split, df
        gc.collect()

        feature_parts.append(pd.DataFrame(X_half))
        target_parts.append(pd.DataFrame(y_half))
        del X_half, y_half

    # Nothing was loaded: mirror the empty frames the function started from
    if not feature_parts:
        return pd.DataFrame(), pd.DataFrame()

    # Single concatenation; ignore_index gives the same 0..n-1 index as reset_index(drop=True)
    X = pd.concat(feature_parts, ignore_index=True)
    y = pd.concat(target_parts, ignore_index=True)
    print('concat')
    print()

    # Drop the per-file pieces now that they have been copied into X and y
    del feature_parts, target_parts
    gc.collect()

    return X, y

In [None]:
# Build the dataset from the training files and cast the categorical feature columns to int
# NOTE(review): the cast is presumably needed so that CatBoost accepts these columns as
# categorical features — confirm; it will raise if any of these columns contains NaN.
X, y = create_dataset(files)
X[cat_features] = X[cat_features].astype('int', copy=False)

In [None]:
# Stratified 80/20 train/test split; the full frames are deleted afterwards to free memory
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=42,
)
del X
del y

### CatBoost feature selection

In [None]:
# CatBoostClassifier with mostly default settings, used to pick the 200 best features
params = dict(
    task_type='GPU',
    early_stopping_rounds=150,
    verbose=100,
    eval_metric='AUC',
    random_state=42,
    cat_features=cat_features,
)

clf = CatBoostClassifier(**params)

# All columns are candidates; features are eliminated over 10 steps until 200 remain,
# and a final model is trained on the surviving features.
selection_options = dict(
    eval_set=(X_test, y_test),
    features_for_select=X_train.columns,
    num_features_to_select=200,
    train_final_model=True,
    steps=10,
)
summary = clf.select_features(X_train, y_train, **selection_options)

Learning rate set to 0.037662
Step #1 out of 10


Default metric period is 5 because AUC is/are not implemented for GPU
CatBoost is using more CPU RAM (20.8GiB) than the limit (10.6GiB)
CatBoost is using more CPU RAM (20.8GiB) than the limit (10.6GiB)


0:	test: 0.6977261	best: 0.6977261 (0)	total: 564ms	remaining: 9m 23s
100:	test: 0.8264193	best: 0.8264193 (100)	total: 36.2s	remaining: 5m 21s
200:	test: 0.8357592	best: 0.8357592 (200)	total: 1m 12s	remaining: 4m 48s
300:	test: 0.8400043	best: 0.8400043 (300)	total: 1m 48s	remaining: 4m 12s
400:	test: 0.8425313	best: 0.8425313 (400)	total: 2m 25s	remaining: 3m 36s
500:	test: 0.8440770	best: 0.8440770 (500)	total: 3m	remaining: 2m 59s
600:	test: 0.8451619	best: 0.8451619 (600)	total: 3m 35s	remaining: 2m 23s
700:	test: 0.8460172	best: 0.8460172 (700)	total: 4m 10s	remaining: 1m 46s
800:	test: 0.8467764	best: 0.8467764 (800)	total: 4m 45s	remaining: 1m 10s
900:	test: 0.8473043	best: 0.8473043 (900)	total: 5m 19s	remaining: 35.1s
999:	test: 0.8477837	best: 0.8477837 (999)	total: 5m 53s	remaining: 0us
bestTest = 0.8477836847
bestIteration = 999
Feature #491 eliminated
Feature #20 eliminated
Feature #351 eliminated
Feature #272 eliminated
Feature #430 eliminated
Feature #357 eliminated
Fe

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.79GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.79GiB)


0:	test: 0.7006655	best: 0.7006655 (0)	total: 404ms	remaining: 6m 43s
100:	test: 0.8267429	best: 0.8267429 (100)	total: 36.2s	remaining: 5m 22s
200:	test: 0.8360713	best: 0.8360713 (200)	total: 1m 12s	remaining: 4m 49s
300:	test: 0.8401382	best: 0.8401382 (300)	total: 1m 49s	remaining: 4m 13s
400:	test: 0.8427116	best: 0.8427116 (400)	total: 2m 24s	remaining: 3m 36s
500:	test: 0.8442312	best: 0.8442312 (500)	total: 2m 59s	remaining: 2m 59s
600:	test: 0.8452795	best: 0.8452795 (600)	total: 3m 34s	remaining: 2m 22s
700:	test: 0.8460791	best: 0.8460791 (700)	total: 4m 9s	remaining: 1m 46s
800:	test: 0.8468171	best: 0.8468171 (800)	total: 4m 44s	remaining: 1m 10s
900:	test: 0.8473736	best: 0.8473737 (899)	total: 5m 18s	remaining: 35s
999:	test: 0.8479019	best: 0.8479019 (999)	total: 5m 53s	remaining: 0us
bestTest = 0.8479019403
bestIteration = 999
Feature #63 eliminated
Feature #384 eliminated
Feature #110 eliminated
Feature #452 eliminated
Feature #473 eliminated
Feature #114 eliminated
F

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.7006655	best: 0.7006655 (0)	total: 377ms	remaining: 6m 16s
100:	test: 0.8264144	best: 0.8264144 (100)	total: 33.8s	remaining: 5m 1s
200:	test: 0.8356929	best: 0.8356929 (200)	total: 1m 7s	remaining: 4m 29s
300:	test: 0.8400804	best: 0.8400804 (300)	total: 1m 41s	remaining: 3m 55s
400:	test: 0.8424966	best: 0.8424966 (400)	total: 2m 14s	remaining: 3m 20s
500:	test: 0.8440036	best: 0.8440036 (500)	total: 2m 46s	remaining: 2m 46s
600:	test: 0.8451912	best: 0.8451912 (600)	total: 3m 19s	remaining: 2m 12s
700:	test: 0.8460714	best: 0.8460714 (700)	total: 3m 51s	remaining: 1m 38s
800:	test: 0.8468059	best: 0.8468059 (800)	total: 4m 24s	remaining: 1m 5s
900:	test: 0.8473954	best: 0.8473954 (900)	total: 4m 56s	remaining: 32.6s
999:	test: 0.8479049	best: 0.8479049 (999)	total: 5m 28s	remaining: 0us
bestTest = 0.8479048908
bestIteration = 999
Feature #158 eliminated
Feature #221 eliminated
Feature #413 eliminated
Feature #367 eliminated
Feature #35 eliminated
Feature #66 eliminated
Fe

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6978785	best: 0.6978785 (0)	total: 350ms	remaining: 5m 50s
100:	test: 0.8266344	best: 0.8266344 (100)	total: 33.6s	remaining: 4m 59s
200:	test: 0.8357168	best: 0.8357168 (200)	total: 1m 7s	remaining: 4m 27s
300:	test: 0.8400982	best: 0.8400982 (300)	total: 1m 41s	remaining: 3m 54s
400:	test: 0.8427174	best: 0.8427174 (400)	total: 2m 13s	remaining: 3m 20s
500:	test: 0.8441893	best: 0.8441893 (500)	total: 2m 46s	remaining: 2m 45s
600:	test: 0.8452868	best: 0.8452868 (600)	total: 3m 18s	remaining: 2m 11s
700:	test: 0.8461957	best: 0.8461957 (700)	total: 3m 50s	remaining: 1m 38s
800:	test: 0.8470513	best: 0.8470513 (800)	total: 4m 22s	remaining: 1m 5s
900:	test: 0.8475927	best: 0.8475927 (900)	total: 4m 54s	remaining: 32.4s
999:	test: 0.8481097	best: 0.8481097 (999)	total: 5m 26s	remaining: 0us
bestTest = 0.8481096625
bestIteration = 999
Feature #159 eliminated
Feature #88 eliminated
Feature #186 eliminated
Feature #421 eliminated
Feature #234 eliminated
Feature #435 eliminated


CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6978785	best: 0.6978785 (0)	total: 349ms	remaining: 5m 48s
100:	test: 0.8266318	best: 0.8266318 (100)	total: 33.2s	remaining: 4m 55s
200:	test: 0.8358274	best: 0.8358274 (200)	total: 1m 6s	remaining: 4m 24s
300:	test: 0.8402327	best: 0.8402327 (300)	total: 1m 40s	remaining: 3m 52s
400:	test: 0.8426190	best: 0.8426190 (400)	total: 2m 12s	remaining: 3m 17s
500:	test: 0.8441675	best: 0.8441675 (500)	total: 2m 44s	remaining: 2m 44s
600:	test: 0.8453173	best: 0.8453173 (600)	total: 3m 16s	remaining: 2m 10s
700:	test: 0.8461435	best: 0.8461435 (700)	total: 3m 48s	remaining: 1m 37s
800:	test: 0.8468602	best: 0.8468602 (800)	total: 4m 20s	remaining: 1m 4s
900:	test: 0.8474756	best: 0.8474756 (900)	total: 4m 51s	remaining: 32.1s
999:	test: 0.8479716	best: 0.8479716 (999)	total: 5m 23s	remaining: 0us
bestTest = 0.847971648
bestIteration = 999
Feature #223 eliminated
Feature #313 eliminated
Feature #395 eliminated
Feature #119 eliminated
Feature #251 eliminated
Feature #444 eliminated


CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6978785	best: 0.6978785 (0)	total: 342ms	remaining: 5m 41s
100:	test: 0.8264906	best: 0.8264906 (100)	total: 32.9s	remaining: 4m 53s
200:	test: 0.8357668	best: 0.8357668 (200)	total: 1m 6s	remaining: 4m 22s
300:	test: 0.8401083	best: 0.8401083 (300)	total: 1m 39s	remaining: 3m 51s
400:	test: 0.8426537	best: 0.8426537 (400)	total: 2m 11s	remaining: 3m 17s
500:	test: 0.8442470	best: 0.8442470 (500)	total: 2m 43s	remaining: 2m 43s
600:	test: 0.8454361	best: 0.8454361 (600)	total: 3m 15s	remaining: 2m 10s
700:	test: 0.8462910	best: 0.8462910 (700)	total: 3m 47s	remaining: 1m 36s
800:	test: 0.8469826	best: 0.8469826 (800)	total: 4m 19s	remaining: 1m 4s
900:	test: 0.8476602	best: 0.8476610 (899)	total: 4m 50s	remaining: 31.9s
999:	test: 0.8481620	best: 0.8481620 (999)	total: 5m 21s	remaining: 0us
bestTest = 0.8481619954
bestIteration = 999
Feature #188 eliminated
Feature #89 eliminated
Feature #175 eliminated
Feature #76 eliminated
Feature #394 eliminated
Feature #259 eliminated
F

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6996131	best: 0.6996131 (0)	total: 345ms	remaining: 5m 44s
100:	test: 0.8265028	best: 0.8265028 (100)	total: 32.8s	remaining: 4m 52s
200:	test: 0.8358750	best: 0.8358750 (200)	total: 1m 6s	remaining: 4m 22s
300:	test: 0.8402419	best: 0.8402419 (300)	total: 1m 39s	remaining: 3m 50s
400:	test: 0.8427222	best: 0.8427222 (400)	total: 2m 11s	remaining: 3m 16s
500:	test: 0.8442866	best: 0.8442866 (500)	total: 2m 43s	remaining: 2m 43s
600:	test: 0.8454144	best: 0.8454144 (600)	total: 3m 15s	remaining: 2m 9s
700:	test: 0.8463985	best: 0.8463985 (700)	total: 3m 47s	remaining: 1m 36s
800:	test: 0.8471608	best: 0.8471608 (800)	total: 4m 19s	remaining: 1m 4s
900:	test: 0.8477252	best: 0.8477252 (898)	total: 4m 50s	remaining: 31.9s
999:	test: 0.8482649	best: 0.8482649 (999)	total: 5m 21s	remaining: 0us
bestTest = 0.848264873
bestIteration = 999
Feature #374 eliminated
Feature #296 eliminated
Feature #483 eliminated
Feature #371 eliminated
Feature #249 eliminated
Feature #404 eliminated
F

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6996131	best: 0.6996131 (0)	total: 339ms	remaining: 5m 38s
100:	test: 0.8260026	best: 0.8260026 (100)	total: 32.7s	remaining: 4m 50s
200:	test: 0.8358402	best: 0.8358402 (200)	total: 1m 5s	remaining: 4m 21s
300:	test: 0.8402007	best: 0.8402007 (300)	total: 1m 39s	remaining: 3m 49s
400:	test: 0.8425803	best: 0.8425803 (400)	total: 2m 11s	remaining: 3m 15s
500:	test: 0.8441847	best: 0.8441847 (500)	total: 2m 42s	remaining: 2m 42s
600:	test: 0.8452379	best: 0.8452379 (600)	total: 3m 14s	remaining: 2m 9s
700:	test: 0.8461094	best: 0.8461094 (700)	total: 3m 45s	remaining: 1m 36s
800:	test: 0.8469222	best: 0.8469222 (800)	total: 4m 16s	remaining: 1m 3s
900:	test: 0.8475890	best: 0.8475897 (899)	total: 4m 48s	remaining: 31.7s
999:	test: 0.8480877	best: 0.8480877 (999)	total: 5m 19s	remaining: 0us
bestTest = 0.8480876684
bestIteration = 999
Feature #68 eliminated
Feature #493 eliminated
Feature #227 eliminated
Feature #349 eliminated
Feature #497 eliminated
Feature #281 eliminated
F

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6996131	best: 0.6996131 (0)	total: 318ms	remaining: 5m 17s
100:	test: 0.8262105	best: 0.8262105 (100)	total: 30.7s	remaining: 4m 33s
200:	test: 0.8357805	best: 0.8357805 (200)	total: 1m 1s	remaining: 4m 4s
300:	test: 0.8400215	best: 0.8400215 (300)	total: 1m 32s	remaining: 3m 34s
400:	test: 0.8424720	best: 0.8424720 (400)	total: 2m 2s	remaining: 3m 2s
500:	test: 0.8441525	best: 0.8441525 (500)	total: 2m 32s	remaining: 2m 31s
600:	test: 0.8452308	best: 0.8452308 (600)	total: 3m 1s	remaining: 2m
700:	test: 0.8461706	best: 0.8461706 (700)	total: 3m 30s	remaining: 1m 29s
800:	test: 0.8469747	best: 0.8469747 (800)	total: 4m	remaining: 59.6s
900:	test: 0.8476729	best: 0.8476729 (900)	total: 4m 29s	remaining: 29.6s
999:	test: 0.8481868	best: 0.8481868 (999)	total: 4m 58s	remaining: 0us
bestTest = 0.8481867909
bestIteration = 999
Feature #127 eliminated
Feature #348 eliminated
Feature #230 eliminated
Feature #343 eliminated
Feature #447 eliminated
Feature #28 eliminated
Feature #310

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6996131	best: 0.6996131 (0)	total: 323ms	remaining: 5m 22s
100:	test: 0.8264921	best: 0.8264921 (100)	total: 30.4s	remaining: 4m 31s
200:	test: 0.8361848	best: 0.8361848 (200)	total: 1m 1s	remaining: 4m 4s
300:	test: 0.8403938	best: 0.8403938 (300)	total: 1m 32s	remaining: 3m 34s
400:	test: 0.8428808	best: 0.8428808 (400)	total: 2m 2s	remaining: 3m 3s
500:	test: 0.8445229	best: 0.8445229 (500)	total: 2m 32s	remaining: 2m 31s
600:	test: 0.8455910	best: 0.8455910 (600)	total: 3m 2s	remaining: 2m
700:	test: 0.8464871	best: 0.8464871 (700)	total: 3m 31s	remaining: 1m 30s
800:	test: 0.8473156	best: 0.8473156 (800)	total: 4m	remaining: 59.8s
900:	test: 0.8479240	best: 0.8479251 (899)	total: 4m 29s	remaining: 29.7s
999:	test: 0.8484308	best: 0.8484308 (999)	total: 4m 59s	remaining: 0us
bestTest = 0.8484308124
bestIteration = 999
Feature #416 eliminated
Feature #0 eliminated
Feature #179 eliminated
Feature #458 eliminated
Feature #32 eliminated
Feature #3 eliminated
Feature #120 eli

CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)
CatBoost is using more CPU RAM (22.6GiB) than the limit (8.72GiB)


0:	test: 0.6996131	best: 0.6996131 (0)	total: 276ms	remaining: 4m 35s
100:	test: 0.8261715	best: 0.8261715 (100)	total: 26.1s	remaining: 3m 52s
200:	test: 0.8357878	best: 0.8357878 (200)	total: 52.9s	remaining: 3m 30s
300:	test: 0.8402117	best: 0.8402117 (300)	total: 1m 19s	remaining: 3m 4s
400:	test: 0.8426878	best: 0.8426878 (400)	total: 1m 45s	remaining: 2m 37s
500:	test: 0.8443968	best: 0.8443968 (500)	total: 2m 11s	remaining: 2m 10s
600:	test: 0.8455461	best: 0.8455461 (600)	total: 2m 36s	remaining: 1m 43s
700:	test: 0.8464294	best: 0.8464294 (700)	total: 3m 1s	remaining: 1m 17s
800:	test: 0.8471937	best: 0.8471937 (800)	total: 3m 27s	remaining: 51.5s
900:	test: 0.8478323	best: 0.8478337 (899)	total: 3m 52s	remaining: 25.6s
999:	test: 0.8483779	best: 0.8483779 (999)	total: 4m 18s	remaining: 0us
bestTest = 0.8483778536
bestIteration = 999


In [None]:
# Store the selected features: the categorical subset separately from the full selected list
selected_names = summary['selected_features_names']
selected_categorical = list(set(cat_features) & set(selected_names))
features = {
    'cat_features': selected_categorical,
    'best_features': selected_names,
}
json_object = json.dumps(features, indent=4)
with open("features.json", "w") as outfile:
    outfile.write(json_object)

### Делаем feature importance на всем датасете и на выделенных фичах

In [None]:
# Load the features selected in the previous section
with open('/kaggle/input/highlighted-features/features.json') as file:
    highlighted = json.loads(file.read())

# Categorical subset and the full list of selected features
ct_features, hl_features = highlighted['cat_features'], highlighted['best_features']

In [None]:
# Train a CatBoostClassifier on the rows of all training files
def test(features, cat_features):
    """Train a class-weighted CatBoostClassifier on the given features.

    Reads ``target`` plus ``features`` from every path in the module-level
    ``files`` list, casts the categorical columns to int, makes a stratified
    train/test split (scikit-learn's default proportions) and fits the model
    on the training part with balanced class weights.

    NOTE(review): the held-out part of the split is never used for
    evaluation or early stopping. It is still produced so that the training
    rows stay the same as before, but it is released before fitting.

    Args:
        features: List of feature column names to read and train on.
        cat_features: Names of categorical features; only those also present
            in ``features`` are used.

    Returns:
        The fitted ``CatBoostClassifier``.

    Raises:
        ValueError: If the module-level ``files`` list is empty.
    """
    if not files:
        raise ValueError('no training files found: `files` is empty')

    # Categorical features restricted to the requested feature set
    new_cat_features = list(set(cat_features) & set(features))
    wanted_columns = ['target'] + list(features)

    # Read every file once and concatenate once at the end; concatenating onto
    # a growing frame inside the loop re-copies all previous rows each time.
    feature_parts = []
    target_parts = []
    for path in files:
        chunk = pd.read_csv(path, usecols=wanted_columns)
        target_parts.append(chunk['target'])
        feature_parts.append(chunk.drop(columns=['target']))
        del chunk

    X = pd.concat(feature_parts, ignore_index=True)
    y = pd.concat(target_parts, ignore_index=True)
    # Free the per-file pieces
    del feature_parts, target_parts

    # Cast categorical features to int, split, and release the full frames
    X[new_cat_features] = X[new_cat_features].astype('int', copy=False)
    print('получилость скачать')
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)
    del X, y
    # The held-out part is unused, so do not keep it in memory during training
    del X_test, y_test
    print('получилось сплитануть')

    # Balanced class weights for the binary target
    classes = np.array([0, 1])
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    # cat_features is passed only when there is at least one categorical column
    model_kwargs = dict(task_type='GPU', verbose=100, random_state=42, class_weights=class_weights)
    if new_cat_features:
        model_kwargs['cat_features'] = new_cat_features
    clf = CatBoostClassifier(**model_kwargs)
    clf.fit(X_train, y_train)

    del X_train, y_train

    return clf

In [None]:
# Train the model on the highlighted features
nice_model = test(features=hl_features, cat_features=ct_features)

In [None]:
# Save the trained model to the file 'nice_model' in the current working directory
nice_model.save_model('nice_model')

In [None]:
# Compute feature importance from the previously saved model
nice_nice_model = CatBoostClassifier()
nice_nice_model.load_model('/kaggle/input/nice_nice_model/other/default/1/nice_model')

importance_table = nice_nice_model.get_feature_importance(prettified=True)
ft_imp = importance_table['Feature Id']

# Keep the first 172 entries; per the original author this count was found
# with an ad-hoc "pseudo-ternary" search
nice_nice_features = ft_imp[:172]

In [None]:
# Save the final feature list.
# `nice_nice_features` is a slice of the 'Feature Id' column of a pandas table, not a
# plain list; json cannot serialise that object (TypeError), so convert it to a list first.
nice_nice_features_dct = {'features': list(nice_nice_features)}
nice_nice_features_json_object = json.dumps(nice_nice_features_dct, indent=4)
with open('ultra_mega_last_features.json', 'w') as file:
    file.write(nice_nice_features_json_object)