In [70]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from catboost import CatBoostClassifier


**Загрузка данных**

In [92]:
X, y = fetch_openml(
    name="adult",
    version=2,
    as_frame=True,
    return_X_y=True
)

print('target : \n', y[:5])

# target в бинарный вид
y = (y == ">50K").astype(int)

# Соединяем df и target для совместного dropna
X = pd.concat([X, y], axis=1)


# Признаки с пропусками
feat_na = X.isna().any()
name__na = feat_na[feat_na == True].index
print('\n Признаки с NA: \n', name__na, '\n')

# Удаление NA
X = X.dropna()
print('Проверка пропусков : \n', X.isna().any()[X.isna().any() == True], '\n')

# Разделяем df и target
y = X['class']
X = X.drop(['class'], axis=1)



print(X.shape, y.shape)

# y.name

target : 
 0    <=50K
1    <=50K
2     >50K
3     >50K
4    <=50K
Name: class, dtype: category
Categories (2, object): ['<=50K', '>50K']

 Признаки с NA: 
 Index(['workclass', 'occupation', 'native-country'], dtype='object') 

Проверка пропусков : 
 Series([], dtype: bool) 

(45222, 14) (45222,)


**Определяем категориальные признаки**

In [88]:
cat_features = X.select_dtypes(include=["object", 'category']).columns.to_list()
cat_features


['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

**Train / test split**

In [89]:
X_train, X_test, y_train, y_test = train_test_split(
    X[cat_features],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Типы значений в признаках
print(X_train.dtypes)



(36177, 8) (9045, 8) (36177,) (9045,)
workclass         category
education         category
marital-status    category
occupation        category
relationship      category
race              category
sex               category
native-country    category
dtype: object


**Модель CatBoost**

In [90]:
model = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.1,
    loss_function="Logloss",
    eval_metric="Accuracy",

    # ключевое для нашей темы
    cat_features=cat_features,

    # чтобы было видно обучение
    verbose=50,

    # фиксируем random seed (перестановки!)
    random_seed=42
)


**Обучение**

In [91]:
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test)
)


0:	learn: 0.8181165	test: 0.8117192	best: 0.8117192 (0)	total: 132ms	remaining: 26.2s
50:	learn: 0.8314675	test: 0.8233278	best: 0.8234384 (49)	total: 5.29s	remaining: 15.5s
100:	learn: 0.8336512	test: 0.8257601	best: 0.8257601 (100)	total: 10.7s	remaining: 10.5s
150:	learn: 0.8355861	test: 0.8260918	best: 0.8266446 (113)	total: 15.3s	remaining: 4.98s
199:	learn: 0.8378804	test: 0.8257601	best: 0.8266446 (113)	total: 19.5s	remaining: 0us

bestTest = 0.826644555
bestIteration = 113

Shrink model to first 114 iterations.


<catboost.core.CatBoostClassifier at 0x21cb4a31210>