In [1]:
# ! pip install catboost

In [2]:
import warnings
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc, roc_curve
import plotly.express as px

In [3]:
def graphic_roc_auc(y_test, y_pred):
    """Draw the ROC curve for a set of scores as a filled plotly area chart.

    Parameters
    ----------
    y_test
        Ground-truth labels, forwarded to ``roc_curve`` as the first argument.
    y_pred
        Predicted scores, forwarded to ``roc_curve`` as the second argument.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # roc_curve also returns the thresholds; they are not needed for the plot.
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, y_pred)
    area_under_curve = auc(false_pos_rate, true_pos_rate)

    axis_titles = {'x': 'False Positive Rate', 'y': 'True Positive Rate'}

    # Filled area under the (FPR, TPR) curve, with the AUC shown in the title.
    fig = px.area(
        x=false_pos_rate,
        y=true_pos_rate,
        title=f'ROC Curve (AUC={area_under_curve:.4f})',
        labels=axis_titles,
        width=700,
        height=500,
    )

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    fig.add_shape(type='line', line={'dash': 'dash'}, x0=0, x1=1, y0=0, y1=1)

    # Tie the y-axis scale to the x-axis (1:1) and constrain the x-axis domain.
    fig.update_yaxes(scaleanchor='x', scaleratio=1)
    fig.update_xaxes(constrain='domain')

    fig.show()

In [4]:
def get_train_test(
    df_catboost: pd.DataFrame,
    train_frac: float = 0.72,
    target_col: str = 'target',
) -> "tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]":
    """Split a frame positionally into train/test features and labels.

    The first ``int(len(df_catboost) * train_frac)`` rows form the training
    part and the remaining rows form the test part. Rows are not shuffled.
    The input frame is left untouched: the target column is removed with a
    non-mutating ``drop`` instead of ``inplace=True`` on an ``iloc`` slice.

    Parameters
    ----------
    df_catboost
        Full dataset; must contain the column named by ``target_col``.
    train_frac
        Fraction of rows (taken from the top) used for training.
        Defaults to 0.72.
    target_col
        Name of the label column. Defaults to ``'target'``.

    Returns
    -------
    tuple
        ``(train_df, test_df, train_labels, test_labels)`` where the two
        frames no longer contain ``target_col``.

    Raises
    ------
    ValueError
        If ``train_frac`` is outside the closed interval [0, 1].
    KeyError
        If ``target_col`` is not a column of ``df_catboost``.
    """
    if not 0.0 <= train_frac <= 1.0:
        raise ValueError(f"train_frac must be within [0, 1], got {train_frac!r}")

    # Compute the boundary once so both slices are guaranteed to agree.
    split_at = int(len(df_catboost) * train_frac)

    train_part = df_catboost.iloc[:split_at]
    test_part = df_catboost.iloc[split_at:]

    train_labels = train_part[target_col]
    test_labels = test_part[target_col]

    # drop() returns new frames, so neither the slices nor the caller's
    # frame are modified.
    train_df = train_part.drop(columns=[target_col])
    test_df = test_part.drop(columns=[target_col])
    return train_df, test_df, train_labels, test_labels

In [5]:
# Silence every Python warning from this point on.
warnings.filterwarnings(action="ignore")

# Training data; the first CSV column is used as the index.
df_catboost = pd.read_csv('train.csv', index_col=0)

# Columns that are passed to CatBoost as categorical features.
cat_features = [
    # applicant attributes
    'applicant_ethnicity',
    'applicant_race_1',
    'applicant_sex',
    # co-applicant attributes
    'co_applicant_ethnicity',
    'co_applicant_race_1',
    'co_applicant_sex',
    # loan / property attributes
    'loan_purpose',
    'loan_type',
    'msamd',
    'preapproval',
    'property_type',
    'purchaser_type',
]

In [6]:
# Impute missing incomes with the column median and store them as integers.
# The filled column is assigned back explicitly: calling
# ``fillna(..., inplace=True)`` on ``df[col]`` is chained assignment, which
# under pandas Copy-on-Write acts on a temporary and leaves the frame
# unchanged (the following ``astype(int)`` would then fail on NaN).
income_median = df_catboost['income'].median()
df_catboost['income'] = df_catboost['income'].fillna(income_median).astype(int)


# Disabled experiment: ratio of the HUD median family income to the income.
# df_catboost['hud_median_family_income'].fillna(df_catboost['hud_median_family_income'].median(), inplace=True)


# df_catboost['income_ratio_to_average'] = df_catboost['hud_median_family_income'] // df_catboost['income']
# df_catboost['income_ratio_to_average'].fillna(df_catboost['income_ratio_to_average'].median(), inplace=True)


# Fill gaps in every categorical column with its most frequent value
# (a precaution), then cast the column to int.
for feature in cat_features:
    most_frequent = df_catboost[feature].mode().iloc[0]
    df_catboost[feature] = df_catboost[feature].fillna(most_frequent).astype(int)

In [7]:
train_df, test_df, train_labels, test_labels = get_train_test(df_catboost)

# Classifier configuration, one hyper-parameter per line for readability.
model_v1 = CatBoostClassifier(
    random_seed=42,
    iterations=250,
    depth=8,
    learning_rate=0.4,
    boosting_type='Ordered',
    bootstrap_type='Bernoulli',
    eval_metric='AUC',
    task_type='GPU',
)

# Fit on the training part; the held-out part is passed as the eval set.
# NOTE(review): the same held-out part is scored below, so the reported AUC
# comes from data the training run already monitored - confirm this is intended.
model_v1.fit(
    train_df,
    train_labels,
    cat_features=cat_features,
    eval_set=(test_df, test_labels),
)

# Predicted class probabilities; column 1 is used as the score.
predictions = model_v1.predict_proba(test_df)
positive_class_scores = predictions[:, 1]

holdout_roc_auc = roc_auc_score(test_labels, positive_class_scores)
print(f"ROC_AUC_score = {holdout_roc_auc}\n")

# Plot the ROC curve for the same scores.
graphic_roc_auc(test_labels, positive_class_scores)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.9022748	best: 0.9022748 (0)	total: 331ms	remaining: 1m 22s
1:	total: 624ms	remaining: 1m 17s
2:	total: 940ms	remaining: 1m 17s
3:	total: 1.2s	remaining: 1m 14s
4:	total: 1.5s	remaining: 1m 13s
5:	test: 0.9174294	best: 0.9174294 (5)	total: 1.78s	remaining: 1m 12s
6:	total: 2.08s	remaining: 1m 12s
7:	total: 2.37s	remaining: 1m 11s
8:	total: 2.65s	remaining: 1m 10s
9:	total: 2.94s	remaining: 1m 10s
10:	test: 0.9199529	best: 0.9199529 (10)	total: 3.23s	remaining: 1m 10s
11:	total: 3.48s	remaining: 1m 9s
12:	total: 3.77s	remaining: 1m 8s
13:	total: 4.07s	remaining: 1m 8s
14:	total: 4.36s	remaining: 1m 8s
15:	test: 0.9210098	best: 0.9210098 (15)	total: 4.64s	remaining: 1m 7s
16:	total: 4.91s	remaining: 1m 7s
17:	total: 5.18s	remaining: 1m 6s
18:	total: 5.44s	remaining: 1m 6s
19:	total: 5.69s	remaining: 1m 5s
20:	test: 0.9216356	best: 0.9216356 (20)	total: 5.98s	remaining: 1m 5s
21:	total: 6.27s	remaining: 1m 4s
22:	total: 6.54s	remaining: 1m 4s
23:	total: 6.85s	remaining: 1m 4s
24

In [8]:
# Score the test set and write the submission file.

df_catboost_test = pd.read_csv('test.csv', index_col=0)

# Same income preprocessing as the training frame: median imputation followed
# by an integer cast (the cast was missing here, so the feature was prepared
# differently at inference time). The result is assigned back explicitly
# because ``fillna(..., inplace=True)`` on ``df[col]`` is chained assignment
# and is not guaranteed to modify the frame (pandas Copy-on-Write).
test_income_median = df_catboost_test['income'].median()
df_catboost_test['income'] = (
    df_catboost_test['income'].fillna(test_income_median).astype(int)
)


# Disabled experiment: ratio of the HUD median family income to the income.
# df_catboost_test['hud_median_family_income'].fillna(df_catboost_test['hud_median_family_income'].median(), inplace=True)


# df_catboost_test['income_ratio_to_average'] = df_catboost_test['hud_median_family_income'] // df_catboost_test['income']
# df_catboost_test['income_ratio_to_average'].fillna(df_catboost_test['income_ratio_to_average'].median(), inplace=True)


# Fill gaps in every categorical column with its most frequent value, then
# cast the column to int.
for feature in cat_features:
    most_frequent = df_catboost_test[feature].mode().iloc[0]
    df_catboost_test[feature] = (
        df_catboost_test[feature].fillna(most_frequent).astype(int)
    )


# Column 1 of predict_proba is stored as the submission score.
df_catboost_test['target'] = model_v1.predict_proba(df_catboost_test)[:, 1]

# Only the index and the 'target' column are written to the submission file.
df_catboost_test['target'].to_csv('submission.csv')

In [11]:
# !zip submission.zip submission.csv