In [None]:
! pip install catboost
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import roc_auc_score


def get_train_test(df_catboost: pd.DataFrame,
                   train_fraction: float = 0.72
                   ) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    """Split a frame into ordered train/test parts and separate the target.

    The split is positional (no shuffling): the first ``train_fraction`` of
    the rows become the training part and the remaining rows the test part.
    The input frame is left untouched.

    Args:
        df_catboost: Frame containing the feature columns and a ``'target'``
            column.
        train_fraction: Share of rows (from the top) assigned to the training
            part. Defaults to 0.72, the ratio this script has always used.

    Returns:
        ``(train_df, test_df, train_labels, test_labels)`` where the two
        frames no longer contain the ``'target'`` column and the two series
        hold its values for the matching rows.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
        KeyError: If ``df_catboost`` has no ``'target'`` column.
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError(
            f"train_fraction must be within [0, 1], got {train_fraction!r}")

    # Compute the boundary once so both slices are guaranteed to agree.
    split_at = int(len(df_catboost) * train_fraction)
    train_part = df_catboost.iloc[:split_at]
    test_part = df_catboost.iloc[split_at:]

    train_labels = train_part['target']
    test_labels = test_part['target']

    # drop() without inplace returns new frames; mutating an iloc slice in
    # place raises SettingWithCopyWarning and depends on view/copy semantics.
    train_df = train_part.drop(columns=['target'])
    test_df = test_part.drop(columns=['target'])
    return train_df, test_df, train_labels, test_labels


# NOTE: this silences every warning for the rest of the run, including pandas
# deprecation warnings about the code below.
warnings.filterwarnings("ignore")

# Training data; the first CSV column is used as the row index.
df_catboost = pd.read_csv('train.csv', index_col=0)

# Drop junk columns (currently disabled).

# columns_to_drop = ['hud_median_family_income']

# df_catboost.drop(columns=columns_to_drop, inplace=True)

# TODO: add income / family median income
# TODO: add income / loan amount

# Columns handed to CatBoost as categorical features.
cat_features = [
        'applicant_ethnicity',
        'applicant_race_1',
        'applicant_sex',
        'co_applicant_ethnicity',
        'co_applicant_race_1',
        'co_applicant_sex'
        ]

# Impute missing numeric values with the column median. The result is assigned
# back to the column: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series and does not update the frame under pandas Copy-on-Write.
df_catboost['income'] = df_catboost['income'].fillna(df_catboost['income'].median())
# df_catboost['income'] = df_catboost['income'].astype(int)

df_catboost['hud_median_family_income'] = df_catboost['hud_median_family_income'].fillna(
    df_catboost['hud_median_family_income'].median())

# Engineered feature: HUD median family income floor-divided by applicant income.
# NOTE(review): `//` discards the fractional part and the operands are in the
# opposite order to the "income / family median" TODO above - confirm intended.
df_catboost['income_ratio_to_average'] = df_catboost['hud_median_family_income'] // df_catboost['income']
df_catboost['income_ratio_to_average'] = df_catboost['income_ratio_to_average'].fillna(
    df_catboost['income_ratio_to_average'].median())
# df_catboost['income_ratio_to_average'] = df_catboost['income_ratio_to_average'].astype(int)

# Categorical columns: fill gaps with the most frequent value, then cast to int.
for feature in cat_features:
    most_frequent = df_catboost[feature].mode().iloc[0]
    df_catboost[feature] = df_catboost[feature].fillna(most_frequent).astype(int)

# Disabled experiment: dropping 'hud_median_family_income'.
# columns_to_drop = ['hud_median_family_income']
#
# # 0.9232499627992874  (score recorded by the author next to this experiment)
#
# df_catboost.drop(columns=columns_to_drop, inplace=True)

train_df, test_df, train_labels, test_labels = get_train_test(df_catboost)

# initialize CatBoostClassifier
# NOTE: model_v1 is constructed here but never fitted in this section.
model_v1 = CatBoostClassifier(random_seed=42, iterations=250, depth=8, learning_rate=0.4, boosting_type='Ordered', bootstrap_type='Bernoulli', eval_metric='AUC')

# 0.9248728480993  (score recorded by the author; the matching run is not stated)


train_pool = Pool(train_df, label=train_labels, cat_features=cat_features)


# Parameters shared by the cross-validation run and the final model.
cv_params = {
    'iterations': 10000,
    'depth': 8,
    'learning_rate': 0.4,
    'boosting_type': 'Ordered',
    'bootstrap_type': 'Bernoulli',
    'loss_function': 'Logloss',
}

# 5-fold cross-validation on the training part only.
cv_results = cv(train_pool, cv_params, fold_count=5, plot=False)

# Row label of the lowest mean test Logloss, i.e. the zero-based iteration index.
best_iteration = cv_results['test-Logloss-mean'].idxmin()
# `iterations` is a tree count, so the zero-based index needs +1; passing the
# index directly would leave out the best tree (and give 0 trees for index 0).
best_tree_count = int(best_iteration) + 1


cv_params.pop('iterations')  # Remove iterations from cv_params
best_model = CatBoostClassifier(iterations=best_tree_count, **cv_params, early_stopping_rounds=250)


# NOTE(review): the eval set used for early stopping is the same hold-out that
# is scored below, so the reported ROC AUC may be optimistic.
best_model.fit(train_df, train_labels, cat_features=cat_features, eval_set=(test_df, test_labels))

# get score
predictions = best_model.predict_proba(test_df)

# Column 1 of predict_proba is used as the positive-class probability.
print(f"ROC_AUC_score = {roc_auc_score(test_labels, predictions[:,1])}\n")

# Test set: apply the same preprocessing steps and write the submission.
df_catboost_test = pd.read_csv('test.csv', index_col=0)

# Assign the filled column back: `df[col].fillna(..., inplace=True)` operates
# on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
# NOTE(review): medians/modes here are computed from the test data itself, not
# taken from the training data - confirm this is intended.
df_catboost_test['income'] = df_catboost_test['income'].fillna(df_catboost_test['income'].median())
# df_catboost['income'] = df_catboost['income'].astype(int)

df_catboost_test['hud_median_family_income'] = df_catboost_test['hud_median_family_income'].fillna(
    df_catboost_test['hud_median_family_income'].median())

# Same engineered feature as for the training data (floor division).
df_catboost_test['income_ratio_to_average'] = df_catboost_test['hud_median_family_income'] // df_catboost_test['income']
df_catboost_test['income_ratio_to_average'] = df_catboost_test['income_ratio_to_average'].fillna(
    df_catboost_test['income_ratio_to_average'].median())
# df_catboost['income_ratio_to_average'] = df_catboost['income_ratio_to_average'].astype(int)

# Categorical columns: fill gaps with the most frequent value, then cast to int.
for feature in cat_features:
    most_frequent = df_catboost_test[feature].mode().iloc[0]
    df_catboost_test[feature] = df_catboost_test[feature].fillna(most_frequent).astype(int)

# Column 1 of predict_proba is stored as the submitted target value.
df_catboost_test['target'] = best_model.predict_proba(df_catboost_test)[:, 1]

# Only the index and the 'target' column are written out.
df_catboost_test.target.to_csv('submission.csv')
