In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [18]:
import random

train_df = pd.read_csv('train.csv')
cols_wo_nans = []
for col in train_df.columns:
    if (train_df[col].notnull()).sum() == 0:
        cols_wo_nans.append(col)
train_df.drop(cols_wo_nans, axis=1, inplace=True)


In [19]:
train_df.fillna(-99999, inplace=True)

df_len = len(train_df)
random_indices = random.sample(range(df_len), int(df_len / 5))
test_df = train_df.iloc[random_indices]
Y_test = test_df['Target']

In [20]:

X = train_df.drop('Target', axis=1)
Y = train_df['Target']
categorical_features_indices = np.where(X.dtypes != float)[0]

In [21]:

X_train, X_validation, y_train, y_validation = train_test_split(X, Y, train_size=0.8, random_state=42)

X_test = test_df

In [45]:
params = {
    'learning_rate' : 0.07,
    'iterations': 400,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': True,
    'custom_loss' : 'AUC',
}
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

In [46]:
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool, plot=True)


print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Simple model validation accuracy: 0.9251


In [47]:
cv_params = model.get_params()
cv_params.update({
    'loss_function': metrics.Logloss()
})
cv_data = cv(
    Pool(X, Y, cat_features=categorical_features_indices),
    cv_params,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [48]:
cv_data.head()

Unnamed: 0,iterations,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std,test-AUC-mean,test-AUC-std
0,0,0.92656,1.35974e-16,0.926573,2.309401e-05,0.619961,0.000181,0.619891,0.000376,0.585839,0.015548
1,1,0.92656,1.35974e-16,0.92656,1.35974e-16,0.560399,0.000166,0.560336,0.000333,0.585839,0.015548
2,2,0.92656,1.35974e-16,0.92656,1.35974e-16,0.510594,0.000822,0.510525,0.000522,0.590807,0.015633
3,3,0.92656,1.35974e-16,0.92656,1.35974e-16,0.469553,0.000753,0.469492,0.000489,0.592121,0.013559
4,4,0.92656,1.35974e-16,0.92656,1.35974e-16,0.435464,0.000775,0.435433,0.000495,0.592784,0.012556


In [49]:
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Best validation accuracy score: 0.93±0.00 on step 193
Precise validation accuracy score: 0.9273333333333333


In [16]:
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
print(predictions[:10])
print(predictions_probs[:10])

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[[0.86916969 0.13083031]
 [0.94209046 0.05790954]
 [0.95350202 0.04649798]
 [0.97099003 0.02900997]
 [0.95094221 0.04905779]
 [0.95927584 0.04072416]
 [0.88055633 0.11944367]
 [0.97276353 0.02723647]
 [0.95065098 0.04934902]
 [0.90340423 0.09659577]]


In [52]:
df_to_predict = pd.read_csv('data_predict.csv')
df_to_predict.drop(cols_wo_nans, axis=1, inplace=True)
df_to_predict.fillna(-99999, inplace=True)

submisstion = pd.DataFrame()
submisstion['ID'] = df_to_predict['ID']
submisstion['Target'] = model.predict_proba(df_to_predict)[:,1]
submisstion.to_csv('submission.csv', index=False)