In [3]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
import cufflinks as cf
import plotly.offline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

# --- Plotting / display configuration ---------------------------------------
cf.go_offline()
# NOTE(review): this persists offline=False right after go_offline() was
# called; confirm that this combination is intended.
cf.set_config_file(offline=False, world_readable=True)
pd.set_option('display.max_columns', 38)

# --- Data location -----------------------------------------------------------
# Defaults to the original author's directory; set the RICHTER_DATA_DIR
# environment variable to run the notebook on another machine without editing
# this cell. Kept as a plain string because later cells build paths with `+`.
DATA_DIR = os.environ.get("RICHTER_DATA_DIR", "/home/gangda/workspace/ds-richter/data")

# --- Load raw data -----------------------------------------------------------
# All three frames are indexed by `building_id`.
X = pd.read_csv(DATA_DIR + '/train_values.csv', index_col='building_id')
y = pd.read_csv(DATA_DIR + '/train_labels.csv', index_col='building_id')
X_test = pd.read_csv(DATA_DIR + '/test_values.csv', index_col='building_id')

# Every later cell pairs X and y positionally (train_test_split, .iloc with
# fold indices), so the two files must list the same buildings in the same order.
assert X.index.equals(y.index), "train_values and train_labels are not row-aligned"
# The fitted models are applied to X_test, so it must expose the same features.
assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

# Columns handed to CatBoost as categorical features (`cat_features=`).
# Each name appears exactly once; the original list repeated
# 'legal_ownership_status'.
cat_cols = [
    # geographic identifiers
    'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
    # building description
    'land_surface_condition', 'foundation_type', 'roof_type',
    'ground_floor_type', 'other_floor_type', 'position',
    'plan_configuration', 'legal_ownership_status', 'count_floors_pre_eq',
    # superstructure flags
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
    'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered', 'has_superstructure_other',
    # secondary-use flags
    'has_secondary_use',
    'has_secondary_use_agriculture', 'has_secondary_use_hotel',
    'has_secondary_use_rental', 'has_secondary_use_institution',
    'has_secondary_use_school', 'has_secondary_use_industry',
    'has_secondary_use_health_post', 'has_secondary_use_gov_office',
    'has_secondary_use_use_police', 'has_secondary_use_other',
]

In [5]:
# ---------------------------------------------------------------------------
# CatBoost hyper-parameters.
#
# Only the final, uncommented `best_params` is active. The two commented-out
# dictionaries below are alternative parameter sets left disabled for reference.
# ---------------------------------------------------------------------------

# Disabled alternative A:
# best_params = {
#     'depth': 8,
#     'iterations': 5000,
#     'learning_rate': 0.06,
#     'use_best_model': True,
#     'eval_metric': 'TotalF1',
#     # 'loss_function': 'MultiClass',
#     'random_seed': 42,
#     # 'task_type': 'GPU',
#     # 'devices': [2],
# }

# Disabled alternative B:
# best_params = {
#     "bagging_temperature": 0.3451843796405421,
#     "depth": 10,
#     "l2_leaf_reg": 0.039115477787531897,
#     "learning_rate": 0.049800960339556964,
#     "max_bin": 319,
#     "min_data_in_leaf": 60,
#     "n_estimators": 6250,
#     "od_type": "IncToDec",
#     "od_wait": 36,
#     "random_strength": 4.899993862078418e-07,
#     'eval_metric': 'TotalF1',
#     'random_seed': 42,
#     'task_type': 'GPU',
#     'devices': [2],
# }

# Active parameter set, unpacked into CatBoostClassifier(**best_params).
best_params = dict(
    random_seed=42,
    # The fit calls in this notebook pass an eval_set, which this option relies on.
    use_best_model=True,
    eval_metric="TotalF1",
    bagging_temperature=0.008240891229822092,
    boosting_type="Ordered",
    border_count=11,
    colsample_bylevel=0.30528252170397946,
    depth=9,
    l2_leaf_reg=7,
    learning_rate=0.065,
    min_data_in_leaf=26,
    n_estimators=7537,
    # NOTE(review): the fit calls also pass early_stopping_rounds, which per the
    # CatBoost docs selects the "Iter" detector -- confirm whether this od_type
    # setting has any effect.
    od_type="IncToDec",
    random_strength=0.00043391429996536275,
)

In [3]:
# Single 80/20 hold-out run: a quick sanity check of `best_params` before the
# K-fold run further down.
# The split is seeded with the same seed as the model so that the reported
# score is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=best_params["random_seed"]
)

model = CatBoostClassifier(cat_features=cat_cols, **best_params)
# The validation split doubles as the eval_set used for early stopping and
# (because best_params sets use_best_model=True) for best-iteration selection,
# so the score below is likely somewhat optimistic.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=1000)
preds = model.predict(X_val)

# f1 score is the metric used in the competition.
# NOTE: despite its name, `accuracy` holds the micro-averaged F1 on the
# validation split; the name is kept for backward compatibility.
accuracy = f1_score(y_val, preds, average='micro')
print("Score on validation set")
print("\n ========================================================")
print(accuracy)

0:	learn: 0.6807843	test: 0.6810495	best: 0.6810495 (0)	total: 850ms	remaining: 1h 10m 47s
1:	learn: 0.6843229	test: 0.6882527	best: 0.6882527 (1)	total: 1.38s	remaining: 57m 29s
2:	learn: 0.6848108	test: 0.6886375	best: 0.6886375 (2)	total: 1.89s	remaining: 52m 32s
3:	learn: 0.6874725	test: 0.6919788	best: 0.6919788 (3)	total: 2.33s	remaining: 48m 31s
4:	learn: 0.6861632	test: 0.6901978	best: 0.6919788 (3)	total: 2.77s	remaining: 46m 7s
5:	learn: 0.6872971	test: 0.6906062	best: 0.6919788 (3)	total: 3.17s	remaining: 44m
6:	learn: 0.6882505	test: 0.6923486	best: 0.6923486 (6)	total: 3.6s	remaining: 42m 48s
7:	learn: 0.6887240	test: 0.6930448	best: 0.6930448 (7)	total: 4.04s	remaining: 41m 58s
8:	learn: 0.6897374	test: 0.6937974	best: 0.6937974 (8)	total: 4.47s	remaining: 41m 17s
9:	learn: 0.6900552	test: 0.6944463	best: 0.6944463 (9)	total: 4.9s	remaining: 40m 45s
10:	learn: 0.6899853	test: 0.6939366	best: 0.6944463 (9)	total: 5.31s	remaining: 40m 8s
11:	learn: 0.6900094	test: 0.6939139

KeyboardInterrupt: 

In [15]:
# 5-fold cross-validation: train one CatBoost model per fold and keep all of
# them in `best_models` for out-of-fold and test-set prediction in later cells.
#
# NOTE: the splitter is deliberately left unshuffled. A later cell calls
# kf.split(X) again and relies on getting exactly these folds back.
kf = KFold(n_splits=5)
best_models = []
fold_scores = []  # micro-F1 of each fold, kept so the spread can be inspected
val_acc = 0       # running sum of fold scores (name kept for later cells)
for train_index, val_index in kf.split(X):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    model = CatBoostClassifier(cat_features=cat_cols, **best_params)
    # The validation fold is also the eval_set used for early stopping, so the
    # fold score is measured on data that influenced the stopping point.
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=1000)

    fold_score = f1_score(y_val, model.predict(X_val), average='micro')
    fold_scores.append(fold_score)
    val_acc += fold_score
    best_models.append(model)

print('fold scores: ' + ', '.join(f'{score:.5f}' for score in fold_scores))
print(f'val_acc: {val_acc/len(best_models): .5f}')

0:	learn: 0.6945784	test: 0.7020350	best: 0.7020350 (0)	total: 423ms	remaining: 53m 8s
1:	learn: 0.6969129	test: 0.7021709	best: 0.7021709 (1)	total: 836ms	remaining: 52m 30s
2:	learn: 0.6952067	test: 0.7011230	best: 0.7021709 (1)	total: 1.24s	remaining: 51m 58s
3:	learn: 0.6950920	test: 0.7009380	best: 0.7021709 (1)	total: 1.64s	remaining: 51m 36s
4:	learn: 0.6957600	test: 0.7018098	best: 0.7021709 (1)	total: 1.94s	remaining: 48m 45s
5:	learn: 0.6970448	test: 0.7025341	best: 0.7025341 (5)	total: 2.32s	remaining: 48m 34s
6:	learn: 0.6973159	test: 0.7020059	best: 0.7025341 (5)	total: 2.7s	remaining: 48m 26s
7:	learn: 0.6982010	test: 0.7035435	best: 0.7035435 (7)	total: 3.09s	remaining: 48m 31s
8:	learn: 0.6980443	test: 0.7031734	best: 0.7035435 (7)	total: 3.46s	remaining: 48m 12s
9:	learn: 0.6988886	test: 0.7038109	best: 0.7038109 (9)	total: 3.86s	remaining: 48m 22s
10:	learn: 0.6993613	test: 0.7037857	best: 0.7038109 (9)	total: 4.23s	remaining: 48m 15s
11:	learn: 0.6996575	test: 0.7040

In [16]:
# Mean cross-validated micro-F1: `val_acc` is the sum of the per-fold scores
# and `best_models` holds one model per completed fold.
mean_val_score = val_acc / len(best_models)
print(f'val_acc: {mean_val_score: .5f}')

val_acc:  0.75245


In [17]:
# Out-of-fold class probabilities: every training row is scored by the model
# that did NOT see it during training.
#
# Rows are written back by their positional index rather than concatenated
# fold by fold. The result is therefore aligned with the row order of X
# whatever order the splitter yields the folds in. For an unshuffled KFold
# this is identical to concatenation.
val_probs = None
for fold_model, (_, val_index) in zip(best_models, kf.split(X)):
    fold_probs = fold_model.predict_proba(X.iloc[val_index])
    if val_probs is None:
        # Allocate lazily: the number of classes is only known from the output.
        val_probs = np.full((len(X), fold_probs.shape[1]), np.nan)
    val_probs[val_index] = fold_probs

# Leftover NaNs mean some rows were never scored, e.g. fewer models than folds.
assert val_probs is not None and not np.isnan(val_probs).any(), \
    "out-of-fold predictions do not cover every row of X"
val_probs

array([[1.25208996e-03, 2.38623956e-01, 7.60123954e-01],
       [6.68751247e-03, 7.07615755e-01, 2.85696732e-01],
       [1.37735085e-03, 2.49011686e-01, 7.49610964e-01],
       ...,
       [1.63587937e-04, 1.91700267e-02, 9.80666385e-01],
       [1.68493184e-01, 8.25611434e-01, 5.89538199e-03],
       [2.38215882e-04, 1.43039719e-01, 8.56722065e-01]])

In [18]:
# Ensemble the fold models on the test set: unweighted mean of the class
# probabilities predicted by every model in `best_models`.
per_model_probs = [fold_model.predict_proba(X_test) for fold_model in best_models]

# Start the sum from the first model's output and add the others in order,
# then divide by the number of models.
test_probs = sum(per_model_probs[1:], per_model_probs[0]) / len(best_models)
test_probs

array([[7.87927177e-04, 2.77004964e-01, 7.22207109e-01],
       [2.71815686e-03, 9.22723788e-01, 7.45580550e-02],
       [2.35453596e-02, 7.57099562e-01, 2.19355078e-01],
       ...,
       [3.55178437e-02, 7.71384363e-01, 1.93097793e-01],
       [4.79080223e-03, 8.05362935e-01, 1.89846263e-01],
       [5.30063170e-01, 4.12898243e-01, 5.70385873e-02]])

In [19]:
# Run identifier embedded in the file names of the saved .npy arrays and the
# submission CSV (the "v{}" part); change it to avoid overwriting earlier outputs.
version = 2

In [20]:
# Persist the out-of-fold (train) and fold-averaged (test) probabilities.
# The directory is created first so that a missing folder cannot make the save
# fail after the long training run.
intermediate_dir = DATA_DIR + '/intermediate'
os.makedirs(intermediate_dir, exist_ok=True)

np.save(intermediate_dir + '/catboost_{}fold_v{}_train.npy'.format(kf.n_splits, version), val_probs)
np.save(intermediate_dir + '/catboost_{}fold_v{}_test.npy'.format(kf.n_splits, version), test_probs)

In [21]:
# Turn the averaged probabilities into hard labels.
# NOTE(review): `+ 1` assumes probability column k corresponds to damage grade
# k + 1 -- confirm against the fitted models' class order.
test_preds = np.argmax(test_probs, axis=1) + 1

submission_format = pd.read_csv(DATA_DIR + '/submission_format.csv', index_col='building_id')

# Label the predictions with the ids of the rows they were computed from
# (X_test), then reorder to the order the submission template expects. If the
# two files list buildings in the same order this is a no-op; if not, it stops
# predictions being silently attached to the wrong building.
my_submission = pd.DataFrame(data=test_preds,
                             columns=submission_format.columns,
                             index=X_test.index)
my_submission = my_submission.reindex(submission_format.index)
if my_submission['damage_grade'].isna().any():
    raise ValueError("submission_format contains building_ids that are missing from X_test")
my_submission['damage_grade'] = my_submission['damage_grade'].astype(int)

# Create the output directory if needed so the write cannot fail on a fresh checkout.
submission_dir = DATA_DIR + '/submission'
os.makedirs(submission_dir, exist_ok=True)
my_submission.to_csv(submission_dir + '/catboost_{}fold_v{}.csv'.format(kf.n_splits, version))

---

In [26]:
# Show the full parameter set reported by the first fold's fitted model.
# NOTE(review): the stored output lists 'iterations': 5000 and
# 'boosting_type': 'Plain', whereas the active `best_params` requests
# n_estimators=7537 and boosting_type="Ordered". The models in `best_models`
# were presumably trained with a different parameter set -- re-run from a
# fresh kernel to confirm.
best_models[0].get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'TotalF1',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=2:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 5000,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'od_pval': 0,
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'od_type': 'Iter',
 'rsm': 1,
 'boost_from_average': False,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:Targe